Skip to content

Commit d17e29a

Browse files
authored
Merge pull request #61 from rutar-forks/handle-windows-newline
2 parents 474edc7 + 21fc173 commit d17e29a

File tree

3 files changed

+154
-30
lines changed

3 files changed

+154
-30
lines changed

matcher/src/chars.rs

+11-4
Original file line numberDiff line numberDiff line change
@@ -189,10 +189,17 @@ pub(crate) enum CharClass {
189189
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
190190
#[cfg(feature = "unicode-segmentation")]
191191
let res = text.graphemes(true).map(|grapheme| {
192-
grapheme
193-
.chars()
194-
.next()
195-
.expect("graphemes must be non-empty")
192+
// we need to special-case this check since `\r\n` is a single grapheme and is
193+
// therefore the exception to the rule that normalization of a grapheme should
194+
// map to the first character.
195+
if grapheme == "\r\n" {
196+
'\n'
197+
} else {
198+
grapheme
199+
.chars()
200+
.next()
201+
.expect("graphemes must be non-empty")
202+
}
196203
});
197204
#[cfg(not(feature = "unicode-segmentation"))]
198205
let res = text.chars();

matcher/src/utf32_str.rs

+99-26
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,115 @@
1+
#[cfg(test)]
2+
mod tests;
3+
14
use std::borrow::Cow;
25
use std::ops::{Bound, RangeBounds};
36
use std::{fmt, slice};
47

8+
use memchr::memmem;
9+
510
use crate::chars;
611

7-
/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching.
12+
/// Check if a given string can be represented internally as the `Ascii` variant in a
13+
/// [`Utf32String`] or a [`Utf32Str`].
14+
///
15+
/// This returns true if the string is ASCII and does not contain a windows-style newline
16+
/// `\r\n` (a lone `\r` or `\n` does not disqualify the string).
17+
/// The additional carriage return check is required since even for strings consisting only
18+
/// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme.
19+
#[inline]
20+
fn has_ascii_graphemes(string: &str) -> bool {
21+
string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none()
22+
}
23+
24+
/// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching.
25+
///
26+
/// This is mostly intended as an internal string type, but some methods are exposed for
27+
/// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string
28+
/// using one of its `From<T>` constructors for string types `T` or from the
29+
/// [`Utf32Str::new`] method.
30+
///
31+
/// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string
32+
/// slice.
33+
/// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical
34+
/// to the original string.
35+
/// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string.
36+
///
37+
/// Since `Utf32Str(ing)` variants may be constructed directly, you **must not** make these
38+
/// assumptions when handling `Utf32Str(ing)`s of unknown origin.
39+
///
40+
/// ## Caveats
41+
/// Despite the name, this type is quite far from being a true string type. Here are some
42+
/// examples demonstrating this.
843
///
9-
/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching
10-
/// operates on codepoints (it should operate on graphemes but that's too much
11-
/// hassle to deal with). We want to quickly iterate these codepoints between
12-
/// (up to 5 times) during matching.
44+
/// ### String conversions are not round-trip
45+
/// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u +
46+
/// COMBINING_DIAERESIS`), the trailing codepoints are truncated.
47+
/// ```
48+
/// # use nucleo_matcher::Utf32String;
49+
/// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u");
50+
/// ```
51+
///
52+
/// ### Indexing is done by grapheme
53+
/// Indexing into a string is done by grapheme rather than by codepoint.
54+
/// ```
55+
/// # use nucleo_matcher::Utf32String;
56+
/// assert!(Utf32String::from("au\u{0308}").len() == 2);
57+
/// ```
58+
///
59+
/// ### A `Unicode` variant may be produced by all-ASCII characters.
60+
/// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme,
61+
/// strings containing `\r\n` will still result in a `Unicode` variant.
62+
/// ```
63+
/// # use nucleo_matcher::Utf32String;
64+
/// let s = Utf32String::from("\r\n");
65+
/// assert!(!s.slice(..).is_ascii());
66+
/// assert!(s.len() == 1);
67+
/// assert!(s.slice(..).get(0) == '\n');
68+
/// ```
69+
///
70+
/// ## Design rationale
71+
/// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching
72+
/// operates on codepoints (ideally, it should operate on graphemes but that's too
73+
/// much hassle to deal with), we want to quickly iterate over codepoints (up to 5
74+
/// times) during matching.
1375
///
1476
/// Doing codepoint segmentation on the fly not only blows through the cache
15-
/// (lookuptables and Icache) but also has nontrivial runtime compared to the
16-
/// matching itself. Furthermore there are a lot of exta optimizations available
17-
/// for ascii only text (but checking during each match has too much overhead).
77+
/// (lookup tables and I-cache) but also has nontrivial runtime compared to the
78+
/// matching itself. Furthermore there are many extra optimizations available
79+
/// for ASCII only text, but checking each match has too much overhead.
1880
///
19-
/// Ofcourse this comes at exta memory cost as we usually still need the ut8
20-
/// encoded variant for rendering. In the (dominant) case of ascii-only text
81+
/// Of course, this comes at extra memory cost as we usually still need the UTF-8
82+
/// encoded variant for rendering. In the (dominant) case of ASCII-only text
2183
/// we don't require a copy. Furthermore fuzzy matching usually is applied while
2284
/// the user is typing on the fly so the same item is potentially matched many
23-
/// times (making the the upfront cost more worth it). That means that its
24-
/// basically always worth it to presegment the string.
85+
/// times (making the up-front cost more worth it). That means that it's
86+
/// basically always worth it to pre-segment the string.
2587
///
2688
/// For use cases that only match (a lot of) strings once, it's possible to keep
27-
/// char buffer around that is filled with the presegmented chars
89+
/// a char buffer around that is filled with the pre-segmented chars.
2890
///
2991
/// Another advantage of this approach is that the matcher will naturally
30-
/// produce char indices (instead of utf8 offsets) anyway. With a
92+
/// produce grapheme indices (instead of utf8 offsets) anyway. With a
3193
/// codepoint-based representation like this the indices can be used
3294
/// directly
3395
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
3496
pub enum Utf32Str<'a> {
3597
/// A string represented as ASCII encoded bytes.
36-
/// Correctness invariant: must only contain valid ASCII (<=127)
98+
/// Correctness invariant: must only contain valid ASCII (`<= 127`)
3799
Ascii(&'a [u8]),
38100
/// A string represented as an array of Unicode codepoints (basically UTF-32).
///
/// When built by [`Utf32Str::new`], the elements are the `char`s yielded by
/// `chars::graphemes` for the input string.
39101
Unicode(&'a [char]),
40102
}
41103

42104
impl<'a> Utf32Str<'a> {
43-
/// Convenience method to construct a `Utf32Str` from a normal utf8 str
105+
/// Convenience method to construct a `Utf32Str` from a normal UTF-8 str.
///
/// If `str` is ASCII and contains no `\r\n`, its bytes are borrowed as the `Ascii` variant
/// and `buf` is left untouched. Otherwise `buf` is cleared, refilled from
/// `chars::graphemes(str)`, and borrowed as the `Unicode` variant.
44106
pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
45-
if str.is_ascii() {
107+
if has_ascii_graphemes(str) {
46108
Utf32Str::Ascii(str.as_bytes())
47109
} else {
48110
buf.clear();
49111
buf.extend(crate::chars::graphemes(str));
50-
if buf.iter().all(|c| c.is_ascii()) {
51-
return Utf32Str::Ascii(str.as_bytes());
52-
}
53-
Utf32Str::Unicode(&*buf)
112+
Utf32Str::Unicode(buf)
54113
}
55114
}
56115

@@ -107,7 +166,7 @@ impl<'a> Utf32Str<'a> {
107166
}
108167
}
109168

110-
/// Returns the number of leading whitespaces in this string
169+
/// Returns the number of trailing whitespaces in this string
111170
#[inline]
112171
pub(crate) fn trailing_white_space(self) -> usize {
113172
match self {
@@ -144,25 +203,36 @@ impl<'a> Utf32Str<'a> {
144203
}
145204
}
146205

147-
/// Returns whether this string only contains ascii text.
206+
/// Returns whether this string only contains graphemes which are single ASCII chars.
207+
///
208+
/// This is almost equivalent to the string being ASCII, except with the additional requirement
209+
/// that the string cannot contain a windows-style newline `\r\n` which is treated as a single
210+
/// grapheme.
///
/// Concretely, this only reports whether `self` is the `Ascii` variant: a directly
/// constructed `Unicode` value returns `false` even if all of its chars are ASCII.
148211
pub fn is_ascii(self) -> bool {
149212
matches!(self, Utf32Str::Ascii(_))
150213
}
151214

152-
/// Returns the `n`th character in this string.
215+
/// Returns the `n`th character in this string, zero-indexed.
///
/// Panics if `n` is out of bounds.
153216
pub fn get(self, n: u32) -> char {
154217
match self {
155218
Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
156219
Utf32Str::Unicode(codepoints) => codepoints[n as usize],
157220
}
158221
}
222+
223+
/// Returns the last character in this string.
224+
///
225+
/// Panics if the string is empty.
159226
pub(crate) fn last(self) -> char {
// On an empty slice `len() - 1` underflows: an overflow panic when overflow checks are
// enabled, otherwise the wrapped index fails the bounds check. Either way this panics.
160227
match self {
161228
Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
162229
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
163230
}
164231
}
165232

233+
/// Returns the first character in this string.
234+
///
235+
/// Panics if the string is empty.
166236
pub(crate) fn first(self) -> char {
167237
match self {
168238
Utf32Str::Ascii(bytes) => bytes[0] as char,
@@ -204,6 +274,7 @@ pub enum Chars<'a> {
204274
Ascii(slice::Iter<'a, u8>),
205275
Unicode(slice::Iter<'a, char>),
206276
}
277+
207278
impl Iterator for Chars<'_> {
208279
type Item = char;
209280

@@ -226,6 +297,8 @@ impl DoubleEndedIterator for Chars<'_> {
226297

227298
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
228299
/// An owned version of [`Utf32Str`].
300+
///
301+
/// See the API documentation for [`Utf32Str`] for more detail.
229302
pub enum Utf32String {
230303
/// A string represented as ASCII encoded bytes.
231304
/// Correctness invariant: must only contain valid ASCII (<=127)
@@ -307,7 +380,7 @@ impl Utf32String {
307380
impl From<&str> for Utf32String {
308381
#[inline]
309382
fn from(value: &str) -> Self {
310-
if value.is_ascii() {
383+
if has_ascii_graphemes(value) {
311384
Self::Ascii(value.to_owned().into_boxed_str())
312385
} else {
313386
Self::Unicode(chars::graphemes(value).collect())
@@ -317,7 +390,7 @@ impl From<&str> for Utf32String {
317390

318391
impl From<Box<str>> for Utf32String {
319392
fn from(value: Box<str>) -> Self {
320-
if value.is_ascii() {
393+
if has_ascii_graphemes(&value) {
321394
Self::Ascii(value)
322395
} else {
323396
Self::Unicode(chars::graphemes(&value).collect())

matcher/src/utf32_str/tests.rs

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
use crate::{Utf32Str, Utf32String};
2+
3+
#[test]
4+
fn test_utf32str_ascii() {
5+
/// Helper function for testing
6+
fn expect_ascii(src: &str, is_ascii: bool) {
7+
let mut buffer = Vec::new();
8+
assert!(Utf32Str::new(src, &mut buffer).is_ascii() == is_ascii);
9+
assert!(Utf32String::from(src).slice(..).is_ascii() == is_ascii);
10+
assert!(Utf32String::from(src.to_owned()).slice(..).is_ascii() == is_ascii);
11+
}
12+
13+
// ascii
14+
expect_ascii("", true);
15+
expect_ascii("a", true);
16+
expect_ascii("a\nb", true);
17+
expect_ascii("\n\r", true);
18+
19+
// not ascii
20+
expect_ascii("aü", false);
21+
expect_ascii("au\u{0308}", false);
22+
23+
// windows-style newline
24+
expect_ascii("a\r\nb", false);
25+
expect_ascii(\r\n", false);
26+
expect_ascii("\r\n", false);
27+
}
28+
29+
// Checks which `char` each grapheme is reduced to: ASCII chars are kept as-is, `\r\n`
// becomes `\n`, and any other multi-codepoint grapheme keeps only its first codepoint.
#[test]
30+
fn test_grapheme_truncation() {
31+
// ascii is preserved
32+
let s = Utf32String::from("ab");
33+
assert_eq!(s.slice(..).get(0), 'a');
34+
assert_eq!(s.slice(..).get(1), 'b');
35+
36+
// windows-style newline is truncated to '\n'
37+
let s = Utf32String::from("\r\n");
38+
assert_eq!(s.slice(..).get(0), '\n');
39+
40+
// normal graphemes are truncated to the first character
41+
let s = Utf32String::from("u\u{0308}\r\n");
42+
assert_eq!(s.slice(..).get(0), 'u');
43+
assert_eq!(s.slice(..).get(1), '\n');
44+
}

0 commit comments

Comments
 (0)