1
+ #[ cfg( test) ]
2
+ mod tests;
3
+
1
4
use std:: borrow:: Cow ;
2
5
use std:: ops:: { Bound , RangeBounds } ;
3
6
use std:: { fmt, slice} ;
4
7
8
+ use memchr:: memmem;
9
+
5
10
use crate :: chars;
6
11
7
- /// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching.
12
+ /// Check if a given string can be represented internally as the `Ascii` variant in a
13
+ /// [`Utf32String`] or a [`Utf32Str`].
14
+ ///
15
+ /// This returns true if the string is ASCII and does not contain a windows-style newline
16
+ /// `\r\n`. A lone `\r` that is not followed by `\n` does not disqualify the string.
17
+ /// The additional carriage return check is required since even for strings consisting only
18
+ /// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme.
19
+ #[ inline]
20
+ fn has_ascii_graphemes ( string : & str ) -> bool {
21
+ string. is_ascii ( ) && memmem:: find ( string. as_bytes ( ) , b"\r \n " ) . is_none ( )
22
+ }
23
+
24
+ /// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching.
25
+ ///
26
+ /// This is mostly intended as an internal string type, but some methods are exposed for
27
+ /// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string
28
+ /// using one of its `From<T>` constructors for string types `T` or from the
29
+ /// [`Utf32Str::new`] method.
30
+ ///
31
+ /// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string
32
+ /// slice.
33
+ /// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical
34
+ /// to the original string.
35
+ /// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string.
36
+ ///
37
+ /// Since `Utf32Str(ing)` variants may be constructed directly, you **must not** make these
38
+ /// assumptions when handling `Utf32Str(ing)`s of unknown origin.
39
+ ///
40
+ /// ## Caveats
41
+ /// Despite the name, this type is quite far from being a true string type. Here are some
42
+ /// examples demonstrating this.
8
43
///
9
- /// Usually rusts' utf8 encoded strings are great. However during fuzzy matching
10
- /// operates on codepoints (it should operate on graphemes but that's too much
11
- /// hassle to deal with). We want to quickly iterate these codepoints between
12
- /// (up to 5 times) during matching.
44
+ /// ### String conversions are not round-trip
45
+ /// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u +
46
+ /// COMBINING_DIAERESIS`), the trailing codepoints are truncated.
47
+ /// ```
48
+ /// # use nucleo_matcher::Utf32String;
49
+ /// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u");
50
+ /// ```
51
+ ///
52
+ /// ### Indexing is done by grapheme
53
+ /// Indexing into a string is done by grapheme rather than by codepoint.
54
+ /// ```
55
+ /// # use nucleo_matcher::Utf32String;
56
+ /// assert!(Utf32String::from("au\u{0308}").len() == 2);
57
+ /// ```
58
+ ///
59
+ /// ### A `Unicode` variant may be produced by all-ASCII characters.
60
+ /// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme,
61
+ /// strings containing `\r\n` will still result in a `Unicode` variant.
62
+ /// ```
63
+ /// # use nucleo_matcher::Utf32String;
64
+ /// let s = Utf32String::from("\r\n");
65
+ /// assert!(!s.slice(..).is_ascii());
66
+ /// assert!(s.len() == 1);
67
+ /// assert!(s.slice(..).get(0) == '\n');
68
+ /// ```
69
+ ///
70
+ /// ## Design rationale
71
+ /// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching
72
+ /// operates on codepoints (ideally, it should operate on graphemes but that's too
73
+ /// much hassle to deal with), we want to quickly iterate over codepoints (up to 5
74
+ /// times) during matching.
13
75
///
14
76
/// Doing codepoint segmentation on the fly not only blows through the cache
15
- /// (lookuptables and Icache ) but also has nontrivial runtime compared to the
16
- /// matching itself. Furthermore there are a lot of exta optimizations available
17
- /// for ascii only text ( but checking during each match has too much overhead) .
77
+ /// (lookup tables and I-cache ) but also has nontrivial runtime compared to the
78
+ /// matching itself. Furthermore there are many extra optimizations available
79
+ /// for ASCII-only text, but checking for it during each match has too much overhead.
18
80
///
19
- /// Ofcourse this comes at exta memory cost as we usually still need the ut8
20
- /// encoded variant for rendering. In the (dominant) case of ascii -only text
81
+ /// Of course, this comes at extra memory cost as we usually still need the UTF-8
82
+ /// encoded variant for rendering. In the (dominant) case of ASCII-only text
21
83
/// we don't require a copy. Furthermore fuzzy matching usually is applied while
22
84
/// the user is typing on the fly so the same item is potentially matched many
23
- /// times (making the the upfront cost more worth it). That means that its
24
- /// basically always worth it to presegment the string.
85
+ /// times (making the up-front cost more worth it). That means that it's
86
+ /// basically always worth it to pre-segment the string.
25
87
///
26
88
/// For use cases that only match (a lot of) strings once, it's possible to keep a
27
- /// char buffer around that is filled with the presegmented chars
89
+ /// char buffer around that is filled with the presegmented chars.
28
90
///
29
91
/// Another advantage of this approach is that the matcher will naturally
30
- /// produce char indices (instead of utf8 offsets) anyway. With a
92
+ /// produce grapheme indices (instead of utf8 offsets) anyway. With a
31
93
/// codepoint-based representation like this, the indices can be used
32
94
/// directly
33
95
#[ derive( PartialEq , Eq , PartialOrd , Ord , Clone , Copy , Hash ) ]
34
96
pub enum Utf32Str < ' a > {
35
97
/// A string represented as ASCII encoded bytes.
36
- /// Correctness invariant: must only contain valid ASCII (<= 127)
98
+ /// Correctness invariant: must only contain valid ASCII (`<= 127` )
37
99
Ascii ( & ' a [ u8 ] ) ,
38
100
/// A string represented as an array of unicode codepoints (basically UTF-32).
39
101
Unicode ( & ' a [ char ] ) ,
40
102
}
41
103
42
104
impl < ' a > Utf32Str < ' a > {
43
- /// Convenience method to construct a `Utf32Str` from a normal utf8 str
105
+ /// Convenience method to construct a `Utf32Str` from a normal UTF-8 str
44
106
pub fn new ( str : & ' a str , buf : & ' a mut Vec < char > ) -> Self {
45
- if str . is_ascii ( ) {
107
+ if has_ascii_graphemes ( str ) {
46
108
Utf32Str :: Ascii ( str. as_bytes ( ) )
47
109
} else {
48
110
buf. clear ( ) ;
49
111
buf. extend ( crate :: chars:: graphemes ( str) ) ;
50
- if buf. iter ( ) . all ( |c| c. is_ascii ( ) ) {
51
- return Utf32Str :: Ascii ( str. as_bytes ( ) ) ;
52
- }
53
- Utf32Str :: Unicode ( & * buf)
112
+ Utf32Str :: Unicode ( buf)
54
113
}
55
114
}
56
115
@@ -107,7 +166,7 @@ impl<'a> Utf32Str<'a> {
107
166
}
108
167
}
109
168
110
- /// Returns the number of leading whitespaces in this string
169
+ /// Returns the number of trailing whitespaces in this string
111
170
#[ inline]
112
171
pub ( crate ) fn trailing_white_space ( self ) -> usize {
113
172
match self {
@@ -144,25 +203,36 @@ impl<'a> Utf32Str<'a> {
144
203
}
145
204
}
146
205
147
- /// Returns whether this string only contains ascii text.
206
+ /// Returns whether this string only contains graphemes which are single ASCII chars.
207
+ ///
208
+ /// This is almost equivalent to the string being ASCII, except with the additional requirement
209
+ /// that the string cannot contain a windows-style newline `\r\n` which is treated as a single
210
+ /// grapheme.
148
211
pub fn is_ascii ( self ) -> bool {
149
212
matches ! ( self , Utf32Str :: Ascii ( _) )
150
213
}
151
214
152
- /// Returns the `n`th character in this string.
215
+ /// Returns the `n`th character in this string, zero-indexed. Panics if `n` is out of bounds.
153
216
pub fn get ( self , n : u32 ) -> char {
154
217
match self {
155
218
Utf32Str :: Ascii ( bytes) => bytes[ n as usize ] as char ,
156
219
Utf32Str :: Unicode ( codepoints) => codepoints[ n as usize ] ,
157
220
}
158
221
}
222
+
223
+ /// Returns the last character in this string.
224
+ ///
225
+ /// Panics if the string is empty.
159
226
pub ( crate ) fn last ( self ) -> char {
160
227
match self {
161
228
Utf32Str :: Ascii ( bytes) => bytes[ bytes. len ( ) - 1 ] as char ,
162
229
Utf32Str :: Unicode ( codepoints) => codepoints[ codepoints. len ( ) - 1 ] ,
163
230
}
164
231
}
165
232
233
+ /// Returns the first character in this string.
234
+ ///
235
+ /// Panics if the string is empty.
166
236
pub ( crate ) fn first ( self ) -> char {
167
237
match self {
168
238
Utf32Str :: Ascii ( bytes) => bytes[ 0 ] as char ,
@@ -204,6 +274,7 @@ pub enum Chars<'a> {
204
274
Ascii ( slice:: Iter < ' a , u8 > ) ,
205
275
Unicode ( slice:: Iter < ' a , char > ) ,
206
276
}
277
+
207
278
impl Iterator for Chars < ' _ > {
208
279
type Item = char ;
209
280
@@ -226,6 +297,8 @@ impl DoubleEndedIterator for Chars<'_> {
226
297
227
298
#[ derive( PartialEq , Eq , PartialOrd , Ord , Clone , Hash ) ]
228
299
/// An owned version of [`Utf32Str`].
300
+ ///
301
+ /// See the API documentation for [`Utf32Str`] for more detail.
229
302
pub enum Utf32String {
230
303
/// A string represented as ASCII encoded bytes.
231
304
/// Correctness invariant: must only contain valid ASCII (<=127)
@@ -307,7 +380,7 @@ impl Utf32String {
307
380
impl From < & str > for Utf32String {
308
381
#[ inline]
309
382
fn from ( value : & str ) -> Self {
310
- if value . is_ascii ( ) {
383
+ if has_ascii_graphemes ( value ) {
311
384
Self :: Ascii ( value. to_owned ( ) . into_boxed_str ( ) )
312
385
} else {
313
386
Self :: Unicode ( chars:: graphemes ( value) . collect ( ) )
@@ -317,7 +390,7 @@ impl From<&str> for Utf32String {
317
390
318
391
impl From < Box < str > > for Utf32String {
319
392
fn from ( value : Box < str > ) -> Self {
320
- if value . is_ascii ( ) {
393
+ if has_ascii_graphemes ( & value ) {
321
394
Self :: Ascii ( value)
322
395
} else {
323
396
Self :: Unicode ( chars:: graphemes ( & value) . collect ( ) )
0 commit comments