From 2fab467f117ccf4e4ee43417606870935c6490b4 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 23 Sep 2024 18:04:15 -0700 Subject: [PATCH 1/5] Add get_strict fn to ZeroTrie --- utils/zerotrie/src/options.rs | 27 +++++++++++++++++++++++++++ utils/zerotrie/src/reader.rs | 23 +++++++++++++++++++---- utils/zerotrie/src/zerotrie.rs | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/utils/zerotrie/src/options.rs b/utils/zerotrie/src/options.rs index af9e90fd87a..d0c1bc101d7 100644 --- a/utils/zerotrie/src/options.rs +++ b/utils/zerotrie/src/options.rs @@ -73,6 +73,15 @@ pub(crate) enum CaseSensitivity { IgnoreCase, } +/// How to handle lookup for strings with mixed ASCII case. Only used in ignore-case tries +#[derive(Copy, Clone)] +pub(crate) enum LookupStrictness { + /// Select strings that differ in case so long as their `to_ascii_lowercase` matches + Normal, + /// Select strings only if they match exactly + Strict, +} + impl CaseSensitivity { #[cfg(feature = "serde")] const fn to_u8_flag(self) -> u8 { @@ -89,6 +98,7 @@ pub(crate) struct ZeroTrieBuilderOptions { pub ascii_mode: AsciiMode, pub capacity_mode: CapacityMode, pub case_sensitivity: CaseSensitivity, + pub lookup_strictness: LookupStrictness, } impl ZeroTrieBuilderOptions { @@ -113,6 +123,7 @@ impl ZeroTrieWithOptions for crate::ZeroTrieSimpleAscii { ascii_mode: AsciiMode::AsciiOnly, capacity_mode: CapacityMode::Normal, case_sensitivity: CaseSensitivity::Sensitive, + lookup_strictness: LookupStrictness::Normal, }; } @@ -129,6 +140,7 @@ impl ZeroTrieWithOptions for crate::ZeroAsciiIgnoreCaseTrie { ascii_mode: AsciiMode::AsciiOnly, capacity_mode: CapacityMode::Normal, case_sensitivity: CaseSensitivity::IgnoreCase, + lookup_strictness: LookupStrictness::Normal, }; } @@ -137,6 +149,19 @@ impl crate::ZeroAsciiIgnoreCaseTrie { pub(crate) const FLAGS: u8 = Self::OPTIONS.to_u8_flags(); } +/// Internal struct to power `get_strict` +pub(crate) struct 
ZeroAsciiIgnoreCaseStrictTrie; + +impl ZeroTrieWithOptions for ZeroAsciiIgnoreCaseStrictTrie { + const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::BinaryOnly, + ascii_mode: AsciiMode::AsciiOnly, + capacity_mode: CapacityMode::Normal, + case_sensitivity: CaseSensitivity::IgnoreCase, + lookup_strictness: LookupStrictness::Strict, + }; +} + /// Branch nodes could be either binary search or PHF. impl ZeroTrieWithOptions for crate::ZeroTriePerfectHash { const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { @@ -144,6 +169,7 @@ impl ZeroTrieWithOptions for crate::ZeroTriePerfectHash { ascii_mode: AsciiMode::BinarySpans, capacity_mode: CapacityMode::Normal, case_sensitivity: CaseSensitivity::Sensitive, + lookup_strictness: LookupStrictness::Normal, }; } @@ -159,6 +185,7 @@ impl ZeroTrieWithOptions for crate::ZeroTrieExtendedCapacity { ascii_mode: AsciiMode::BinarySpans, capacity_mode: CapacityMode::Extended, case_sensitivity: CaseSensitivity::Sensitive, + lookup_strictness: LookupStrictness::Normal, }; } diff --git a/utils/zerotrie/src/reader.rs b/utils/zerotrie/src/reader.rs index eed1c80aaad..54805969118 100644 --- a/utils/zerotrie/src/reader.rs +++ b/utils/zerotrie/src/reader.rs @@ -321,7 +321,7 @@ pub(crate) fn get_parameterized( }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, NodeType::Ascii) { - let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) + let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) && matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal) { b.to_ascii_lowercase() == c.to_ascii_lowercase() } else { @@ -369,10 +369,25 @@ pub(crate) fn get_parameterized( (search, trie) = trie.debug_split_at(x); let bsearch_result = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { - search.binary_search_by_key(&c.to_ascii_lowercase(), |x| { - x.to_ascii_lowercase() - }) + if 
matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal) { + // Ordering: (A=a), (B=b), (C=c), ..., (Z=z) + search.binary_search_by_key(&c.to_ascii_lowercase(), |x| { + x.to_ascii_lowercase() + }) + } else { + // Ordering: A, a, B, b, C, c, ..., Z, z + let c_lowercase = c.to_ascii_lowercase(); + search.binary_search_by(move |p| { + let p_lowercase = p.to_ascii_lowercase(); + if c_lowercase == p_lowercase { + p.cmp(c) + } else { + p_lowercase.cmp(&c_lowercase) + } + }) + } } else { + // Ordering: A, B, C, ..., Z, a, b, c, ..., z search.binary_search(c) }; i = bsearch_result.ok()?; diff --git a/utils/zerotrie/src/zerotrie.rs b/utils/zerotrie/src/zerotrie.rs index 21d6b430de2..a76b70b823f 100644 --- a/utils/zerotrie/src/zerotrie.rs +++ b/utils/zerotrie/src/zerotrie.rs @@ -665,6 +665,39 @@ impl_zerotrie_subtype!( Vec::into_boxed_slice ); +impl<Store> ZeroAsciiIgnoreCaseTrie<Store> +where +Store: AsRef<[u8]> + ?Sized, +{ + /// Queries the trie for a string, requiring that it matches case. + /// + /// # Examples + /// + /// ``` + /// use litemap::LiteMap; + /// use zerotrie::ZeroAsciiIgnoreCaseTrie; + /// + /// let mut map = LiteMap::new_vec(); + /// map.insert(&b"foo"[..], 1); + /// map.insert(b"Bar", 2); + /// map.insert(b"Bingo", 3); + /// + /// let trie = ZeroAsciiIgnoreCaseTrie::try_from(&map)?; + /// + /// assert_eq!(trie.get(b"foo"), Some(1)); + /// assert_eq!(trie.get(b"bar"), Some(2)); + /// assert_eq!(trie.get(b"BaR"), Some(2)); + /// assert_eq!(trie.get_strict(b"bar"), None); + /// assert_eq!(trie.get_strict(b"BaR"), None); + /// assert_eq!(trie.get_strict(b"Bar"), Some(2)); + /// + /// # Ok::<_, zerotrie::ZeroTrieBuildError>(()) + /// ``` + pub fn get_strict<K>(&self, key: K) -> Option<usize> where K: AsRef<[u8]> { + reader::get_parameterized::<crate::options::ZeroAsciiIgnoreCaseStrictTrie>(self.store.as_ref(), key.as_ref()) + } +} + macro_rules! impl_dispatch { ($self:ident, $inner_fn:ident()) => { match $self.0 { From 272670bc0decfc996fb48bcfd31e196a2bfc754d Mon Sep 17 00:00:00 2001 From: "Shane F.
Carr" Date: Mon, 23 Sep 2024 18:06:48 -0700 Subject: [PATCH 2/5] Add TODO --- utils/zerotrie/src/reader.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/zerotrie/src/reader.rs b/utils/zerotrie/src/reader.rs index 54805969118..91e28ce1d6d 100644 --- a/utils/zerotrie/src/reader.rs +++ b/utils/zerotrie/src/reader.rs @@ -367,6 +367,7 @@ pub(crate) fn get_parameterized( if matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly) || x < 16 { // binary search (search, trie) = trie.debug_split_at(x); + // TODO(#5584): Consider making all of these have the same order of elements let bsearch_result = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { if matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal) { From f70f0cdfa82d71cb6d365afe37330e8c50c29d00 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 23 Sep 2024 18:19:04 -0700 Subject: [PATCH 3/5] fmt --- utils/zerotrie/src/reader.rs | 3 ++- utils/zerotrie/src/zerotrie.rs | 14 ++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/utils/zerotrie/src/reader.rs b/utils/zerotrie/src/reader.rs index 91e28ce1d6d..64ae6e1651f 100644 --- a/utils/zerotrie/src/reader.rs +++ b/utils/zerotrie/src/reader.rs @@ -321,7 +321,8 @@ pub(crate) fn get_parameterized( }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, NodeType::Ascii) { - let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) && matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal) + let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) + && matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal) { b.to_ascii_lowercase() == c.to_ascii_lowercase() } else { diff --git a/utils/zerotrie/src/zerotrie.rs b/utils/zerotrie/src/zerotrie.rs index a76b70b823f..e82c9262ec5 100644 --- a/utils/zerotrie/src/zerotrie.rs +++ b/utils/zerotrie/src/zerotrie.rs @@ -667,10 +667,10 @@ impl_zerotrie_subtype!( impl<Store>
ZeroAsciiIgnoreCaseTrie<Store> where -Store: AsRef<[u8]> + ?Sized, + Store: AsRef<[u8]> + ?Sized, { /// Queries the trie for a string, requiring that it matches case. - /// + /// /// # Examples /// /// ``` @@ -693,8 +693,14 @@ Store: AsRef<[u8]> + ?Sized, /// /// # Ok::<_, zerotrie::ZeroTrieBuildError>(()) /// ``` - pub fn get_strict<K>(&self, key: K) -> Option<usize> where K: AsRef<[u8]> { - reader::get_parameterized::<crate::options::ZeroAsciiIgnoreCaseStrictTrie>(self.store.as_ref(), key.as_ref()) + pub fn get_strict<K>(&self, key: K) -> Option<usize> + where + K: AsRef<[u8]>, + { + reader::get_parameterized::<crate::options::ZeroAsciiIgnoreCaseStrictTrie>( + self.store.as_ref(), + key.as_ref(), + ) } } From 460fef007bdc4139489492882b743825ef2be84a Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 25 Sep 2024 15:41:32 -0500 Subject: [PATCH 4/5] Update utils/zerotrie/src/options.rs Co-authored-by: Robert Bastian <4706271+robertbastian@users.noreply.github.com> --- utils/zerotrie/src/options.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/utils/zerotrie/src/options.rs b/utils/zerotrie/src/options.rs index d0c1bc101d7..55e3fe80f3f 100644 --- a/utils/zerotrie/src/options.rs +++ b/utils/zerotrie/src/options.rs @@ -154,11 +154,8 @@ pub(crate) struct ZeroAsciiIgnoreCaseStrictTrie; impl ZeroTrieWithOptions for ZeroAsciiIgnoreCaseStrictTrie { const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { - phf_mode: PhfMode::BinaryOnly, - ascii_mode: AsciiMode::AsciiOnly, - capacity_mode: CapacityMode::Normal, - case_sensitivity: CaseSensitivity::IgnoreCase, lookup_strictness: LookupStrictness::Strict, + ..ZeroAsciiIgnoreCaseTrie::OPTIONS }; } From 8336a9d985d96ae99a16f9b398a6f743bf5c8342 Mon Sep 17 00:00:00 2001 From: "Shane F.
Carr" Date: Wed, 25 Sep 2024 16:26:15 -0500 Subject: [PATCH 5/5] Update options.rs --- utils/zerotrie/src/options.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/zerotrie/src/options.rs b/utils/zerotrie/src/options.rs index 55e3fe80f3f..df37a4415d1 100644 --- a/utils/zerotrie/src/options.rs +++ b/utils/zerotrie/src/options.rs @@ -155,7 +155,7 @@ pub(crate) struct ZeroAsciiIgnoreCaseStrictTrie; impl ZeroTrieWithOptions for ZeroAsciiIgnoreCaseStrictTrie { const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { lookup_strictness: LookupStrictness::Strict, - ..ZeroAsciiIgnoreCaseTrie::OPTIONS + ..crate::ZeroAsciiIgnoreCaseTrie::OPTIONS }; }