saluki_common/
iter.rs

1use std::marker::PhantomData;
2
3use crate::{collections::PrehashedHashSet, hash::hash_single_fast};
4
5/// A helper for deduplicating items in an iterator while amortizing the storage cost of deduplication.
6///
7/// When deduplicating items in an iterator, a set of which items have been seen must be kept. This involves underlying
8/// storage that grows as more items are seen for the first time. When many iterators are being deduplicated, this
9/// storage cost can add up in terms of the number of underlying allocations, and resizing operations of the set.
10///
11/// `ReusableDeduplicator` allows for reuse of the underlying set between iterations.
12///
13/// # Safety
14///
15/// In order to support reusing the underlying storage between iterations, without having to clone the items, we
16/// directly hash items and track the presence of their _hash_ value. This is a subtle distinction compared to using the
17/// items themselves, as it means that we lose the typical fallback of comparing items for equality if there is a hash
18/// collision detected. Consistent with other code in this repository that deals with things like metric tags, we use a
19/// high-quality hash function and make a calculated bet that collisions are _extremely_ unlikely.
20#[derive(Debug)]
21pub struct ReusableDeduplicator<T> {
22    seen: PrehashedHashSet<u64>,
23    _item: PhantomData<T>,
24}
25
26impl<T: Eq + std::hash::Hash> ReusableDeduplicator<T> {
27    /// Create a new `ReusableDeduplicator`.
28    pub fn new() -> Self {
29        Self {
30            seen: PrehashedHashSet::default(),
31            _item: PhantomData,
32        }
33    }
34
35    /// Creates a wrapper iterator over the given iterator that deduplicates the items.
36    pub fn deduplicated<'item, I>(&mut self, iter: I) -> Deduplicated<'_, 'item, I, T>
37    where
38        I: Iterator<Item = &'item T>,
39    {
40        self.seen.clear();
41
42        Deduplicated {
43            iter,
44            seen: &mut self.seen,
45        }
46    }
47}
48
49/// An iterator that deduplicates items based on their hash values.
50pub struct Deduplicated<'seen, 'item, I, T>
51where
52    I: Iterator<Item = &'item T>,
53    T: Eq + std::hash::Hash + 'item,
54{
55    iter: I,
56    seen: &'seen mut PrehashedHashSet<u64>,
57}
58
59impl<'seen, 'item, I, T> Iterator for Deduplicated<'seen, 'item, I, T>
60where
61    I: Iterator<Item = &'item T>,
62    T: Eq + std::hash::Hash + 'item,
63{
64    type Item = &'item T;
65
66    fn next(&mut self) -> Option<Self::Item> {
67        for item in self.iter.by_ref() {
68            let hash = hash_single_fast(item);
69            if !self.seen.contains(&hash) {
70                self.seen.insert(hash);
71                return Some(item);
72            }
73        }
74        None
75    }
76}
77
78#[cfg(test)]
79mod tests {
80    use std::collections::HashSet;
81
82    use proptest::{prelude::*, proptest};
83
84    use super::*;
85
86    #[test]
87    fn basic_no_duplicates() {
88        let mut deduplicator = ReusableDeduplicator::new();
89
90        let input = vec![1, 2, 3];
91        assert!(!input.is_empty());
92
93        let expected = input.clone();
94        let actual = deduplicator.deduplicated(input.iter()).copied().collect::<Vec<_>>();
95        assert_eq!(actual, expected);
96    }
97
98    #[test]
99    fn basic_duplicates() {
100        let mut deduplicator = ReusableDeduplicator::new();
101
102        let input = vec![1, 3, 2, 1, 3, 2, 3, 1];
103        assert!(!input.is_empty());
104
105        let mut expected = input.clone();
106        expected.sort();
107        expected.dedup();
108
109        // We have to sort the deduplicated results otherwise it might not match the order of `expected`, which
110        // we had to sort to actually deduplicate it.
111        let mut actual = deduplicator.deduplicated(input.iter()).copied().collect::<Vec<_>>();
112        actual.sort();
113
114        assert_eq!(actual, expected);
115    }
116
117    #[test]
118    fn empty() {
119        let mut deduplicator = ReusableDeduplicator::<i32>::new();
120
121        let input = [];
122        assert!(input.is_empty());
123
124        let actual = deduplicator.deduplicated(input.iter()).copied().collect::<Vec<_>>();
125        assert!(actual.is_empty());
126    }
127
128    #[test]
129    fn overlapping_seen_with_reuse() {
130        // We're specifically exercising here that the deduplicator clears its set storage after each iteration
131        // so that subsequent iterations do not incorrectly exclude elements that were seen in a previous iteration.
132        let mut deduplicator = ReusableDeduplicator::new();
133
134        // Create two sets of inputs that have an overlap, and assert that they have an overlap:
135        let input1 = vec![1, 2, 3, 4, 5];
136        assert!(!input1.is_empty());
137
138        let input2 = vec![4, 5, 6, 7, 8];
139        assert!(!input2.is_empty());
140
141        let input1_set = input1.iter().collect::<HashSet<_>>();
142        let input2_set = input2.iter().collect::<HashSet<_>>();
143        assert!(!input1_set.is_disjoint(&input2_set));
144
145        // Run both sets of inputs through the deduplicator as separate usages:
146        let expected1 = input1.clone();
147        let actual1 = deduplicator.deduplicated(input1.iter()).copied().collect::<Vec<_>>();
148        assert_eq!(actual1, expected1);
149
150        let expected2 = input2.clone();
151        let actual2 = deduplicator.deduplicated(input2.iter()).copied().collect::<Vec<_>>();
152        assert_eq!(actual2, expected2);
153    }
154
155    proptest! {
156        #[test]
157        fn property_test_basic(input in any::<Vec<i32>>()) {
158            let mut deduplicator = ReusableDeduplicator::new();
159
160            let mut expected = input.clone();
161            expected.sort();
162            expected.dedup();
163
164            // We have to sort the deduplicated results otherwise it might not match the order of `expected`, which
165            // we had to sort to actually deduplicate it.
166            let mut actual = deduplicator.deduplicated(input.iter()).copied().collect::<Vec<_>>();
167            actual.sort();
168
169            prop_assert_eq!(actual, expected);
170        }
171    }
172}