dd_sds/scanner/regex_rule/
regex_store.rs

1use crate::stats::GLOBAL_STATS;
2use ahash::AHashMap;
3use lazy_static::lazy_static;
4use regex_automata::meta::{Cache, Regex as MetaRegex};
5use slotmap::{SlotMap, new_key_type};
6use std::ops::Deref;
7use std::sync::Weak;
8use std::sync::{Arc, Mutex};
9
10struct WeakSharedRegex {
11    regex: Weak<MetaRegex>,
12    // number of bytes used for the cache. Just used for metrics.
13    cache_size: usize,
14}
15
16#[derive(Debug, Clone)]
17pub struct SharedRegex {
18    pub regex: Arc<MetaRegex>,
19    pub cache_key: RegexCacheKey,
20}
21
22impl Deref for SharedRegex {
23    type Target = MetaRegex;
24
25    fn deref(&self) -> &Self::Target {
26        self.regex.deref()
27    }
28}
29
30pub fn get_memoized_regex<T>(
31    pattern: &str,
32    regex_factory: impl FnOnce(&str) -> Result<regex_automata::meta::Regex, T>,
33) -> Result<SharedRegex, T> {
34    get_memoized_regex_with_custom_store(pattern, regex_factory, &REGEX_STORE)
35}
36
37fn get_memoized_regex_with_custom_store<T>(
38    pattern: &str,
39    regex_factory: impl FnOnce(&str) -> Result<regex_automata::meta::Regex, T>,
40    store: &Mutex<RegexStore>,
41) -> Result<SharedRegex, T> {
42    {
43        let regex_store = store.lock().unwrap();
44        if let Some(exiting_regex) = regex_store.get(pattern) {
45            return Ok(exiting_regex);
46        }
47    }
48
49    // Create the new regex after the RegexStore lock is released, since this can be slow
50    let regex = regex_factory(pattern)?;
51
52    let mut regex_store = store.lock().unwrap();
53    Ok(regex_store.insert(pattern, regex))
54}
55
/// Number of insertions between two garbage collections of the regex store.
///
/// The periodic GC is what clears out entries whose `Weak` references have been dropped.
const GC_FREQUENCY: u64 = 1000;
59
60lazy_static! {
61    static ref REGEX_STORE: Arc<Mutex<RegexStore>> = Arc::new(Mutex::new(RegexStore::new()));
62}
63new_key_type! { pub struct RegexCacheKey; }
64
65struct RegexStore {
66    pattern_index: AHashMap<String, RegexCacheKey>,
67    key_map: SlotMap<RegexCacheKey, WeakSharedRegex>,
68    // used to decide when to GC. Counts up to `GC_FREQUENCY` and is reset to 0 when a GC happens
69    gc_counter: u64,
70}
71
72impl RegexStore {
73    pub fn new() -> Self {
74        Self {
75            pattern_index: AHashMap::new(),
76            key_map: SlotMap::with_key(),
77            gc_counter: 0,
78        }
79    }
80
81    /// Cleans up any configuration no longer used in Scanners. Should be called periodically.
82    fn gc(&mut self) {
83        self.gc_counter = 0;
84        self.pattern_index.retain(|_, cache_key| {
85            if self.key_map.get(*cache_key).unwrap().regex.strong_count() == 0 {
86                if let Some(old_regex) = self.key_map.remove(*cache_key) {
87                    GLOBAL_STATS.add_total_regex_cache(-(old_regex.cache_size as i64));
88                }
89                false
90            } else {
91                true
92            }
93        });
94        GLOBAL_STATS.set_total_regexes(self.key_map.len());
95    }
96
97    /// Check if a regex for this pattern already exists, and returns a copy if it does
98    pub fn get(&self, pattern: &str) -> Option<SharedRegex> {
99        self.pattern_index.get(pattern).and_then(|cache_key| {
100            self.key_map
101                .get(*cache_key)
102                .and_then(|x| x.regex.upgrade())
103                .map(|regex| SharedRegex {
104                    regex,
105                    cache_key: *cache_key,
106                })
107        })
108    }
109
110    #[cfg(test)]
111    fn len(&self) -> usize {
112        debug_assert_eq!(self.pattern_index.len(), self.key_map.len());
113        self.key_map.len()
114    }
115
116    /// Inserts a new rule into the cache. The "memoized" rule is returned and should be
117    /// used instead of the one passed in. This ensures that if there were duplicates of
118    /// a rule being created at the same time, only one is kept.
119    pub fn insert(&mut self, pattern: &str, regex: MetaRegex) -> SharedRegex {
120        self.gc_counter += 1;
121        if self.gc_counter >= GC_FREQUENCY {
122            self.gc();
123        }
124        match self.get(pattern) {
125            Some(existing_regex) => existing_regex,
126            _ => {
127                let shared_regex = Arc::new(regex);
128
129                let regex_cache = shared_regex.create_cache();
130                let cache_key = self.key_map.insert(WeakSharedRegex {
131                    regex: Arc::downgrade(&shared_regex),
132                    cache_size: regex_cache.memory_usage() + std::mem::size_of::<Cache>(),
133                });
134                if let Some(old_cache_key) =
135                    self.pattern_index.insert(pattern.to_owned(), cache_key)
136                {
137                    // cleanup old value (which must be a "dead" reference since `get` returned None)
138                    if let Some(weak_ref) = self.key_map.remove(old_cache_key) {
139                        GLOBAL_STATS.add_total_regex_cache(-(weak_ref.cache_size as i64));
140                        debug_assert!(weak_ref.regex.strong_count() == 0)
141                    }
142                }
143
144                GLOBAL_STATS.set_total_regexes(self.key_map.len());
145
146                SharedRegex {
147                    regex: shared_regex,
148                    cache_key,
149                }
150            }
151        }
152    }
153}
154
#[cfg(test)]
mod test {
    use super::{GC_FREQUENCY, RegexStore, get_memoized_regex_with_custom_store};
    use regex_automata::meta::Regex;
    use std::sync::Mutex;

    /// Number of entries currently held by `store`.
    fn store_len(store: &Mutex<RegexStore>) -> usize {
        store.lock().unwrap().len()
    }

    #[test]
    fn dropped_regexes_should_be_removed_from_global_store() {
        let store = Mutex::new(RegexStore::new());

        let shared = get_memoized_regex_with_custom_store("test", Regex::new, &store).unwrap();
        assert_eq!(store_len(&store), 1);

        drop(shared);

        // Run the GC by hand instead of waiting for the insertion counter.
        store.lock().unwrap().gc();

        assert_eq!(store_len(&store), 0);
    }

    #[test]
    fn test_automatic_gc() {
        let store = Mutex::new(RegexStore::new());

        drop(get_memoized_regex_with_custom_store("test", Regex::new, &store).unwrap());

        // Together with the insertion above, this reaches `GC_FREQUENCY` and triggers a GC.
        for i in 0..(GC_FREQUENCY - 1) {
            let pattern = format!("test-{i}");
            drop(get_memoized_regex_with_custom_store(&pattern, Regex::new, &store).unwrap());
        }

        // The insertion that triggered the GC is itself not cleaned up yet, but everything else is
        assert_eq!(store_len(&store), 1);
    }
}