saluki_common/
scrubber.rs

1//! A YAML scrubber for redacting sensitive information.
2
3use std::io::{BufRead, BufReader};
4use std::sync::OnceLock;
5
6use regex::bytes::Regex;
7
// Lazily-initialized cell backing `comment_regex()`; the pattern is compiled on first use.
static COMMENT_REGEX: OnceLock<Regex> = OnceLock::new();
// Lazily-initialized cell backing `blank_regex()`; the pattern is compiled on first use.
static BLANK_REGEX: OnceLock<Regex> = OnceLock::new();
10
11fn comment_regex() -> &'static Regex {
12    COMMENT_REGEX.get_or_init(|| Regex::new(r"^\s*#.*$").unwrap())
13}
14
15fn blank_regex() -> &'static Regex {
16    BLANK_REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
17}
18
/// Boxed callback used for computed replacements: takes the matched bytes and returns
/// the bytes to substitute. `Send + Sync` so it can be stored in a shared `Scrubber`.
type ReplFunc = Box<dyn Fn(&[u8]) -> Vec<u8> + Send + Sync>;
20
/// Defines a rule for scrubbing sensitive information.
///
/// A rule pairs a `regex` with either a fixed replacement (`repl`) or a callback
/// (`repl_func`). When both are set the scrubber uses `repl_func`; when neither is set
/// the rule leaves the text untouched.
pub struct Replacer {
    /// `regex` must match the sensitive information within a value. A rule whose
    /// `regex` is `None` is skipped by the scrubber.
    pub regex: Option<Regex>,

    /// `hints`, if given and non-empty, are strings of which at least one must be
    /// present in the text for the `regex` to be applied. This can be used to limit
    /// the contexts where an otherwise very broad `regex` is actually applied.
    /// `None` (or an empty list) means the `regex` is always applied.
    pub hints: Option<Vec<String>>,

    /// `repl` is the byte slice to replace the substring matching `regex`. It can use
    /// the `regex` crate's replacement-string syntax (e.g., `$1` to refer to the
    /// first capture group).
    pub repl: Option<Vec<u8>>,

    /// `repl_func`, if set, is called with the full matched byte slice. The return
    /// value is used as the replacement. Only one of `repl` and `repl_func` should be
    /// set; if both are, `repl_func` takes precedence.
    pub repl_func: Option<ReplFunc>,
}
40
41static DEFAULT_SCRUBBER: OnceLock<Scrubber> = OnceLock::new();
42
43/// Returns a reference to the default, lazily-initialized global scrubber.
44///
45/// This function ensures that the default scrubber, with its associated regex compilation,
46/// is only initialized once for the lifetime of the application.
47pub fn default_scrubber() -> &'static Scrubber {
48    DEFAULT_SCRUBBER.get_or_init(Scrubber::default)
49}
50
51impl Default for Scrubber {
52    fn default() -> Self {
53        let hinted_api_key_replacer = Replacer {
54            regex: Some(Regex::new(r"(api_?key=)[a-zA-Z0-9]+([a-zA-Z0-9]{5})\b").unwrap()),
55            repl: Some(b"$1***************************$2".to_vec()),
56            hints: Some(vec!["api_key".to_string(), "apikey".to_string()]),
57            repl_func: None,
58        };
59
60        let hinted_app_key_replacer = Replacer {
61            regex: Some(Regex::new(r"(ap(?:p|plication)_?key=)[a-zA-Z0-9]+([a-zA-Z0-9]{5})\b").unwrap()),
62            repl: Some(b"$1***********************************$2".to_vec()),
63            hints: Some(vec![
64                "appkey".to_string(),
65                "app_key".to_string(),
66                "application_key".to_string(),
67            ]),
68            repl_func: None,
69        };
70
71        // Non-hinted API key replacer: matches 32 hex chars, keeps last 5
72        let api_key_replacer = Replacer {
73            regex: Some(Regex::new(r"\b[a-fA-F0-9]{27}([a-fA-F0-9]{5})\b").unwrap()),
74            repl: Some(b"***************************$1".to_vec()),
75            hints: None,
76            repl_func: None,
77        };
78
79        // YAML-specific replacers that are aware of quotes and other syntax
80        let api_key_replacer_yaml = Replacer {
81            regex: Some(Regex::new(r#"(\-|\:|,|\[|\{)(\s+)?\b[a-fA-F0-9]{27}([a-fA-F0-9]{5})\b"#).unwrap()),
82            repl: Some(b"$1$2\"***************************$3\"".to_vec()),
83            hints: None,
84            repl_func: None,
85        };
86
87        let app_key_replacer_yaml = Replacer {
88            regex: Some(Regex::new(r#"(\-|\:|,|\[|\{)(\s+)?\b[a-fA-F0-9]{35}([a-fA-F0-9]{5})\b"#).unwrap()),
89            repl: Some(b"$1$2\"***********************************$3\"".to_vec()),
90            hints: None,
91            repl_func: None,
92        };
93
94        let app_key_replacer = Replacer {
95            regex: Some(Regex::new(r"\b[a-fA-F0-9]{35}([a-fA-F0-9]{5})\b").unwrap()),
96            repl: Some(b"***********************************$1".to_vec()),
97            hints: None,
98            repl_func: None,
99        };
100
101        // Replacer for DDRCM App Key
102        let rc_app_key_replacer = Replacer {
103            regex: Some(Regex::new(r"\bDDRCM_[A-Z0-9]+([A-Z0-9]{5})\b").unwrap()),
104            repl: Some(b"***********************************$1".to_vec()),
105            hints: None,
106            repl_func: None,
107        };
108
109        // Replacer for URI passwords (e.g., protocol://user:password@host)
110        let uri_password_replacer = Replacer {
111            regex: Some(Regex::new(r#"(?i)([a-z][a-z0-9+-.]+://|\b)([^:\s]+):([^\s|"]+)@"#).unwrap()),
112            repl: Some(b"$1$2:********@".to_vec()),
113            hints: None,
114            repl_func: None,
115        };
116
117        let password_replacer = Replacer {
118            regex: Some(Regex::new(r#"(?i)(\"?(?:pass(?:word)?|pswd|pwd)\"?)((?:=| = |: )\"?)([0-9A-Za-z#!$%&'()*+,\-./:;<=>?@\[\\\]^_{|}~]+)"#).unwrap()),
119            repl: Some(b"$1$2********".to_vec()),
120            hints: None,
121            repl_func: None,
122        };
123
124        Self {
125            replacers: vec![
126                hinted_api_key_replacer,
127                hinted_app_key_replacer,
128                api_key_replacer_yaml,
129                app_key_replacer_yaml,
130                api_key_replacer,
131                app_key_replacer,
132                rc_app_key_replacer,
133                uri_password_replacer,
134                password_replacer,
135            ],
136        }
137    }
138}
139
/// A YAML scrubber that can be configured with different replacers.
///
/// Use `Scrubber::new` plus `add_replacer` for a custom rule set, or
/// `Scrubber::default` / `default_scrubber` for the built-in one.
pub struct Scrubber {
    // Rules applied to each line, in insertion order.
    replacers: Vec<Replacer>,
}
144
145impl Scrubber {
146    /// Creates a new `Scrubber` with no replacers.
147    pub fn new() -> Self {
148        Self { replacers: vec![] }
149    }
150
151    /// Adds a replacer to the scrubber.
152    pub fn add_replacer(&mut self, replacer: Replacer) {
153        self.replacers.push(replacer);
154    }
155
156    /// Scrubs sensitive data from a byte slice.
157    ///
158    /// This method will scrub the data, returning a new byte vector.
159    pub fn scrub_bytes(&self, data: &[u8]) -> Vec<u8> {
160        let mut reader = BufReader::new(data);
161        self.scrub_reader(&mut reader)
162    }
163
164    fn scrub_reader(&self, reader: &mut BufReader<&[u8]>) -> Vec<u8> {
165        let mut scrubbed_lines = Vec::new();
166        let mut line = Vec::new();
167        let mut first = true;
168        while let Ok(bytes_read) = reader.read_until(b'\n', &mut line) {
169            if bytes_read == 0 {
170                break; // EOF
171            }
172
173            if blank_regex().is_match(&line) {
174                scrubbed_lines.push(b"\n".to_vec());
175            } else if !comment_regex().is_match(&line) {
176                let b = self.scrub(&line, &self.replacers);
177                if !first {
178                    scrubbed_lines.push(b"\n".to_vec());
179                }
180                scrubbed_lines.push(b);
181                first = false;
182            }
183            line.clear();
184        }
185        scrubbed_lines.join(&b'\n')
186    }
187
188    /// Applies the replacers to the data.
189    fn scrub(&self, data: &[u8], replacers: &[Replacer]) -> Vec<u8> {
190        let mut scrubbed_data = data.to_vec();
191        for replacer in replacers {
192            if replacer.regex.is_none() {
193                continue;
194            }
195
196            let contains_hint = if let Some(hints) = &replacer.hints {
197                hints.iter().any(|hint| {
198                    let needle = hint.as_bytes();
199                    data.windows(needle.len()).any(|window| window == needle)
200                })
201            } else {
202                false
203            };
204
205            if replacer.hints.as_ref().is_none_or(|h| h.is_empty() || contains_hint) {
206                if let Some(re) = &replacer.regex {
207                    if let Some(repl_func) = &replacer.repl_func {
208                        scrubbed_data = re
209                            .replace_all(&scrubbed_data, |caps: &regex::bytes::Captures| repl_func(&caps[0]))
210                            .into_owned();
211                    } else if let Some(repl) = &replacer.repl {
212                        scrubbed_data = re.replace_all(&scrubbed_data, repl.as_slice()).into_owned();
213                    }
214                }
215            }
216        }
217        scrubbed_data
218    }
219}
220
#[cfg(test)]
mod tests {
    use super::*;

    /// Scrubs `contents` with the default scrubber and checks the result against
    /// `clean_contents`. Both sides are trimmed before comparison, so leading and
    /// trailing whitespace differences are ignored.
    fn assert_clean(contents: &str, clean_contents: &str) {
        let scrubber = default_scrubber();
        let cleaned = scrubber.scrub_bytes(contents.as_bytes());
        let cleaned_string = String::from_utf8(cleaned).unwrap();
        assert_eq!(cleaned_string.trim(), clean_contents.trim());
    }

    #[test]
    fn test_config_strip_api_key() {
        assert_clean(
            "api_key: aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "api_key: \"***************************abbbb\"",
        );
        assert_clean(
            "api_key: AAAAAAAAAAAAAAAAAAAAAAAAAAAABBBB",
            "api_key: \"***************************ABBBB\"",
        );
        // Already double-quoted value: the existing quotes are kept, none are added.
        assert_clean(
            "api_key: \"aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb\"",
            "api_key: \"***************************abbbb\"",
        );
        assert_clean(
            "api_key: 'aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'",
            "api_key: '***************************abbbb'",
        );
        assert_clean(
            "   api_key:   'aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'   ",
            "   api_key:   '***************************abbbb'   ",
        );
    }

    #[test]
    fn test_config_app_key() {
        assert_clean(
            "app_key: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "app_key: \"***********************************abbbb\"",
        );
        assert_clean(
            "app_key: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBB",
            "app_key: \"***********************************ABBBB\"",
        );
        assert_clean(
            "app_key: \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb\"",
            "app_key: \"***********************************abbbb\"",
        );
        assert_clean(
            "app_key: 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'",
            "app_key: '***********************************abbbb'",
        );
        assert_clean(
            "   app_key:   'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'   ",
            "   app_key:   '***********************************abbbb'   ",
        );
    }

    #[test]
    fn test_config_rc_app_key() {
        assert_clean(
            "key: \"DDRCM_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABCDE\"",
            "key: \"***********************************ABCDE\"",
        );
    }

    #[test]
    fn test_text_strip_api_key() {
        assert_clean(
            "Error status code 500 : http://dog.tld/api?key=3290abeefc68e1bbe852a25252bad88c",
            "Error status code 500 : http://dog.tld/api?key=***************************ad88c",
        );
        assert_clean(
            "hintedAPIKeyReplacer : http://dog.tld/api_key=InvalidLength12345abbbb",
            "hintedAPIKeyReplacer : http://dog.tld/api_key=***************************abbbb",
        );
        assert_clean(
            "hintedAPIKeyReplacer : http://dog.tld/apikey=InvalidLength12345abbbb",
            "hintedAPIKeyReplacer : http://dog.tld/apikey=***************************abbbb",
        );
        assert_clean(
            "apiKeyReplacer: https://agent-http-intake.logs.datadoghq.com/v1/input/aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "apiKeyReplacer: https://agent-http-intake.logs.datadoghq.com/v1/input/***************************abbbb",
        );
    }

    #[test]
    fn test_config_strip_url_password() {
        assert_clean(
            "proxy: random_url_key: http://user:password@host:port",
            "proxy: random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key http://user:password@host:port",
            "random_url_key http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:password@host:port",
            "random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:p@ssw0r)@host:port",
            "random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:🔑🔒🔐🔓@host:port",
            "random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:password@host",
            "random_url_key: http://user:********@host",
        );
        assert_clean(
            "random_url_key: protocol://user:p@ssw0r)@host:port",
            "random_url_key: protocol://user:********@host:port",
        );
        assert_clean(
            "random_url_key: \"http://user:password@host:port\"",
            "random_url_key: \"http://user:********@host:port\"",
        );
        assert_clean(
            "random_url_key: 'http://user:password@host:port'",
            "random_url_key: 'http://user:********@host:port'",
        );
        assert_clean(
            "random_domain_key: 'user:password@host:port'",
            "random_domain_key: 'user:********@host:port'",
        );
        assert_clean(
            "   random_url_key:   'http://user:password@host:port'   ",
            "   random_url_key:   'http://user:********@host:port'   ",
        );
        assert_clean(
            "   random_url_key:   'mongodb+s.r-v://user:password@host:port'   ",
            "   random_url_key:   'mongodb+s.r-v://user:********@host:port'   ",
        );
        assert_clean(
            "   random_url_key:   'mongodb+srv://user:pass-with-hyphen@abc.example.com/database'   ",
            "   random_url_key:   'mongodb+srv://user:********@abc.example.com/database'   ",
        );
    }

    #[test]
    fn test_text_strip_app_key() {
        assert_clean(
            "hintedAPPKeyReplacer : http://dog.tld/app_key=InvalidLength12345abbbb",
            "hintedAPPKeyReplacer : http://dog.tld/app_key=***********************************abbbb",
        );
        assert_clean(
            "hintedAPPKeyReplacer : http://dog.tld/appkey=InvalidLength12345abbbb",
            "hintedAPPKeyReplacer : http://dog.tld/appkey=***********************************abbbb",
        );
        assert_clean(
            "hintedAPPKeyReplacer : http://dog.tld/application_key=InvalidLength12345abbbb",
            "hintedAPPKeyReplacer : http://dog.tld/application_key=***********************************abbbb",
        );
        assert_clean(
            "appKeyReplacer: http://dog.tld/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "appKeyReplacer: http://dog.tld/***********************************abbbb",
        );
    }
}
381        );
382    }
383}