Skip to main content

saluki_common/
scrubber.rs

1//! A YAML scrubber for redacting sensitive information.
2
3use std::io::{BufRead, BufReader};
4use std::sync::OnceLock;
5
6use regex::bytes::Regex;
7
8static COMMENT_REGEX: OnceLock<Regex> = OnceLock::new();
9static BLANK_REGEX: OnceLock<Regex> = OnceLock::new();
10
11fn comment_regex() -> &'static Regex {
12    COMMENT_REGEX.get_or_init(|| Regex::new(r"^\s*#.*$").unwrap())
13}
14
15fn blank_regex() -> &'static Regex {
16    BLANK_REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
17}
18
/// Callback that computes a replacement: it receives the bytes of the whole regex match and
/// returns the bytes to substitute for them.
type ReplFunc = Box<dyn Fn(&[u8]) -> Vec<u8> + Send + Sync>;

/// Defines a rule for scrubbing sensitive information.
///
/// The scrubber skips a replacer whose `regex` is `None`, and performs no substitution when
/// neither `repl` nor `repl_func` is set.
pub struct Replacer {
    /// `regex` must match the sensitive information within a value.
    pub regex: Option<Regex>,

    /// `hints`, if given, are strings which must also be present in the text for the
    /// `regex` to match. This can be used to limit the contexts where an otherwise
    /// very broad `regex` is actually applied.
    ///
    /// Hints are compared as raw byte substrings, and a single matching hint is enough.
    /// `None` or an empty list means the `regex` is always applied.
    pub hints: Option<Vec<String>>,

    /// `repl` is the byte slice to replace the substring matching `regex`. It can use
    /// the `regex` crate's replacement-string syntax (for example, `$1` to refer to the
    /// first capture group).
    pub repl: Option<Vec<u8>>,

    /// `repl_func`, if set, is called with the matched byte slice. The return value
    /// is used as the replacement. Only one of `repl` and `repl_func` should be set;
    /// if both are set, `repl_func` is the one that gets used.
    pub repl_func: Option<ReplFunc>,
}
40
41static DEFAULT_SCRUBBER: OnceLock<Scrubber> = OnceLock::new();
42
43/// Returns a reference to the default, lazily-initialized global scrubber.
44///
45/// This function ensures that the default scrubber, with its associated regex compilation,
46/// is only initialized once for the lifetime of the application.
47pub fn default_scrubber() -> &'static Scrubber {
48    DEFAULT_SCRUBBER.get_or_init(Scrubber::default)
49}
50
51impl Default for Scrubber {
52    fn default() -> Self {
53        let hinted_api_key_replacer = Replacer {
54            regex: Some(Regex::new(r"(api_?key=)[a-zA-Z0-9]+([a-zA-Z0-9]{5})\b").unwrap()),
55            repl: Some(b"$1***************************$2".to_vec()),
56            hints: Some(vec!["api_key".to_string(), "apikey".to_string()]),
57            repl_func: None,
58        };
59
60        let hinted_app_key_replacer = Replacer {
61            regex: Some(Regex::new(r"(ap(?:p|plication)_?key=)[a-zA-Z0-9]+([a-zA-Z0-9]{5})\b").unwrap()),
62            repl: Some(b"$1***********************************$2".to_vec()),
63            hints: Some(vec![
64                "appkey".to_string(),
65                "app_key".to_string(),
66                "application_key".to_string(),
67            ]),
68            repl_func: None,
69        };
70
71        // Non-hinted API key replacer: matches 32 hex chars, keeps last 5
72        let api_key_replacer = Replacer {
73            regex: Some(Regex::new(r"\b[a-fA-F0-9]{27}([a-fA-F0-9]{5})\b").unwrap()),
74            repl: Some(b"***************************$1".to_vec()),
75            hints: None,
76            repl_func: None,
77        };
78
79        // YAML-specific replacers that are aware of quotes and other syntax
80        let api_key_replacer_yaml = Replacer {
81            regex: Some(Regex::new(r#"(\-|\:|,|\[|\{)(\s+)?\b[a-fA-F0-9]{27}([a-fA-F0-9]{5})\b"#).unwrap()),
82            repl: Some(b"$1$2\"***************************$3\"".to_vec()),
83            hints: None,
84            repl_func: None,
85        };
86
87        let app_key_replacer_yaml = Replacer {
88            regex: Some(Regex::new(r#"(\-|\:|,|\[|\{)(\s+)?\b[a-fA-F0-9]{35}([a-fA-F0-9]{5})\b"#).unwrap()),
89            repl: Some(b"$1$2\"***********************************$3\"".to_vec()),
90            hints: None,
91            repl_func: None,
92        };
93
94        let app_key_replacer = Replacer {
95            regex: Some(Regex::new(r"\b[a-fA-F0-9]{35}([a-fA-F0-9]{5})\b").unwrap()),
96            repl: Some(b"***********************************$1".to_vec()),
97            hints: None,
98            repl_func: None,
99        };
100
101        // Replacer for DDRCM App Key
102        let rc_app_key_replacer = Replacer {
103            regex: Some(Regex::new(r"\bDDRCM_[A-Z0-9]+([A-Z0-9]{5})\b").unwrap()),
104            repl: Some(b"***********************************$1".to_vec()),
105            hints: None,
106            repl_func: None,
107        };
108
109        // Replacer for URI passwords (for example, protocol://user:password@host)
110        let uri_password_replacer = Replacer {
111            regex: Some(Regex::new(r#"(?i)([a-z][a-z0-9+-.]+://|\b)([^:\s]+):([^\s|"]+)@"#).unwrap()),
112            repl: Some(b"$1$2:********@".to_vec()),
113            hints: None,
114            repl_func: None,
115        };
116
117        // Capture the optional closing `"` as $4 so the replacement preserves it for JSON values without breaking
118        // unquoted values (plain text / YAML). Without $4, `"password":"secret"` → `"password":"********` (invalid JSON).
119        // `:[ ]?` matches both compact JSON (`"password":"secret"`) and spaced YAML (`password: secret`).
120        let password_replacer = Replacer {
121            regex: Some(Regex::new(r#"(?i)(\"?(?:pass(?:word)?|pswd|pwd)\"?)((?:=| = |:[ ]?)\"?)([0-9A-Za-z#!$%&'()*+,\-./:;<=>?@\[\\\]^_{|}~]+)(\"?)"#).unwrap()),
122            repl: Some(b"$1$2********$4".to_vec()),
123            hints: None,
124            repl_func: None,
125        };
126
127        Self {
128            replacers: vec![
129                hinted_api_key_replacer,
130                hinted_app_key_replacer,
131                api_key_replacer_yaml,
132                app_key_replacer_yaml,
133                api_key_replacer,
134                app_key_replacer,
135                rc_app_key_replacer,
136                uri_password_replacer,
137                password_replacer,
138            ],
139        }
140    }
141}
142
143/// A YAML scrubber that can be configured with different replacers.
144pub struct Scrubber {
145    replacers: Vec<Replacer>,
146}
147
148impl Scrubber {
149    /// Creates a new `Scrubber` with no replacers.
150    pub fn new() -> Self {
151        Self { replacers: vec![] }
152    }
153
154    /// Adds a replacer to the scrubber.
155    pub fn add_replacer(&mut self, replacer: Replacer) {
156        self.replacers.push(replacer);
157    }
158
159    /// Scrubs sensitive data from a byte slice.
160    ///
161    /// This method will scrub the data, returning a new byte vector.
162    pub fn scrub_bytes(&self, data: &[u8]) -> Vec<u8> {
163        let mut reader = BufReader::new(data);
164        self.scrub_reader(&mut reader)
165    }
166
167    fn scrub_reader(&self, reader: &mut BufReader<&[u8]>) -> Vec<u8> {
168        let mut scrubbed_lines = Vec::new();
169        let mut line = Vec::new();
170        let mut first = true;
171        while let Ok(bytes_read) = reader.read_until(b'\n', &mut line) {
172            if bytes_read == 0 {
173                break; // EOF
174            }
175
176            if blank_regex().is_match(&line) {
177                scrubbed_lines.push(b"\n".to_vec());
178            } else if !comment_regex().is_match(&line) {
179                let b = self.scrub(&line, &self.replacers);
180                if !first {
181                    scrubbed_lines.push(b"\n".to_vec());
182                }
183                scrubbed_lines.push(b);
184                first = false;
185            }
186            line.clear();
187        }
188        scrubbed_lines.join(&b'\n')
189    }
190
191    /// Applies the replacers to the data.
192    fn scrub(&self, data: &[u8], replacers: &[Replacer]) -> Vec<u8> {
193        let mut scrubbed_data = data.to_vec();
194        for replacer in replacers {
195            if replacer.regex.is_none() {
196                continue;
197            }
198
199            let contains_hint = if let Some(hints) = &replacer.hints {
200                hints.iter().any(|hint| {
201                    let needle = hint.as_bytes();
202                    data.windows(needle.len()).any(|window| window == needle)
203                })
204            } else {
205                false
206            };
207
208            if replacer.hints.as_ref().is_none_or(|h| h.is_empty() || contains_hint) {
209                if let Some(re) = &replacer.regex {
210                    if let Some(repl_func) = &replacer.repl_func {
211                        scrubbed_data = re
212                            .replace_all(&scrubbed_data, |caps: &regex::bytes::Captures| repl_func(&caps[0]))
213                            .into_owned();
214                    } else if let Some(repl) = &replacer.repl {
215                        scrubbed_data = re.replace_all(&scrubbed_data, repl.as_slice()).into_owned();
216                    }
217                }
218            }
219        }
220        scrubbed_data
221    }
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Scrubs `contents` with the default scrubber and asserts the result equals
    /// `clean_contents`, ignoring leading/trailing whitespace on both sides.
    fn assert_clean(contents: &str, clean_contents: &str) {
        let scrubber = default_scrubber();
        let cleaned = scrubber.scrub_bytes(contents.as_bytes());
        let cleaned_string = String::from_utf8(cleaned).unwrap();
        assert_eq!(cleaned_string.trim(), clean_contents.trim());
    }

    #[test]
    fn test_config_strip_api_key() {
        assert_clean(
            "api_key: aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "api_key: \"***************************abbbb\"",
        );
        assert_clean(
            "api_key: AAAAAAAAAAAAAAAAAAAAAAAAAAAABBBB",
            "api_key: \"***************************ABBBB\"",
        );
        // Double-quoted value (mirrors the equivalent case in `test_config_app_key`).
        assert_clean(
            "api_key: \"aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb\"",
            "api_key: \"***************************abbbb\"",
        );
        assert_clean(
            "api_key: 'aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'",
            "api_key: '***************************abbbb'",
        );
        assert_clean(
            "   api_key:   'aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'   ",
            "   api_key:   '***************************abbbb'   ",
        );
    }

    #[test]
    fn test_config_app_key() {
        assert_clean(
            "app_key: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "app_key: \"***********************************abbbb\"",
        );
        assert_clean(
            "app_key: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBB",
            "app_key: \"***********************************ABBBB\"",
        );
        assert_clean(
            "app_key: \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb\"",
            "app_key: \"***********************************abbbb\"",
        );
        assert_clean(
            "app_key: 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'",
            "app_key: '***********************************abbbb'",
        );
        assert_clean(
            "   app_key:   'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb'   ",
            "   app_key:   '***********************************abbbb'   ",
        );
    }

    #[test]
    fn test_config_rc_app_key() {
        assert_clean(
            "key: \"DDRCM_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABCDE\"",
            "key: \"***********************************ABCDE\"",
        );
    }

    #[test]
    fn test_text_strip_api_key() {
        assert_clean(
            "Error status code 500 : http://dog.tld/api?key=3290abeefc68e1bbe852a25252bad88c",
            "Error status code 500 : http://dog.tld/api?key=***************************ad88c",
        );
        assert_clean(
            "hintedAPIKeyReplacer : http://dog.tld/api_key=InvalidLength12345abbbb",
            "hintedAPIKeyReplacer : http://dog.tld/api_key=***************************abbbb",
        );
        assert_clean(
            "hintedAPIKeyReplacer : http://dog.tld/apikey=InvalidLength12345abbbb",
            "hintedAPIKeyReplacer : http://dog.tld/apikey=***************************abbbb",
        );
        assert_clean(
            "apiKeyReplacer: https://agent-http-intake.logs.datadoghq.com/v1/input/aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "apiKeyReplacer: https://agent-http-intake.logs.datadoghq.com/v1/input/***************************abbbb",
        );
    }

    #[test]
    fn test_config_strip_url_password() {
        assert_clean(
            "proxy: random_url_key: http://user:password@host:port",
            "proxy: random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key http://user:password@host:port",
            "random_url_key http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:password@host:port",
            "random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:p@ssw0r)@host:port",
            "random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:🔑🔒🔐🔓@host:port",
            "random_url_key: http://user:********@host:port",
        );
        assert_clean(
            "random_url_key: http://user:password@host",
            "random_url_key: http://user:********@host",
        );
        assert_clean(
            "random_url_key: protocol://user:p@ssw0r)@host:port",
            "random_url_key: protocol://user:********@host:port",
        );
        assert_clean(
            "random_url_key: \"http://user:password@host:port\"",
            "random_url_key: \"http://user:********@host:port\"",
        );
        assert_clean(
            "random_url_key: 'http://user:password@host:port'",
            "random_url_key: 'http://user:********@host:port'",
        );
        assert_clean(
            "random_domain_key: 'user:password@host:port'",
            "random_domain_key: 'user:********@host:port'",
        );
        assert_clean(
            "   random_url_key:   'http://user:password@host:port'   ",
            "   random_url_key:   'http://user:********@host:port'   ",
        );
        assert_clean(
            "   random_url_key:   'mongodb+s.r-v://user:password@host:port'   ",
            "   random_url_key:   'mongodb+s.r-v://user:********@host:port'   ",
        );
        assert_clean(
            "   random_url_key:   'mongodb+srv://user:pass-with-hyphen@abc.example.com/database'   ",
            "   random_url_key:   'mongodb+srv://user:********@abc.example.com/database'   ",
        );
    }

    #[test]
    fn test_password_yaml_double_quoted_value() {
        assert_clean("password: \"supersecret\"", "password: \"********\"");
    }

    #[test]
    fn test_password_unquoted_value_still_scrubbed() {
        assert_clean("password=supersecret", "password=********");
        assert_clean("password: supersecret", "password: ********");
    }

    #[test]
    fn test_json_password_like_key_scrubs_to_valid_json() {
        let scrubber = default_scrubber();
        // spaced (pretty-printed JSON / YAML)
        let input = r#"{"mysql_password": "supersecret"}"#;
        let cleaned = String::from_utf8(scrubber.scrub_bytes(input.as_bytes())).unwrap();
        serde_json::from_str::<serde_json::Value>(&cleaned).expect("scrubbed JSON must parse");
        assert!(cleaned.contains("********"));

        // compact JSON (no space after colon)
        let input_compact = r#"{"password":"secret"}"#;
        let cleaned_compact = String::from_utf8(scrubber.scrub_bytes(input_compact.as_bytes())).unwrap();
        serde_json::from_str::<serde_json::Value>(&cleaned_compact).expect("compact scrubbed JSON must parse");
        assert!(
            cleaned_compact.contains("********"),
            "compact JSON password must be scrubbed: {cleaned_compact}"
        );
    }

    #[test]
    fn test_json_single_line_api_key_scrub() {
        let scrubber = default_scrubber();
        let input = r#"{"api_key":"aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb"}"#;
        let cleaned = scrubber.scrub_bytes(input.as_bytes());
        let cleaned_string = String::from_utf8(cleaned).unwrap();
        // Must remain valid JSON after scrubbing (regex YAML-style replacers must not corrupt JSON).
        serde_json::from_str::<serde_json::Value>(&cleaned_string).expect("scrubbed output must parse as JSON");
        assert!(
            cleaned_string.contains("***************************"),
            "expected masked api key suffix, got: {cleaned_string}"
        );
    }

    #[test]
    fn test_large_single_line_json_scrubbed_still_parses() {
        // A single line well beyond 16 KiB must come back intact and parseable.
        let mut map = serde_json::Map::new();
        map.insert("api_key".into(), serde_json::json!("aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb"));
        map.insert("pad".into(), serde_json::json!("x".repeat(25_000)));
        let line = serde_json::to_string(&serde_json::Value::Object(map)).unwrap();
        assert!(line.len() > 16_384, "sanity: payload should exceed 16 KiB");

        let scrubber = default_scrubber();
        let cleaned = scrubber.scrub_bytes(line.as_bytes());
        let cleaned_string = String::from_utf8(cleaned).unwrap();
        serde_json::from_str::<serde_json::Value>(&cleaned_string).expect("JSON parse after scrub");
    }

    #[test]
    fn test_text_strip_app_key() {
        assert_clean(
            "hintedAPPKeyReplacer : http://dog.tld/app_key=InvalidLength12345abbbb",
            "hintedAPPKeyReplacer : http://dog.tld/app_key=***********************************abbbb",
        );
        assert_clean(
            "hintedAPPKeyReplacer : http://dog.tld/appkey=InvalidLength12345abbbb",
            "hintedAPPKeyReplacer : http://dog.tld/appkey=***********************************abbbb",
        );
        assert_clean(
            "hintedAPPKeyReplacer : http://dog.tld/application_key=InvalidLength12345abbbb",
            "hintedAPPKeyReplacer : http://dog.tld/application_key=***********************************abbbb",
        );
        assert_clean(
            "appKeyReplacer: http://dog.tld/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbb",
            "appKeyReplacer: http://dog.tld/***********************************abbbb",
        );
    }
}