dd_sds/
validation.rs

1use crate::normalization::rust_regex_adapter::{convert_to_rust_regex, QUANTIFIER_LIMIT};
2use crate::parser::error::ParseError;
3use regex_automata::meta::{self};
4use thiserror::Error;
5
6#[derive(Debug, PartialEq, Eq, Error)]
7pub enum RegexValidationError {
8    #[error("Invalid regex syntax")]
9    InvalidSyntax,
10
11    #[error("The regex pattern is nested too deeply")]
12    ExceededDepthLimit,
13
14    #[error("The regex has exceeded the complexity limit (try simplifying the regex)")]
15    TooComplex,
16
17    #[error("Regex patterns are not allowed to match an empty string")]
18    MatchesEmptyString,
19
20    #[error("Regex quantifier is too high. Max is {}", QUANTIFIER_LIMIT)]
21    ExceededQuantifierLimit,
22}
23
24impl From<ParseError> for RegexValidationError {
25    fn from(err: ParseError) -> Self {
26        match err {
27            ParseError::InvalidSyntax => Self::InvalidSyntax,
28            ParseError::ExceededDepthLimit => Self::ExceededDepthLimit,
29            ParseError::ExceededQuantifierLimit => Self::ExceededQuantifierLimit,
30        }
31    }
32}
33
34const REGEX_COMPLEXITY_LIMIT: usize = 1_000_000;
35
36/// Checks that a regex pattern is valid for using in an SDS scanner
37pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
38    // This is the same as `validate_and_create_regex`, but removes the actual Regex type
39    // to create a more stable API for external users of the crate.
40    validate_and_create_regex(input).map(|_| ())
41}
42
43pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
44    // The regex crate doesn't directly give you access to the "complexity", but it does
45    // reject if it's too large, so we can binary search to find the limit.
46
47    let converted_pattern = convert_to_rust_regex(input)?;
48
49    let mut low = 1;
50    // Allow it to go a bit higher than the normal limit, since this can be used for debugging
51    // how complex a regex is.
52    let mut high = 10 * REGEX_COMPLEXITY_LIMIT;
53    while low < high {
54        let mid = low + (high - low) / 2;
55        if is_regex_within_complexity_limit(&converted_pattern, mid)? {
56            high = mid;
57        } else {
58            low = mid + 1;
59        }
60    }
61    Ok(low)
62}
63
64pub fn validate_and_create_regex(input: &str) -> Result<meta::Regex, RegexValidationError> {
65    // This validates that the syntax is valid and normalizes behavior.
66    let converted_pattern = convert_to_rust_regex(input)?;
67    let regex = build_regex(&converted_pattern, REGEX_COMPLEXITY_LIMIT)?;
68    if regex.is_match("") {
69        return Err(RegexValidationError::MatchesEmptyString);
70    }
71    Ok(regex)
72}
73
74fn is_regex_within_complexity_limit(
75    converted_pattern: &str,
76    complexity_limit: usize,
77) -> Result<bool, RegexValidationError> {
78    match build_regex(converted_pattern, complexity_limit) {
79        Ok(_) => Ok(true),
80        Err(err) => match err {
81            RegexValidationError::TooComplex => Ok(false),
82            _ => Err(err),
83        },
84    }
85}
86
87fn build_regex(
88    converted_pattern: &str,
89    complexity_limit: usize,
90) -> Result<meta::Regex, RegexValidationError> {
91    meta::Builder::new()
92        .configure(
93            meta::Config::new()
94                .nfa_size_limit(Some(complexity_limit))
95                // This is purely a performance setting. This is the default, but it might be worth testing this in benchmarks for a better number
96                .hybrid_cache_capacity(2 * (1 << 20)),
97        )
98        .syntax(
99            regex_automata::util::syntax::Config::default()
100                .dot_matches_new_line(false)
101                .unicode(true),
102        )
103        .build(converted_pattern)
104        .map_err(|regex_err| {
105            if regex_err.size_limit().is_some() {
106                RegexValidationError::TooComplex
107            } else {
108                // Internally the `regex` crate does this conversion, so we do it too
109                RegexValidationError::InvalidSyntax
110            }
111        })
112}
113
114#[cfg(test)]
115mod test {
116    use crate::validation::{
117        get_regex_complexity_estimate_very_slow, validate_and_create_regex, validate_regex,
118        RegexValidationError,
119    };
120
121    #[test]
122    fn pattern_matching_empty_string_is_invalid() {
123        // simple case that matches (only) empty string
124        assert_eq!(
125            validate_regex(""),
126            Err(RegexValidationError::MatchesEmptyString)
127        );
128        assert_eq!(
129            // This is an alternation with an empty string for the right side (which matches anything)
130            validate_regex("a|"),
131            Err(RegexValidationError::MatchesEmptyString)
132        );
133
134        // A subset of the regex _can_ match the empty string, as long as the entire pattern does not
135        assert!(validate_regex("(a|)b").is_ok(),);
136    }
137
138    #[test]
139    fn too_complex_pattern_is_rejected() {
140        assert_eq!(
141            validate_regex(".{1000}{1000}"),
142            Err(RegexValidationError::TooComplex)
143        );
144    }
145
146    #[test]
147    fn high_repetition_pattern_is_rejected() {
148        assert_eq!(
149            validate_regex(".{10000}"),
150            Err(RegexValidationError::ExceededQuantifierLimit)
151        );
152    }
153
154    #[test]
155    fn test_invalid_range_quantifiers() {
156        assert_eq!(
157            validate_regex(".{100,1}"),
158            Err(RegexValidationError::InvalidSyntax)
159        );
160    }
161
162    #[test]
163    fn dot_new_line() {
164        let regex = validate_and_create_regex(".").unwrap();
165        assert!(!regex.is_match("\n"));
166    }
167
168    #[test]
169    fn test_complexity() {
170        // These values may change slightly when the `regex` crate is updated. As long as they
171        // are close, the test should be updated, otherwise the complexity limit may need to
172        // be adjusted.
173
174        assert_eq!(get_regex_complexity_estimate_very_slow("x"), Ok(1));
175        assert_eq!(get_regex_complexity_estimate_very_slow("x{1,10}"), Ok(920));
176        assert_eq!(get_regex_complexity_estimate_very_slow("."), Ok(1144));
177        assert_eq!(
178            get_regex_complexity_estimate_very_slow(".{1,10}"),
179            Ok(10536)
180        );
181        assert_eq!(
182            get_regex_complexity_estimate_very_slow(".{1,1000}"),
183            Ok(1_040_136)
184        );
185    }
186}