dd_sds/
validation.rs

1use crate::normalization::rust_regex_adapter::{
2    convert_to_rust_regex, convert_to_rust_regex_ast, QUANTIFIER_LIMIT,
3};
4use crate::parser::error::ParseError;
5use regex_automata::meta::{self};
6use regex_syntax::ast::Ast;
7use regex_syntax::hir::translate::Translator;
8use thiserror::Error;
9
10#[derive(Debug, PartialEq, Eq, Error)]
11pub enum RegexValidationError {
12    #[error("Invalid regex syntax")]
13    InvalidSyntax,
14
15    #[error("The regex pattern is nested too deeply")]
16    ExceededDepthLimit,
17
18    #[error("The regex has exceeded the complexity limit (try simplifying the regex)")]
19    TooComplex,
20
21    #[error("Regex patterns are not allowed to match an empty string")]
22    MatchesEmptyString,
23
24    #[error("Regex quantifier is too high. Max is {}", QUANTIFIER_LIMIT)]
25    ExceededQuantifierLimit,
26}
27
28impl From<ParseError> for RegexValidationError {
29    fn from(err: ParseError) -> Self {
30        match err {
31            ParseError::InvalidSyntax => Self::InvalidSyntax,
32            ParseError::ExceededDepthLimit => Self::ExceededDepthLimit,
33            ParseError::ExceededQuantifierLimit => Self::ExceededQuantifierLimit,
34        }
35    }
36}
37
38const REGEX_COMPLEXITY_LIMIT: usize = 1_000_000;
39
40/// Checks that a regex pattern is valid for using in an SDS scanner
41pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
42    // This is the same as `validate_and_create_regex`, but removes the actual Regex type
43    // to create a more stable API for external users of the crate.
44    validate_and_create_regex(input).map(|_| ())
45}
46
47pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
48    // The regex crate doesn't directly give you access to the "complexity", but it does
49    // reject if it's too large, so we can binary search to find the limit.
50
51    let converted_pattern = convert_to_rust_regex(input)?;
52
53    let mut low = 1;
54    // Allow it to go a bit higher than the normal limit, since this can be used for debugging
55    // how complex a regex is.
56    let mut high = 10 * REGEX_COMPLEXITY_LIMIT;
57    while low < high {
58        let mid = low + (high - low) / 2;
59        if is_regex_within_complexity_limit(&converted_pattern, mid)? {
60            high = mid;
61        } else {
62            low = mid + 1;
63        }
64    }
65    Ok(low)
66}
67
68pub fn validate_and_create_regex(input: &str) -> Result<meta::Regex, RegexValidationError> {
69    // This validates that the syntax is valid and normalizes behavior.
70    let ast = convert_to_rust_regex_ast(input)?;
71    let pattern = ast.to_string();
72
73    // Ensure that zero-length matches are not possible
74    check_minimum_match_length(&ast, &pattern)?;
75
76    let regex = build_regex(&pattern.to_string(), REGEX_COMPLEXITY_LIMIT)?;
77    Ok(regex)
78}
79
80fn check_minimum_match_length(ast: &Ast, ast_str: &str) -> Result<(), RegexValidationError> {
81    let hir = Translator::new()
82        .translate(ast_str, ast)
83        .map_err(|_| RegexValidationError::InvalidSyntax)?;
84    match hir.properties().minimum_len() {
85        None => {
86            // This pattern cannot ever match
87            Err(RegexValidationError::InvalidSyntax)
88        }
89        Some(min_length) => {
90            if min_length == 0 {
91                Err(RegexValidationError::MatchesEmptyString)
92            } else {
93                Ok(())
94            }
95        }
96    }
97}
98
99fn is_regex_within_complexity_limit(
100    converted_pattern: &str,
101    complexity_limit: usize,
102) -> Result<bool, RegexValidationError> {
103    match build_regex(converted_pattern, complexity_limit) {
104        Ok(_) => Ok(true),
105        Err(err) => match err {
106            RegexValidationError::TooComplex => Ok(false),
107            _ => Err(err),
108        },
109    }
110}
111
112fn build_regex(
113    converted_pattern: &str,
114    complexity_limit: usize,
115) -> Result<meta::Regex, RegexValidationError> {
116    meta::Builder::new()
117        .configure(
118            meta::Config::new()
119                .nfa_size_limit(Some(complexity_limit))
120                // This is purely a performance setting. This is the default, but it might be worth testing this in benchmarks for a better number
121                .hybrid_cache_capacity(2 * (1 << 20)),
122        )
123        .syntax(
124            regex_automata::util::syntax::Config::default()
125                .dot_matches_new_line(false)
126                .unicode(true),
127        )
128        .build(converted_pattern)
129        .map_err(|regex_err| {
130            if regex_err.size_limit().is_some() {
131                RegexValidationError::TooComplex
132            } else {
133                // Internally the `regex` crate does this conversion, so we do it too
134                RegexValidationError::InvalidSyntax
135            }
136        })
137}
138
139#[cfg(test)]
140mod test {
141    use crate::validation::{
142        get_regex_complexity_estimate_very_slow, validate_and_create_regex, validate_regex,
143        RegexValidationError,
144    };
145
146    #[test]
147    fn pattern_matching_empty_string_is_invalid() {
148        // simple case that matches (only) empty string
149        assert_eq!(
150            validate_regex(""),
151            Err(RegexValidationError::MatchesEmptyString)
152        );
153        assert_eq!(
154            // This is an alternation with an empty string for the right side (which matches anything)
155            validate_regex("a|"),
156            Err(RegexValidationError::MatchesEmptyString)
157        );
158
159        // A subset of the regex _can_ match the empty string, as long as the entire pattern does not
160        assert!(validate_regex("(a|)b").is_ok(),);
161    }
162
163    #[test]
164    fn too_complex_pattern_is_rejected() {
165        assert_eq!(
166            validate_regex(".{1000}{1000}"),
167            Err(RegexValidationError::TooComplex)
168        );
169    }
170
171    #[test]
172    fn high_repetition_pattern_is_rejected() {
173        assert_eq!(
174            validate_regex(".{10000}"),
175            Err(RegexValidationError::ExceededQuantifierLimit)
176        );
177    }
178
179    #[test]
180    fn test_invalid_range_quantifiers() {
181        assert_eq!(
182            validate_regex(".{100,1}"),
183            Err(RegexValidationError::InvalidSyntax)
184        );
185    }
186
187    #[test]
188    fn dot_new_line() {
189        let regex = validate_and_create_regex(".").unwrap();
190        assert!(!regex.is_match("\n"));
191    }
192
193    #[test]
194    fn test_complexity() {
195        // These values may change slightly when the `regex` crate is updated. As long as they
196        // are close, the test should be updated, otherwise the complexity limit may need to
197        // be adjusted.
198
199        assert_eq!(get_regex_complexity_estimate_very_slow("x"), Ok(1));
200        assert_eq!(get_regex_complexity_estimate_very_slow("x{1,10}"), Ok(920));
201        assert_eq!(get_regex_complexity_estimate_very_slow("."), Ok(1144));
202        assert_eq!(
203            get_regex_complexity_estimate_very_slow(".{1,10}"),
204            Ok(10536)
205        );
206        assert_eq!(
207            get_regex_complexity_estimate_very_slow(".{1,1000}"),
208            Ok(1_040_136)
209        );
210    }
211
212    #[test]
213    fn test_empty_match_without_empty_string() {
214        assert_eq!(
215            validate_regex("\\bx{0,2}\\b"),
216            Err(RegexValidationError::MatchesEmptyString)
217        );
218    }
219}