dd_sds/
validation.rs

1use crate::normalization::rust_regex_adapter::{
2    convert_to_rust_regex, convert_to_rust_regex_ast, QUANTIFIER_LIMIT,
3};
4use crate::parser::error::ParseError;
5use moka::sync::Cache;
6use once_cell::sync::Lazy;
7use regex_automata::meta::{self};
8use regex_syntax::ast::Ast;
9use regex_syntax::hir::translate::Translator;
10use thiserror::Error;
11
12static REGEX_CACHE: Lazy<Cache<String, Result<(), RegexValidationError>>> =
13    Lazy::new(|| Cache::new(1000));
14
15#[derive(Debug, PartialEq, Eq, Error, Clone)]
16pub enum RegexValidationError {
17    #[error("Invalid regex syntax")]
18    InvalidSyntax,
19
20    #[error("The regex pattern is nested too deeply")]
21    ExceededDepthLimit,
22
23    #[error("The regex has exceeded the complexity limit (try simplifying the regex)")]
24    TooComplex,
25
26    #[error("Regex patterns are not allowed to match an empty string")]
27    MatchesEmptyString,
28
29    #[error("Regex quantifier is too high. Max is {}", QUANTIFIER_LIMIT)]
30    ExceededQuantifierLimit,
31}
32
33impl From<ParseError> for RegexValidationError {
34    fn from(err: ParseError) -> Self {
35        match err {
36            ParseError::InvalidSyntax => Self::InvalidSyntax,
37            ParseError::ExceededDepthLimit => Self::ExceededDepthLimit,
38            ParseError::ExceededQuantifierLimit => Self::ExceededQuantifierLimit,
39        }
40    }
41}
42
43const REGEX_COMPLEXITY_LIMIT: usize = 1_000_000;
44
45/// Checks that a regex pattern is valid for using in an SDS scanner
46pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
47    // This is the same as `validate_and_create_regex`, but removes the actual Regex type
48    // to create a more stable API for external users of the crate.
49    if let Some(cache_hit) = REGEX_CACHE.get(input) {
50        return cache_hit;
51    }
52
53    let result = validate_and_create_regex(input).map(|_| ());
54
55    REGEX_CACHE.insert(input.to_owned(), result.clone());
56    result
57}
58
59pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
60    // The regex crate doesn't directly give you access to the "complexity", but it does
61    // reject if it's too large, so we can binary search to find the limit.
62
63    let converted_pattern = convert_to_rust_regex(input)?;
64
65    let mut low = 1;
66    // Allow it to go a bit higher than the normal limit, since this can be used for debugging
67    // how complex a regex is.
68    let mut high = 10 * REGEX_COMPLEXITY_LIMIT;
69    while low < high {
70        let mid = low + (high - low) / 2;
71        if is_regex_within_complexity_limit(&converted_pattern, mid)? {
72            high = mid;
73        } else {
74            low = mid + 1;
75        }
76    }
77    Ok(low)
78}
79
80pub fn validate_and_create_regex(input: &str) -> Result<meta::Regex, RegexValidationError> {
81    // This validates that the syntax is valid and normalizes behavior.
82    let ast = convert_to_rust_regex_ast(input)?;
83    let pattern = ast.to_string();
84
85    // Ensure that zero-length matches are not possible
86    check_minimum_match_length(&ast, &pattern)?;
87
88    let regex = build_regex(&pattern.to_string(), REGEX_COMPLEXITY_LIMIT)?;
89    Ok(regex)
90}
91
92fn check_minimum_match_length(ast: &Ast, ast_str: &str) -> Result<(), RegexValidationError> {
93    let hir = Translator::new()
94        .translate(ast_str, ast)
95        .map_err(|_| RegexValidationError::InvalidSyntax)?;
96    match hir.properties().minimum_len() {
97        None => {
98            // This pattern cannot ever match
99            Err(RegexValidationError::InvalidSyntax)
100        }
101        Some(min_length) => {
102            if min_length == 0 {
103                Err(RegexValidationError::MatchesEmptyString)
104            } else {
105                Ok(())
106            }
107        }
108    }
109}
110
111fn is_regex_within_complexity_limit(
112    converted_pattern: &str,
113    complexity_limit: usize,
114) -> Result<bool, RegexValidationError> {
115    match build_regex(converted_pattern, complexity_limit) {
116        Ok(_) => Ok(true),
117        Err(err) => match err {
118            RegexValidationError::TooComplex => Ok(false),
119            _ => Err(err),
120        },
121    }
122}
123
124fn build_regex(
125    converted_pattern: &str,
126    complexity_limit: usize,
127) -> Result<meta::Regex, RegexValidationError> {
128    meta::Builder::new()
129        .configure(
130            meta::Config::new()
131                .nfa_size_limit(Some(complexity_limit))
132                // This is purely a performance setting. This is the default, but it might be worth testing this in benchmarks for a better number
133                .hybrid_cache_capacity(2 * (1 << 20)),
134        )
135        .syntax(
136            regex_automata::util::syntax::Config::default()
137                .dot_matches_new_line(false)
138                .unicode(true),
139        )
140        .build(converted_pattern)
141        .map_err(|regex_err| {
142            if regex_err.size_limit().is_some() {
143                RegexValidationError::TooComplex
144            } else {
145                // Internally the `regex` crate does this conversion, so we do it too
146                RegexValidationError::InvalidSyntax
147            }
148        })
149}
150
151#[cfg(test)]
152mod test {
153    use crate::validation::{
154        get_regex_complexity_estimate_very_slow, validate_and_create_regex, validate_regex,
155        RegexValidationError,
156    };
157
158    #[test]
159    fn pattern_matching_empty_string_is_invalid() {
160        // simple case that matches (only) empty string
161        assert_eq!(
162            validate_regex(""),
163            Err(RegexValidationError::MatchesEmptyString)
164        );
165        assert_eq!(
166            // This is an alternation with an empty string for the right side (which matches anything)
167            validate_regex("a|"),
168            Err(RegexValidationError::MatchesEmptyString)
169        );
170
171        // A subset of the regex _can_ match the empty string, as long as the entire pattern does not
172        assert!(validate_regex("(a|)b").is_ok(),);
173    }
174
175    #[test]
176    fn too_complex_pattern_is_rejected() {
177        assert_eq!(
178            validate_regex(".{1000}{1000}"),
179            Err(RegexValidationError::TooComplex)
180        );
181    }
182
183    #[test]
184    fn high_repetition_pattern_is_rejected() {
185        assert_eq!(
186            validate_regex(".{10000}"),
187            Err(RegexValidationError::ExceededQuantifierLimit)
188        );
189    }
190
191    #[test]
192    fn test_invalid_range_quantifiers() {
193        assert_eq!(
194            validate_regex(".{100,1}"),
195            Err(RegexValidationError::InvalidSyntax)
196        );
197    }
198
199    #[test]
200    fn dot_new_line() {
201        let regex = validate_and_create_regex(".").unwrap();
202        assert!(!regex.is_match("\n"));
203    }
204
205    #[test]
206    fn test_complexity() {
207        // These values may change slightly when the `regex` crate is updated. As long as they
208        // are close, the test should be updated, otherwise the complexity limit may need to
209        // be adjusted.
210
211        assert_eq!(get_regex_complexity_estimate_very_slow("x"), Ok(1));
212        assert_eq!(get_regex_complexity_estimate_very_slow("x{1,10}"), Ok(920));
213        assert_eq!(get_regex_complexity_estimate_very_slow("."), Ok(1144));
214        assert_eq!(
215            get_regex_complexity_estimate_very_slow(".{1,10}"),
216            Ok(10536)
217        );
218        assert_eq!(
219            get_regex_complexity_estimate_very_slow(".{1,1000}"),
220            Ok(1_040_136)
221        );
222    }
223
224    #[test]
225    fn test_empty_match_without_empty_string() {
226        assert_eq!(
227            validate_regex("\\bx{0,2}\\b"),
228            Err(RegexValidationError::MatchesEmptyString)
229        );
230    }
231}