dd_sds/
validation.rs

1use crate::normalization::rust_regex_adapter::{
2    QUANTIFIER_LIMIT, convert_to_rust_regex, convert_to_rust_regex_ast,
3};
4use crate::parser::error::ParseError;
5use moka::sync::Cache;
6use once_cell::sync::Lazy;
7use regex_automata::meta::{self};
8use regex_syntax::ast::Ast;
9use regex_syntax::hir::translate::Translator;
10use thiserror::Error;
11
12static REGEX_CACHE: Lazy<Cache<String, Result<(), RegexValidationError>>> =
13    Lazy::new(|| Cache::new(1000));
14
15#[derive(Debug, PartialEq, Eq, Error, Clone)]
16pub enum RegexValidationError {
17    #[error("Invalid regex syntax")]
18    InvalidSyntax,
19
20    #[error("The regex pattern is nested too deeply")]
21    ExceededDepthLimit,
22
23    #[error("The regex has exceeded the complexity limit (try simplifying the regex)")]
24    TooComplex,
25
26    #[error("Regex patterns are not allowed to match an empty string")]
27    MatchesEmptyString,
28
29    #[error("Regex quantifier is too high. Max is {}", QUANTIFIER_LIMIT)]
30    ExceededQuantifierLimit,
31}
32
33#[derive(Debug, PartialEq, Eq, Error, Clone)]
34pub enum RegexPatternCaptureGroupsValidationError {
35    #[error("We only allow one capture group, but {0} were provided")]
36    TooManyCaptureGroups(usize),
37
38    #[error("The capture group '{0}' is not present in the regex")]
39    CaptureGroupNotPresent(String),
40
41    #[error("The targeted capture group must be 'sds_match'")]
42    TargetedCaptureGroupMustBeSdsMatch,
43
44    #[error("The regex is invalid")]
45    InvalidSyntax,
46}
47
48impl From<ParseError> for RegexValidationError {
49    fn from(err: ParseError) -> Self {
50        match err {
51            ParseError::InvalidSyntax => Self::InvalidSyntax,
52            ParseError::ExceededDepthLimit => Self::ExceededDepthLimit,
53            ParseError::ExceededQuantifierLimit => Self::ExceededQuantifierLimit,
54        }
55    }
56}
57
58const REGEX_COMPLEXITY_LIMIT: usize = 1_000_000;
59
60/// Checks that a regex pattern is valid for using in an SDS scanner
61pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
62    // This is the same as `validate_and_create_regex`, but removes the actual Regex type
63    // to create a more stable API for external users of the crate.
64    if let Some(cache_hit) = REGEX_CACHE.get(input) {
65        return cache_hit;
66    }
67
68    let result = validate_and_create_regex(input).map(|_| ());
69
70    REGEX_CACHE.insert(input.to_owned(), result.clone());
71    result
72}
73
74pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
75    // The regex crate doesn't directly give you access to the "complexity", but it does
76    // reject if it's too large, so we can binary search to find the limit.
77
78    let converted_pattern = convert_to_rust_regex(input)?;
79
80    let mut low = 1;
81    // Allow it to go a bit higher than the normal limit, since this can be used for debugging
82    // how complex a regex is.
83    let mut high = 10 * REGEX_COMPLEXITY_LIMIT;
84    while low < high {
85        let mid = low + (high - low) / 2;
86        if is_regex_within_complexity_limit(&converted_pattern, mid)? {
87            high = mid;
88        } else {
89            low = mid + 1;
90        }
91    }
92    Ok(low)
93}
94
95pub fn validate_and_create_regex(input: &str) -> Result<meta::Regex, RegexValidationError> {
96    // This validates that the syntax is valid and normalizes behavior.
97    let ast = convert_to_rust_regex_ast(input)?;
98    let pattern = ast.to_string();
99
100    // Ensure that zero-length matches are not possible
101    check_minimum_match_length(&ast, &pattern)?;
102
103    let regex = build_regex(&pattern.to_string(), REGEX_COMPLEXITY_LIMIT)?;
104    Ok(regex)
105}
106
107fn check_minimum_match_length(ast: &Ast, ast_str: &str) -> Result<(), RegexValidationError> {
108    let hir = Translator::new()
109        .translate(ast_str, ast)
110        .map_err(|_| RegexValidationError::InvalidSyntax)?;
111    match hir.properties().minimum_len() {
112        None => {
113            // This pattern cannot ever match
114            Err(RegexValidationError::InvalidSyntax)
115        }
116        Some(min_length) => {
117            if min_length == 0 {
118                Err(RegexValidationError::MatchesEmptyString)
119            } else {
120                Ok(())
121            }
122        }
123    }
124}
125
126fn is_regex_within_complexity_limit(
127    converted_pattern: &str,
128    complexity_limit: usize,
129) -> Result<bool, RegexValidationError> {
130    match build_regex(converted_pattern, complexity_limit) {
131        Ok(_) => Ok(true),
132        Err(err) => match err {
133            RegexValidationError::TooComplex => Ok(false),
134            _ => Err(err),
135        },
136    }
137}
138
139fn build_regex(
140    converted_pattern: &str,
141    complexity_limit: usize,
142) -> Result<meta::Regex, RegexValidationError> {
143    meta::Builder::new()
144        .configure(
145            meta::Config::new()
146                .nfa_size_limit(Some(complexity_limit))
147                // This is purely a performance setting. This is the default, but it might be worth testing this in benchmarks for a better number
148                .hybrid_cache_capacity(2 * (1 << 20)),
149        )
150        .syntax(
151            regex_automata::util::syntax::Config::default()
152                .dot_matches_new_line(false)
153                .unicode(true),
154        )
155        .build(converted_pattern)
156        .map_err(|regex_err| {
157            if regex_err.size_limit().is_some() {
158                RegexValidationError::TooComplex
159            } else {
160                // Internally the `regex` crate does this conversion, so we do it too
161                RegexValidationError::InvalidSyntax
162            }
163        })
164}
165
166#[cfg(test)]
167mod test {
168    use crate::validation::{
169        RegexValidationError, get_regex_complexity_estimate_very_slow, validate_and_create_regex,
170        validate_regex,
171    };
172
173    #[test]
174    fn pattern_matching_empty_string_is_invalid() {
175        // simple case that matches (only) empty string
176        assert_eq!(
177            validate_regex(""),
178            Err(RegexValidationError::MatchesEmptyString)
179        );
180        assert_eq!(
181            // This is an alternation with an empty string for the right side (which matches anything)
182            validate_regex("a|"),
183            Err(RegexValidationError::MatchesEmptyString)
184        );
185
186        // A subset of the regex _can_ match the empty string, as long as the entire pattern does not
187        assert!(validate_regex("(a|)b").is_ok(),);
188    }
189
190    #[test]
191    fn too_complex_pattern_is_rejected() {
192        assert_eq!(
193            validate_regex(".{1000}{1000}"),
194            Err(RegexValidationError::TooComplex)
195        );
196    }
197
198    #[test]
199    fn high_repetition_pattern_is_rejected() {
200        assert_eq!(
201            validate_regex(".{10000}"),
202            Err(RegexValidationError::ExceededQuantifierLimit)
203        );
204    }
205
206    #[test]
207    fn test_invalid_range_quantifiers() {
208        assert_eq!(
209            validate_regex(".{100,1}"),
210            Err(RegexValidationError::InvalidSyntax)
211        );
212    }
213
214    #[test]
215    fn dot_new_line() {
216        let regex = validate_and_create_regex(".").unwrap();
217        assert!(!regex.is_match("\n"));
218    }
219
220    #[test]
221    fn test_complexity() {
222        // These values may change slightly when the `regex` crate is updated. As long as they
223        // are close, the test should be updated, otherwise the complexity limit may need to
224        // be adjusted.
225
226        assert_eq!(get_regex_complexity_estimate_very_slow("x"), Ok(1));
227        assert_eq!(get_regex_complexity_estimate_very_slow("x{1,10}"), Ok(920));
228        assert_eq!(get_regex_complexity_estimate_very_slow("."), Ok(1144));
229        assert_eq!(
230            get_regex_complexity_estimate_very_slow(".{1,10}"),
231            Ok(10536)
232        );
233        assert_eq!(
234            get_regex_complexity_estimate_very_slow(".{1,1000}"),
235            Ok(1_040_136)
236        );
237    }
238
239    #[test]
240    fn test_empty_match_without_empty_string() {
241        assert_eq!(
242            validate_regex("\\bx{0,2}\\b"),
243            Err(RegexValidationError::MatchesEmptyString)
244        );
245    }
246}