dd_sds/
validation.rs

1use crate::normalization::rust_regex_adapter::{
2    QUANTIFIER_LIMIT, convert_to_rust_regex, convert_to_rust_regex_ast,
3};
4use crate::parser::error::ParseError;
5use moka::sync::Cache;
6use once_cell::sync::Lazy;
7use regex_automata::meta::{self};
8use regex_syntax::ast::{Ast, GroupKind};
9use regex_syntax::hir::translate::Translator;
10use thiserror::Error;
11
12static REGEX_CACHE: Lazy<Cache<String, Result<(), RegexValidationError>>> =
13    Lazy::new(|| Cache::new(1000));
14
15#[derive(Debug, PartialEq, Eq, Error, Clone)]
16pub enum RegexValidationError {
17    #[error("Invalid regex syntax")]
18    InvalidSyntax,
19
20    #[error("The regex pattern is nested too deeply")]
21    ExceededDepthLimit,
22
23    #[error("The regex has exceeded the complexity limit (try simplifying the regex)")]
24    TooComplex,
25
26    #[error("Regex patterns are not allowed to match an empty string")]
27    MatchesEmptyString,
28
29    #[error("Regex quantifier is too high. Max is {}", QUANTIFIER_LIMIT)]
30    ExceededQuantifierLimit,
31}
32
33#[derive(Debug, PartialEq, Eq, Error, Clone)]
34pub enum RegexPatternCaptureGroupsValidationError {
35    #[error("We only allow one capture group, but {0} were provided")]
36    TooManyCaptureGroups(usize),
37
38    #[error("The capture group '{0}' is not present in the regex")]
39    CaptureGroupNotPresent(String),
40
41    #[error("The targeted capture group must be 'sds_match'")]
42    TargetedCaptureGroupMustBeSdsMatch,
43
44    #[error("The capture group 'sds_match' cannot match an empty string")]
45    CaptureGroupMatchesEmptyString,
46
47    #[error("The regex is invalid")]
48    InvalidSyntax,
49}
50
51impl From<ParseError> for RegexValidationError {
52    fn from(err: ParseError) -> Self {
53        match err {
54            ParseError::InvalidSyntax => Self::InvalidSyntax,
55            ParseError::ExceededDepthLimit => Self::ExceededDepthLimit,
56            ParseError::ExceededQuantifierLimit => Self::ExceededQuantifierLimit,
57        }
58    }
59}
60
61const REGEX_COMPLEXITY_LIMIT: usize = 1_000_000;
62
63/// Checks that a regex pattern is valid for using in an SDS scanner
64pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
65    // This is the same as `validate_and_create_regex`, but removes the actual Regex type
66    // to create a more stable API for external users of the crate.
67    if let Some(cache_hit) = REGEX_CACHE.get(input) {
68        return cache_hit;
69    }
70
71    let result = validate_and_create_regex(input).map(|_| ());
72
73    REGEX_CACHE.insert(input.to_owned(), result.clone());
74    result
75}
76
77pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
78    // The regex crate doesn't directly give you access to the "complexity", but it does
79    // reject if it's too large, so we can binary search to find the limit.
80
81    let converted_pattern = convert_to_rust_regex(input)?;
82
83    let mut low = 1;
84    // Allow it to go a bit higher than the normal limit, since this can be used for debugging
85    // how complex a regex is.
86    let mut high = 10 * REGEX_COMPLEXITY_LIMIT;
87    while low < high {
88        let mid = low + (high - low) / 2;
89        if is_regex_within_complexity_limit(&converted_pattern, mid)? {
90            high = mid;
91        } else {
92            low = mid + 1;
93        }
94    }
95    Ok(low)
96}
97
98pub fn validate_and_create_regex(input: &str) -> Result<meta::Regex, RegexValidationError> {
99    // This validates that the syntax is valid and normalizes behavior.
100    let ast = convert_to_rust_regex_ast(input)?;
101    let pattern = ast.to_string();
102
103    // Ensure that zero-length matches are not possible
104    check_minimum_match_length(&ast, &pattern)?;
105
106    let regex = build_regex(&pattern.to_string(), REGEX_COMPLEXITY_LIMIT)?;
107    Ok(regex)
108}
109
110fn check_minimum_match_length(ast: &Ast, ast_str: &str) -> Result<(), RegexValidationError> {
111    let hir = Translator::new()
112        .translate(ast_str, ast)
113        .map_err(|_| RegexValidationError::InvalidSyntax)?;
114    match hir.properties().minimum_len() {
115        None => {
116            // This pattern cannot ever match
117            Err(RegexValidationError::InvalidSyntax)
118        }
119        Some(min_length) => {
120            if min_length == 0 {
121                Err(RegexValidationError::MatchesEmptyString)
122            } else {
123                Ok(())
124            }
125        }
126    }
127}
128
129pub fn validate_named_capture_group_minimum_length(
130    pattern: &str,
131    group_name: &str,
132) -> Result<(), RegexPatternCaptureGroupsValidationError> {
133    let ast = convert_to_rust_regex_ast(pattern)
134        .map_err(|_| RegexPatternCaptureGroupsValidationError::InvalidSyntax)?;
135    let matching_groups = collect_named_capture_group_asts(&ast, group_name);
136
137    for group_ast in matching_groups {
138        let group_pattern = group_ast.to_string();
139        let hir = Translator::new()
140            .translate(&group_pattern, group_ast)
141            .map_err(|_| RegexPatternCaptureGroupsValidationError::InvalidSyntax)?;
142        match hir.properties().minimum_len() {
143            None => return Err(RegexPatternCaptureGroupsValidationError::InvalidSyntax),
144            Some(0) => {
145                return Err(
146                    RegexPatternCaptureGroupsValidationError::CaptureGroupMatchesEmptyString,
147                );
148            }
149            Some(_) => {}
150        }
151    }
152
153    Ok(())
154}
155
156fn collect_named_capture_group_asts<'a>(ast: &'a Ast, group_name: &str) -> Vec<&'a Ast> {
157    let mut matching_groups = Vec::new();
158
159    match ast {
160        Ast::Concat(concat) => {
161            for sub_ast in &concat.asts {
162                matching_groups.extend(collect_named_capture_group_asts(sub_ast, group_name));
163            }
164        }
165        Ast::Group(group) => {
166            if let GroupKind::CaptureName { name, .. } = &group.kind
167                && name.name == group_name
168            {
169                matching_groups.push(ast);
170            }
171            matching_groups.extend(collect_named_capture_group_asts(
172                group.ast.as_ref(),
173                group_name,
174            ));
175        }
176        Ast::Alternation(alternation) => {
177            for sub_ast in &alternation.asts {
178                matching_groups.extend(collect_named_capture_group_asts(sub_ast, group_name));
179            }
180        }
181        Ast::Repetition(repetition) => {
182            matching_groups.extend(collect_named_capture_group_asts(
183                repetition.ast.as_ref(),
184                group_name,
185            ));
186        }
187        _ => {}
188    }
189
190    matching_groups
191}
192
193fn is_regex_within_complexity_limit(
194    converted_pattern: &str,
195    complexity_limit: usize,
196) -> Result<bool, RegexValidationError> {
197    match build_regex(converted_pattern, complexity_limit) {
198        Ok(_) => Ok(true),
199        Err(err) => match err {
200            RegexValidationError::TooComplex => Ok(false),
201            _ => Err(err),
202        },
203    }
204}
205
206fn build_regex(
207    converted_pattern: &str,
208    complexity_limit: usize,
209) -> Result<meta::Regex, RegexValidationError> {
210    meta::Builder::new()
211        .configure(
212            meta::Config::new()
213                .nfa_size_limit(Some(complexity_limit))
214                // This is purely a performance setting. This is the default, but it might be worth testing this in benchmarks for a better number
215                .hybrid_cache_capacity(2 * (1 << 20)),
216        )
217        .syntax(
218            regex_automata::util::syntax::Config::default()
219                .dot_matches_new_line(false)
220                .unicode(true),
221        )
222        .build(converted_pattern)
223        .map_err(|regex_err| {
224            if regex_err.size_limit().is_some() {
225                RegexValidationError::TooComplex
226            } else {
227                // Internally the `regex` crate does this conversion, so we do it too
228                RegexValidationError::InvalidSyntax
229            }
230        })
231}
232
233#[cfg(test)]
234mod test {
235    use crate::validation::{
236        RegexPatternCaptureGroupsValidationError, RegexValidationError,
237        get_regex_complexity_estimate_very_slow, validate_and_create_regex,
238        validate_named_capture_group_minimum_length, validate_regex,
239    };
240
241    #[test]
242    fn pattern_matching_empty_string_is_invalid() {
243        // simple case that matches (only) empty string
244        assert_eq!(
245            validate_regex(""),
246            Err(RegexValidationError::MatchesEmptyString)
247        );
248        assert_eq!(
249            // This is an alternation with an empty string for the right side (which matches anything)
250            validate_regex("a|"),
251            Err(RegexValidationError::MatchesEmptyString)
252        );
253
254        // A subset of the regex _can_ match the empty string, as long as the entire pattern does not
255        assert!(validate_regex("(a|)b").is_ok(),);
256    }
257
258    #[test]
259    fn too_complex_pattern_is_rejected() {
260        assert_eq!(
261            validate_regex(".{1000}{1000}"),
262            Err(RegexValidationError::TooComplex)
263        );
264    }
265
266    #[test]
267    fn high_repetition_pattern_is_rejected() {
268        assert_eq!(
269            validate_regex(".{10000}"),
270            Err(RegexValidationError::ExceededQuantifierLimit)
271        );
272    }
273
274    #[test]
275    fn test_invalid_range_quantifiers() {
276        assert_eq!(
277            validate_regex(".{100,1}"),
278            Err(RegexValidationError::InvalidSyntax)
279        );
280    }
281
282    #[test]
283    fn dot_new_line() {
284        let regex = validate_and_create_regex(".").unwrap();
285        assert!(!regex.is_match("\n"));
286    }
287
288    #[test]
289    fn test_complexity() {
290        // These values may change slightly when the `regex` crate is updated. As long as they
291        // are close, the test should be updated, otherwise the complexity limit may need to
292        // be adjusted.
293
294        assert_eq!(get_regex_complexity_estimate_very_slow("x"), Ok(1));
295        assert_eq!(get_regex_complexity_estimate_very_slow("x{1,10}"), Ok(920));
296        assert_eq!(get_regex_complexity_estimate_very_slow("."), Ok(1144));
297        assert_eq!(
298            get_regex_complexity_estimate_very_slow(".{1,10}"),
299            Ok(10536)
300        );
301        assert_eq!(
302            get_regex_complexity_estimate_very_slow(".{1,1000}"),
303            Ok(1_040_136)
304        );
305    }
306
307    #[test]
308    fn test_empty_match_without_empty_string() {
309        assert_eq!(
310            validate_regex("\\bx{0,2}\\b"),
311            Err(RegexValidationError::MatchesEmptyString)
312        );
313    }
314
315    #[test]
316    fn capture_group_minimum_length_rejects_empty() {
317        assert_eq!(
318            validate_named_capture_group_minimum_length("hello (?<sds_match>d*)", "sds_match"),
319            Err(RegexPatternCaptureGroupsValidationError::CaptureGroupMatchesEmptyString),
320        );
321    }
322
323    #[test]
324    fn capture_group_minimum_length_accepts_non_empty() {
325        assert!(
326            validate_named_capture_group_minimum_length("hello (?<sds_match>world)", "sds_match")
327                .is_ok()
328        );
329    }
330}