1use crate::normalization::rust_regex_adapter::{
2 QUANTIFIER_LIMIT, convert_to_rust_regex, convert_to_rust_regex_ast,
3};
4use crate::parser::error::ParseError;
5use moka::sync::Cache;
6use once_cell::sync::Lazy;
7use regex_automata::meta::{self};
8use regex_syntax::ast::Ast;
9use regex_syntax::hir::translate::Translator;
10use thiserror::Error;
11
12static REGEX_CACHE: Lazy<Cache<String, Result<(), RegexValidationError>>> =
13 Lazy::new(|| Cache::new(1000));
14
15#[derive(Debug, PartialEq, Eq, Error, Clone)]
16pub enum RegexValidationError {
17 #[error("Invalid regex syntax")]
18 InvalidSyntax,
19
20 #[error("The regex pattern is nested too deeply")]
21 ExceededDepthLimit,
22
23 #[error("The regex has exceeded the complexity limit (try simplifying the regex)")]
24 TooComplex,
25
26 #[error("Regex patterns are not allowed to match an empty string")]
27 MatchesEmptyString,
28
29 #[error("Regex quantifier is too high. Max is {}", QUANTIFIER_LIMIT)]
30 ExceededQuantifierLimit,
31}
32
33#[derive(Debug, PartialEq, Eq, Error, Clone)]
34pub enum RegexPatternCaptureGroupsValidationError {
35 #[error("We only allow one capture group, but {0} were provided")]
36 TooManyCaptureGroups(usize),
37
38 #[error("The capture group '{0}' is not present in the regex")]
39 CaptureGroupNotPresent(String),
40
41 #[error("The targeted capture group must be 'sds_match'")]
42 TargetedCaptureGroupMustBeSdsMatch,
43
44 #[error("The regex is invalid")]
45 InvalidSyntax,
46}
47
48impl From<ParseError> for RegexValidationError {
49 fn from(err: ParseError) -> Self {
50 match err {
51 ParseError::InvalidSyntax => Self::InvalidSyntax,
52 ParseError::ExceededDepthLimit => Self::ExceededDepthLimit,
53 ParseError::ExceededQuantifierLimit => Self::ExceededQuantifierLimit,
54 }
55 }
56}
57
58const REGEX_COMPLEXITY_LIMIT: usize = 1_000_000;
59
60pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
62 if let Some(cache_hit) = REGEX_CACHE.get(input) {
65 return cache_hit;
66 }
67
68 let result = validate_and_create_regex(input).map(|_| ());
69
70 REGEX_CACHE.insert(input.to_owned(), result.clone());
71 result
72}
73
74pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
75 let converted_pattern = convert_to_rust_regex(input)?;
79
80 let mut low = 1;
81 let mut high = 10 * REGEX_COMPLEXITY_LIMIT;
84 while low < high {
85 let mid = low + (high - low) / 2;
86 if is_regex_within_complexity_limit(&converted_pattern, mid)? {
87 high = mid;
88 } else {
89 low = mid + 1;
90 }
91 }
92 Ok(low)
93}
94
95pub fn validate_and_create_regex(input: &str) -> Result<meta::Regex, RegexValidationError> {
96 let ast = convert_to_rust_regex_ast(input)?;
98 let pattern = ast.to_string();
99
100 check_minimum_match_length(&ast, &pattern)?;
102
103 let regex = build_regex(&pattern.to_string(), REGEX_COMPLEXITY_LIMIT)?;
104 Ok(regex)
105}
106
107fn check_minimum_match_length(ast: &Ast, ast_str: &str) -> Result<(), RegexValidationError> {
108 let hir = Translator::new()
109 .translate(ast_str, ast)
110 .map_err(|_| RegexValidationError::InvalidSyntax)?;
111 match hir.properties().minimum_len() {
112 None => {
113 Err(RegexValidationError::InvalidSyntax)
115 }
116 Some(min_length) => {
117 if min_length == 0 {
118 Err(RegexValidationError::MatchesEmptyString)
119 } else {
120 Ok(())
121 }
122 }
123 }
124}
125
126fn is_regex_within_complexity_limit(
127 converted_pattern: &str,
128 complexity_limit: usize,
129) -> Result<bool, RegexValidationError> {
130 match build_regex(converted_pattern, complexity_limit) {
131 Ok(_) => Ok(true),
132 Err(err) => match err {
133 RegexValidationError::TooComplex => Ok(false),
134 _ => Err(err),
135 },
136 }
137}
138
139fn build_regex(
140 converted_pattern: &str,
141 complexity_limit: usize,
142) -> Result<meta::Regex, RegexValidationError> {
143 meta::Builder::new()
144 .configure(
145 meta::Config::new()
146 .nfa_size_limit(Some(complexity_limit))
147 .hybrid_cache_capacity(2 * (1 << 20)),
149 )
150 .syntax(
151 regex_automata::util::syntax::Config::default()
152 .dot_matches_new_line(false)
153 .unicode(true),
154 )
155 .build(converted_pattern)
156 .map_err(|regex_err| {
157 if regex_err.size_limit().is_some() {
158 RegexValidationError::TooComplex
159 } else {
160 RegexValidationError::InvalidSyntax
162 }
163 })
164}
165
166#[cfg(test)]
167mod test {
168 use crate::validation::{
169 RegexValidationError, get_regex_complexity_estimate_very_slow, validate_and_create_regex,
170 validate_regex,
171 };
172
173 #[test]
174 fn pattern_matching_empty_string_is_invalid() {
175 assert_eq!(
177 validate_regex(""),
178 Err(RegexValidationError::MatchesEmptyString)
179 );
180 assert_eq!(
181 validate_regex("a|"),
183 Err(RegexValidationError::MatchesEmptyString)
184 );
185
186 assert!(validate_regex("(a|)b").is_ok(),);
188 }
189
190 #[test]
191 fn too_complex_pattern_is_rejected() {
192 assert_eq!(
193 validate_regex(".{1000}{1000}"),
194 Err(RegexValidationError::TooComplex)
195 );
196 }
197
198 #[test]
199 fn high_repetition_pattern_is_rejected() {
200 assert_eq!(
201 validate_regex(".{10000}"),
202 Err(RegexValidationError::ExceededQuantifierLimit)
203 );
204 }
205
206 #[test]
207 fn test_invalid_range_quantifiers() {
208 assert_eq!(
209 validate_regex(".{100,1}"),
210 Err(RegexValidationError::InvalidSyntax)
211 );
212 }
213
214 #[test]
215 fn dot_new_line() {
216 let regex = validate_and_create_regex(".").unwrap();
217 assert!(!regex.is_match("\n"));
218 }
219
220 #[test]
221 fn test_complexity() {
222 assert_eq!(get_regex_complexity_estimate_very_slow("x"), Ok(1));
227 assert_eq!(get_regex_complexity_estimate_very_slow("x{1,10}"), Ok(920));
228 assert_eq!(get_regex_complexity_estimate_very_slow("."), Ok(1144));
229 assert_eq!(
230 get_regex_complexity_estimate_very_slow(".{1,10}"),
231 Ok(10536)
232 );
233 assert_eq!(
234 get_regex_complexity_estimate_very_slow(".{1,1000}"),
235 Ok(1_040_136)
236 );
237 }
238
239 #[test]
240 fn test_empty_match_without_empty_string() {
241 assert_eq!(
242 validate_regex("\\bx{0,2}\\b"),
243 Err(RegexValidationError::MatchesEmptyString)
244 );
245 }
246}