1use crate::normalization::rust_regex_adapter::{
2 QUANTIFIER_LIMIT, convert_to_rust_regex, convert_to_rust_regex_ast,
3};
4use crate::parser::error::ParseError;
5use moka::sync::Cache;
6use once_cell::sync::Lazy;
7use regex_automata::meta::{self};
8use regex_syntax::ast::{Ast, GroupKind};
9use regex_syntax::hir::translate::Translator;
10use thiserror::Error;
11
12static REGEX_CACHE: Lazy<Cache<String, Result<(), RegexValidationError>>> =
13 Lazy::new(|| Cache::new(1000));
14
15#[derive(Debug, PartialEq, Eq, Error, Clone)]
16pub enum RegexValidationError {
17 #[error("Invalid regex syntax")]
18 InvalidSyntax,
19
20 #[error("The regex pattern is nested too deeply")]
21 ExceededDepthLimit,
22
23 #[error("The regex has exceeded the complexity limit (try simplifying the regex)")]
24 TooComplex,
25
26 #[error("Regex patterns are not allowed to match an empty string")]
27 MatchesEmptyString,
28
29 #[error("Regex quantifier is too high. Max is {}", QUANTIFIER_LIMIT)]
30 ExceededQuantifierLimit,
31}
32
33#[derive(Debug, PartialEq, Eq, Error, Clone)]
34pub enum RegexPatternCaptureGroupsValidationError {
35 #[error("We only allow one capture group, but {0} were provided")]
36 TooManyCaptureGroups(usize),
37
38 #[error("The capture group '{0}' is not present in the regex")]
39 CaptureGroupNotPresent(String),
40
41 #[error("The targeted capture group must be 'sds_match'")]
42 TargetedCaptureGroupMustBeSdsMatch,
43
44 #[error("The capture group 'sds_match' cannot match an empty string")]
45 CaptureGroupMatchesEmptyString,
46
47 #[error("The regex is invalid")]
48 InvalidSyntax,
49}
50
51impl From<ParseError> for RegexValidationError {
52 fn from(err: ParseError) -> Self {
53 match err {
54 ParseError::InvalidSyntax => Self::InvalidSyntax,
55 ParseError::ExceededDepthLimit => Self::ExceededDepthLimit,
56 ParseError::ExceededQuantifierLimit => Self::ExceededQuantifierLimit,
57 }
58 }
59}
60
61const REGEX_COMPLEXITY_LIMIT: usize = 1_000_000;
62
63pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
65 if let Some(cache_hit) = REGEX_CACHE.get(input) {
68 return cache_hit;
69 }
70
71 let result = validate_and_create_regex(input).map(|_| ());
72
73 REGEX_CACHE.insert(input.to_owned(), result.clone());
74 result
75}
76
77pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
78 let converted_pattern = convert_to_rust_regex(input)?;
82
83 let mut low = 1;
84 let mut high = 10 * REGEX_COMPLEXITY_LIMIT;
87 while low < high {
88 let mid = low + (high - low) / 2;
89 if is_regex_within_complexity_limit(&converted_pattern, mid)? {
90 high = mid;
91 } else {
92 low = mid + 1;
93 }
94 }
95 Ok(low)
96}
97
98pub fn validate_and_create_regex(input: &str) -> Result<meta::Regex, RegexValidationError> {
99 let ast = convert_to_rust_regex_ast(input)?;
101 let pattern = ast.to_string();
102
103 check_minimum_match_length(&ast, &pattern)?;
105
106 let regex = build_regex(&pattern.to_string(), REGEX_COMPLEXITY_LIMIT)?;
107 Ok(regex)
108}
109
110fn check_minimum_match_length(ast: &Ast, ast_str: &str) -> Result<(), RegexValidationError> {
111 let hir = Translator::new()
112 .translate(ast_str, ast)
113 .map_err(|_| RegexValidationError::InvalidSyntax)?;
114 match hir.properties().minimum_len() {
115 None => {
116 Err(RegexValidationError::InvalidSyntax)
118 }
119 Some(min_length) => {
120 if min_length == 0 {
121 Err(RegexValidationError::MatchesEmptyString)
122 } else {
123 Ok(())
124 }
125 }
126 }
127}
128
129pub fn validate_named_capture_group_minimum_length(
130 pattern: &str,
131 group_name: &str,
132) -> Result<(), RegexPatternCaptureGroupsValidationError> {
133 let ast = convert_to_rust_regex_ast(pattern)
134 .map_err(|_| RegexPatternCaptureGroupsValidationError::InvalidSyntax)?;
135 let matching_groups = collect_named_capture_group_asts(&ast, group_name);
136
137 for group_ast in matching_groups {
138 let group_pattern = group_ast.to_string();
139 let hir = Translator::new()
140 .translate(&group_pattern, group_ast)
141 .map_err(|_| RegexPatternCaptureGroupsValidationError::InvalidSyntax)?;
142 match hir.properties().minimum_len() {
143 None => return Err(RegexPatternCaptureGroupsValidationError::InvalidSyntax),
144 Some(0) => {
145 return Err(
146 RegexPatternCaptureGroupsValidationError::CaptureGroupMatchesEmptyString,
147 );
148 }
149 Some(_) => {}
150 }
151 }
152
153 Ok(())
154}
155
156fn collect_named_capture_group_asts<'a>(ast: &'a Ast, group_name: &str) -> Vec<&'a Ast> {
157 let mut matching_groups = Vec::new();
158
159 match ast {
160 Ast::Concat(concat) => {
161 for sub_ast in &concat.asts {
162 matching_groups.extend(collect_named_capture_group_asts(sub_ast, group_name));
163 }
164 }
165 Ast::Group(group) => {
166 if let GroupKind::CaptureName { name, .. } = &group.kind
167 && name.name == group_name
168 {
169 matching_groups.push(ast);
170 }
171 matching_groups.extend(collect_named_capture_group_asts(
172 group.ast.as_ref(),
173 group_name,
174 ));
175 }
176 Ast::Alternation(alternation) => {
177 for sub_ast in &alternation.asts {
178 matching_groups.extend(collect_named_capture_group_asts(sub_ast, group_name));
179 }
180 }
181 Ast::Repetition(repetition) => {
182 matching_groups.extend(collect_named_capture_group_asts(
183 repetition.ast.as_ref(),
184 group_name,
185 ));
186 }
187 _ => {}
188 }
189
190 matching_groups
191}
192
193fn is_regex_within_complexity_limit(
194 converted_pattern: &str,
195 complexity_limit: usize,
196) -> Result<bool, RegexValidationError> {
197 match build_regex(converted_pattern, complexity_limit) {
198 Ok(_) => Ok(true),
199 Err(err) => match err {
200 RegexValidationError::TooComplex => Ok(false),
201 _ => Err(err),
202 },
203 }
204}
205
206fn build_regex(
207 converted_pattern: &str,
208 complexity_limit: usize,
209) -> Result<meta::Regex, RegexValidationError> {
210 meta::Builder::new()
211 .configure(
212 meta::Config::new()
213 .nfa_size_limit(Some(complexity_limit))
214 .hybrid_cache_capacity(2 * (1 << 20)),
216 )
217 .syntax(
218 regex_automata::util::syntax::Config::default()
219 .dot_matches_new_line(false)
220 .unicode(true),
221 )
222 .build(converted_pattern)
223 .map_err(|regex_err| {
224 if regex_err.size_limit().is_some() {
225 RegexValidationError::TooComplex
226 } else {
227 RegexValidationError::InvalidSyntax
229 }
230 })
231}
232
233#[cfg(test)]
234mod test {
235 use crate::validation::{
236 RegexPatternCaptureGroupsValidationError, RegexValidationError,
237 get_regex_complexity_estimate_very_slow, validate_and_create_regex,
238 validate_named_capture_group_minimum_length, validate_regex,
239 };
240
241 #[test]
242 fn pattern_matching_empty_string_is_invalid() {
243 assert_eq!(
245 validate_regex(""),
246 Err(RegexValidationError::MatchesEmptyString)
247 );
248 assert_eq!(
249 validate_regex("a|"),
251 Err(RegexValidationError::MatchesEmptyString)
252 );
253
254 assert!(validate_regex("(a|)b").is_ok(),);
256 }
257
258 #[test]
259 fn too_complex_pattern_is_rejected() {
260 assert_eq!(
261 validate_regex(".{1000}{1000}"),
262 Err(RegexValidationError::TooComplex)
263 );
264 }
265
266 #[test]
267 fn high_repetition_pattern_is_rejected() {
268 assert_eq!(
269 validate_regex(".{10000}"),
270 Err(RegexValidationError::ExceededQuantifierLimit)
271 );
272 }
273
274 #[test]
275 fn test_invalid_range_quantifiers() {
276 assert_eq!(
277 validate_regex(".{100,1}"),
278 Err(RegexValidationError::InvalidSyntax)
279 );
280 }
281
282 #[test]
283 fn dot_new_line() {
284 let regex = validate_and_create_regex(".").unwrap();
285 assert!(!regex.is_match("\n"));
286 }
287
288 #[test]
289 fn test_complexity() {
290 assert_eq!(get_regex_complexity_estimate_very_slow("x"), Ok(1));
295 assert_eq!(get_regex_complexity_estimate_very_slow("x{1,10}"), Ok(920));
296 assert_eq!(get_regex_complexity_estimate_very_slow("."), Ok(1144));
297 assert_eq!(
298 get_regex_complexity_estimate_very_slow(".{1,10}"),
299 Ok(10536)
300 );
301 assert_eq!(
302 get_regex_complexity_estimate_very_slow(".{1,1000}"),
303 Ok(1_040_136)
304 );
305 }
306
307 #[test]
308 fn test_empty_match_without_empty_string() {
309 assert_eq!(
310 validate_regex("\\bx{0,2}\\b"),
311 Err(RegexValidationError::MatchesEmptyString)
312 );
313 }
314
315 #[test]
316 fn capture_group_minimum_length_rejects_empty() {
317 assert_eq!(
318 validate_named_capture_group_minimum_length("hello (?<sds_match>d*)", "sds_match"),
319 Err(RegexPatternCaptureGroupsValidationError::CaptureGroupMatchesEmptyString),
320 );
321 }
322
323 #[test]
324 fn capture_group_minimum_length_accepts_non_empty() {
325 assert!(
326 validate_named_capture_group_minimum_length("hello (?<sds_match>world)", "sds_match")
327 .is_ok()
328 );
329 }
330}