dd_sds/scanner/regex_rule/
config.rs

1use crate::proximity_keywords::compile_keywords_proximity_config;
2use crate::scanner::config::RuleConfig;
3use crate::scanner::metrics::RuleMetrics;
4use crate::scanner::regex_rule::compiled::RegexCompiledRule;
5use crate::scanner::regex_rule::regex_store::get_memoized_regex;
6use crate::validation::validate_and_create_regex;
7use crate::{CompiledRule, CreateScannerError, Labels};
8use serde::{Deserialize, Serialize};
9use serde_with::DefaultOnNull;
10use serde_with::serde_as;
11use std::sync::Arc;
12use strum::{AsRefStr, EnumIter};
13
/// Look-ahead character count given to a [`ProximityKeywordsConfig`] that has to be created
/// from scratch, e.g. when keywords are added to a rule that has no proximity configuration yet.
pub const DEFAULT_KEYWORD_LOOKAHEAD: usize = 30;
15
/// Serializable configuration of a regex-based scanning rule.
///
/// It is turned into a compiled rule through its `RuleConfig` implementation.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RegexRuleConfig {
    /// Regex pattern source; validated and compiled when the rule is compiled.
    pub pattern: String,
    /// Optional proximity-keyword settings; compiled together with the rule when present.
    pub proximity_keywords: Option<ProximityKeywordsConfig>,
    /// Optional secondary validator; compiled together with the rule when present.
    pub validator: Option<SecondaryValidator>,
    /// Rule-level labels, combined with the scanner labels when the rule is compiled.
    /// A missing or `null` value deserializes to the default labels.
    #[serde_as(deserialize_as = "DefaultOnNull")]
    #[serde(default)]
    pub labels: Labels,
    /// Optional capture group name, forwarded unchanged to the compiled rule.
    // NOTE(review): presumably the name of a named capture group inside `pattern` that
    // restricts the reported match — confirm against the compiled rule.
    pub pattern_capture_group: Option<String>,
}
27
28impl RegexRuleConfig {
29    pub fn new(pattern: &str) -> Self {
30        #[allow(deprecated)]
31        Self {
32            pattern: pattern.to_owned(),
33            proximity_keywords: None,
34            validator: None,
35            labels: Labels::default(),
36            pattern_capture_group: None,
37        }
38    }
39
40    pub fn with_pattern(&self, pattern: &str) -> Self {
41        self.mutate_clone(|x| x.pattern = pattern.to_string())
42    }
43
44    pub fn with_proximity_keywords(&self, proximity_keywords: ProximityKeywordsConfig) -> Self {
45        self.mutate_clone(|x| x.proximity_keywords = Some(proximity_keywords))
46    }
47
48    pub fn with_labels(&self, labels: Labels) -> Self {
49        self.mutate_clone(|x| x.labels = labels)
50    }
51
52    pub fn with_pattern_capture_group(&self, pattern_capture_group: &str) -> Self {
53        self.mutate_clone(|x| x.pattern_capture_group = Some(pattern_capture_group.to_string()))
54    }
55
56    pub fn build(&self) -> Arc<dyn RuleConfig> {
57        Arc::new(self.clone())
58    }
59
60    fn mutate_clone(&self, modify: impl FnOnce(&mut Self)) -> Self {
61        let mut clone = self.clone();
62        modify(&mut clone);
63        clone
64    }
65
66    pub fn with_included_keywords(
67        &self,
68        keywords: impl IntoIterator<Item = impl AsRef<str>>,
69    ) -> Self {
70        let mut this = self.clone();
71        let mut config = self.get_or_create_proximity_keywords_config();
72        config.included_keywords = keywords
73            .into_iter()
74            .map(|x| x.as_ref().to_string())
75            .collect::<Vec<_>>();
76        this.proximity_keywords = Some(config);
77        this
78    }
79
80    pub fn with_validator(&self, validator: Option<SecondaryValidator>) -> Self {
81        let mut this = self.clone();
82        this.validator = validator;
83        this
84    }
85
86    fn get_or_create_proximity_keywords_config(&self) -> ProximityKeywordsConfig {
87        self.proximity_keywords
88            .clone()
89            .unwrap_or_else(|| ProximityKeywordsConfig {
90                look_ahead_character_count: DEFAULT_KEYWORD_LOOKAHEAD,
91                included_keywords: vec![],
92                excluded_keywords: vec![],
93            })
94    }
95}
96
97impl RuleConfig for RegexRuleConfig {
98    fn convert_to_compiled_rule(
99        &self,
100        rule_index: usize,
101        scanner_labels: Labels,
102    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
103        let regex = get_memoized_regex(&self.pattern, validate_and_create_regex)?;
104
105        let rule_labels = scanner_labels.clone_with_labels(self.labels.clone());
106
107        let (included_keywords, excluded_keywords) = self
108            .proximity_keywords
109            .as_ref()
110            .map(|config| compile_keywords_proximity_config(config, &rule_labels))
111            .unwrap_or(Ok((None, None)))?;
112
113        Ok(Box::new(RegexCompiledRule {
114            rule_index,
115            regex,
116            included_keywords,
117            excluded_keywords,
118            validator: self.validator.clone().map(|x| x.compile()),
119            metrics: RuleMetrics::new(&rule_labels),
120            pattern_capture_group: self.pattern_capture_group.clone(),
121        }))
122    }
123}
124
/// Proximity-keyword settings attached to a regex rule.
///
/// Only `look_ahead_character_count` is required when deserializing; both keyword lists
/// fall back to an empty list when they are missing or `null`.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct ProximityKeywordsConfig {
    /// Number of characters considered around a match when looking for keywords.
    // NOTE(review): the exact window semantics live in the proximity-keywords module — confirm there.
    pub look_ahead_character_count: usize,

    /// Keywords to include; empty when missing or `null` in the input.
    #[serde_as(deserialize_as = "DefaultOnNull")]
    #[serde(default)]
    pub included_keywords: Vec<String>,

    /// Keywords to exclude; empty when missing or `null` in the input.
    #[serde_as(deserialize_as = "DefaultOnNull")]
    #[serde(default)]
    pub excluded_keywords: Vec<String>,
}
138
/// Secondary validators that can be attached to a regex rule.
///
/// Serialized as an internally tagged enum: the variant name is stored in the `type` field.
///
/// Variants must stay in alphabetical order of their names; the
/// `test_secondary_validator_are_sorted` test in this file enforces it.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, EnumIter, AsRefStr)]
#[serde(tag = "type")]
pub enum SecondaryValidator {
    AbaRtnChecksum,
    BrazilianCnpjChecksum,
    BrazilianCpfChecksum,
    BtcChecksum,
    BulgarianEGNChecksum,
    ChineseIdChecksum,
    CoordinationNumberChecksum,
    CzechPersonalIdentificationNumberChecksum,
    CzechTaxIdentificationNumberChecksum,
    DutchBsnChecksum,
    DutchPassportChecksum,
    EntropyCheck,
    EthereumChecksum,
    FinnishHetuChecksum,
    FranceNifChecksum,
    FranceSsnChecksum,
    GermanIdsChecksum,
    GermanSvnrChecksum,
    GithubTokenChecksum,
    GreekTinChecksum,
    HungarianTinChecksum,
    IbanChecker,
    IrishPpsChecksum,
    ItalianNationalIdChecksum,
    /// The only variant carrying data: the header/claim requirements to check.
    JwtClaimsValidator { config: JwtClaimsValidatorConfig },
    JwtExpirationChecker,
    LatviaNationalIdChecksum,
    LithuanianPersonalIdentificationNumberChecksum,
    LuhnChecksum,
    LuxembourgIndividualNINChecksum,
    Mod11_10checksum,
    Mod11_2checksum,
    Mod1271_36Checksum,
    Mod27_26checksum,
    Mod37_2checksum,
    Mod37_36checksum,
    Mod661_26checksum,
    Mod97_10checksum,
    MoneroAddress,
    NhsCheckDigit,
    NirChecksum,
    PolishNationalIdChecksum,
    PolishNipChecksum,
    PortugueseTaxIdChecksum,
    RodneCisloNumberChecksum,
    RomanianPersonalNumericCode,
    SlovenianPINChecksum,
    SpanishDniChecksum,
    SpanishNussChecksum,
    SwedenPINChecksum,
}
193
/// Requirement placed on a single JWT header or claim.
///
/// Serialized as an adjacently tagged enum: the variant name goes in `type` and, for
/// variants with a payload, the payload goes in `config`.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
#[serde(tag = "type", content = "config")]
pub enum ClaimRequirement {
    /// Just check that the claim exists
    Present,
    /// Check that the claim exists and has an exact value
    ExactValue(String),
    /// Check that the claim exists and matches a regex pattern
    RegexMatch(String),
}
204
/// Configuration of the `JwtClaimsValidator` secondary validator.
///
/// Both maps are keyed by header/claim name and default to empty when absent from the input.
/// `BTreeMap` keeps the keys sorted, which makes the serialized output stable.
#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq)]
pub struct JwtClaimsValidatorConfig {
    /// Requirements on JWT headers, keyed by header name.
    #[serde(default)]
    pub required_headers: std::collections::BTreeMap<String, ClaimRequirement>,
    /// Requirements on JWT claims, keyed by claim name.
    #[serde(default)]
    pub required_claims: std::collections::BTreeMap<String, ClaimRequirement>,
}
212
/// Unit tests for the regex rule configuration types and their serde behaviour.
#[cfg(test)]
mod test {
    use crate::{AwsType, CustomHttpConfig, MatchValidationType, RootRuleConfig};
    use std::collections::BTreeMap;
    use strum::IntoEnumIterator;

    use super::*;

    #[test]
    fn should_override_pattern() {
        let rule_config = RegexRuleConfig::new("123").with_pattern("456");
        assert_eq!(rule_config.pattern, "456");
    }

    #[test]
    #[allow(deprecated)]
    fn should_have_default() {
        let rule_config = RegexRuleConfig::new("123");
        assert_eq!(
            rule_config,
            RegexRuleConfig {
                pattern: "123".to_string(),
                proximity_keywords: None,
                validator: None,
                labels: Labels::empty(),
                pattern_capture_group: None,
            }
        );
    }

    #[test]
    fn should_use_capture_group() {
        let rule_config = RegexRuleConfig::new("hey (?<capture_group>world)")
            .with_pattern_capture_group("capture_group");
        assert_eq!(
            rule_config,
            RegexRuleConfig {
                pattern: "hey (?<capture_group>world)".to_string(),
                proximity_keywords: None,
                validator: None,
                labels: Labels::empty(),
                pattern_capture_group: Some("capture_group".to_string()),
            }
        );
    }

    #[test]
    fn proximity_keywords_should_have_default() {
        // Missing keyword lists default to empty.
        let json_config = r#"{"look_ahead_character_count": 0}"#;
        let test: ProximityKeywordsConfig = serde_json::from_str(json_config).unwrap();
        assert_eq!(
            test,
            ProximityKeywordsConfig {
                look_ahead_character_count: 0,
                included_keywords: vec![],
                excluded_keywords: vec![]
            }
        );

        // Explicit `null` keyword lists default to empty as well.
        let json_config = r#"{"look_ahead_character_count": 0, "excluded_keywords": null, "included_keywords": null}"#;
        let test: ProximityKeywordsConfig = serde_json::from_str(json_config).unwrap();
        assert_eq!(
            test,
            ProximityKeywordsConfig {
                look_ahead_character_count: 0,
                included_keywords: vec![],
                excluded_keywords: vec![]
            }
        );
    }

    #[test]
    #[allow(deprecated)]
    fn test_third_party_active_checker() {
        // Test setting only the new field
        let http_config = CustomHttpConfig::default().with_endpoint("http://test.com".to_string());
        let validation_type = MatchValidationType::CustomHttp(http_config.clone());
        let rule_config = RootRuleConfig::new(RegexRuleConfig::new("123"))
            .third_party_active_checker(validation_type.clone());

        assert_eq!(
            rule_config.third_party_active_checker,
            Some(validation_type.clone())
        );
        assert_eq!(rule_config.match_validation_type, None);
        assert_eq!(
            rule_config.get_third_party_active_checker(),
            Some(&validation_type)
        );

        // Test setting a different checker type (AWS) through the same builder.
        // NOTE(review): this section used to be described as going through the deprecated
        // field, but it only exercises `third_party_active_checker` — confirm whether the
        // deprecated path still needs coverage.
        let aws_type = AwsType::AwsId;
        let validation_type2 = MatchValidationType::Aws(aws_type);
        let rule_config = RootRuleConfig::new(RegexRuleConfig::new("123"))
            .third_party_active_checker(validation_type2.clone());

        assert_eq!(
            rule_config.third_party_active_checker,
            Some(validation_type2.clone())
        );
        assert_eq!(
            rule_config.get_third_party_active_checker(),
            Some(&validation_type2)
        );

        // Test that get_third_party_active_checker returns the checker that was configured
        let rule_config = RootRuleConfig::new(RegexRuleConfig::new("123"))
            .third_party_active_checker(MatchValidationType::CustomHttp(http_config.clone()));

        assert_eq!(
            rule_config.get_third_party_active_checker(),
            Some(&MatchValidationType::CustomHttp(http_config.clone()))
        );
    }

    #[test]
    fn test_secondary_validator_enum_iter() {
        // Test that we can iterate over all SecondaryValidator variants
        let validators: Vec<SecondaryValidator> = SecondaryValidator::iter().collect();
        // Verify some variants
        assert!(validators.contains(&SecondaryValidator::GithubTokenChecksum));
        assert!(validators.contains(&SecondaryValidator::JwtExpirationChecker));
    }

    #[test]
    fn test_secondary_validator_are_sorted() {
        let validator_names: Vec<String> = SecondaryValidator::iter()
            .map(|a| a.as_ref().to_string())
            .collect();
        let mut sorted_validator_names = validator_names.clone();
        sorted_validator_names.sort();
        // The message spells out the expected order so a failure is directly actionable.
        assert_eq!(
            sorted_validator_names, validator_names,
            "Secondary validators should be sorted by alphabetical order, but it's not the case, expected order: {:?}",
            sorted_validator_names
        );
    }

    // The order has to be stable to pass linter checks. Otherwise, each instantiation will change the file
    #[test]
    fn test_jwt_claims_validator_config_serialization_order() {
        // Create a config with claims in non-alphabetical order
        let mut required_claims = BTreeMap::new();
        required_claims.insert("zzz".to_string(), ClaimRequirement::Present);
        required_claims.insert(
            "aaa".to_string(),
            ClaimRequirement::ExactValue("test".to_string()),
        );
        required_claims.insert(
            "mmm".to_string(),
            ClaimRequirement::RegexMatch(r"^test.*".to_string()),
        );

        let config = JwtClaimsValidatorConfig {
            required_claims,
            required_headers: BTreeMap::new(),
        };

        // Serialize multiple times to ensure stable order
        let serialized1 = serde_json::to_string(&config).unwrap();
        let serialized2 = serde_json::to_string(&config).unwrap();

        // Both serializations should be identical
        assert_eq!(serialized1, serialized2, "Serialization should be stable");

        // Keys should be in alphabetical order
        assert!(serialized1.find("aaa").unwrap() < serialized1.find("mmm").unwrap());
        assert!(serialized1.find("mmm").unwrap() < serialized1.find("zzz").unwrap());
    }
}