dd_sds/scanner/regex_rule/
config.rs

1use crate::proximity_keywords::compile_keywords_proximity_config;
2use crate::scanner::config::RuleConfig;
3use crate::scanner::metrics::RuleMetrics;
4use crate::scanner::regex_rule::compiled::RegexCompiledRule;
5use crate::scanner::regex_rule::regex_store::get_memoized_regex;
6use crate::secondary_validation::jwt_claims_validator::JwtClaimsValidatorConfig;
7use crate::validation::validate_and_create_regex;
8use crate::{CompiledRule, CreateScannerError, Labels};
9use serde::{Deserialize, Serialize};
10use serde_with::DefaultOnNull;
11use serde_with::serde_as;
12use std::sync::Arc;
13use strum::{AsRefStr, EnumIter};
14
/// Look-ahead character count given to the [`ProximityKeywordsConfig`] that
/// [`RegexRuleConfig::with_included_keywords`] creates when the rule has no
/// proximity-keyword configuration yet.
pub const DEFAULT_KEYWORD_LOOKAHEAD: usize = 30;
16
/// Configuration of a regex-based rule.
///
/// Implements [`RuleConfig`]: `convert_to_compiled_rule` turns this
/// configuration into a `RegexCompiledRule`.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RegexRuleConfig {
    /// Regex pattern source; it is validated and compiled when the rule is compiled.
    pub pattern: String,
    /// Optional proximity-keyword settings, compiled along with the rule when present.
    pub proximity_keywords: Option<ProximityKeywordsConfig>,
    /// Optional secondary validator, compiled along with the rule when present.
    pub validator: Option<SecondaryValidator>,
    /// Labels of this rule; they are combined with the scanner's labels when the
    /// rule is compiled. A missing or `null` value deserializes to the default.
    #[serde_as(deserialize_as = "DefaultOnNull")]
    #[serde(default)]
    pub labels: Labels,
}
27
28impl RegexRuleConfig {
29    pub fn new(pattern: &str) -> Self {
30        #[allow(deprecated)]
31        Self {
32            pattern: pattern.to_owned(),
33            proximity_keywords: None,
34            validator: None,
35            labels: Labels::default(),
36        }
37    }
38
39    pub fn with_pattern(&self, pattern: &str) -> Self {
40        self.mutate_clone(|x| x.pattern = pattern.to_string())
41    }
42
43    pub fn with_proximity_keywords(&self, proximity_keywords: ProximityKeywordsConfig) -> Self {
44        self.mutate_clone(|x| x.proximity_keywords = Some(proximity_keywords))
45    }
46
47    pub fn with_labels(&self, labels: Labels) -> Self {
48        self.mutate_clone(|x| x.labels = labels)
49    }
50
51    pub fn build(&self) -> Arc<dyn RuleConfig> {
52        Arc::new(self.clone())
53    }
54
55    fn mutate_clone(&self, modify: impl FnOnce(&mut Self)) -> Self {
56        let mut clone = self.clone();
57        modify(&mut clone);
58        clone
59    }
60
61    pub fn with_included_keywords(
62        &self,
63        keywords: impl IntoIterator<Item = impl AsRef<str>>,
64    ) -> Self {
65        let mut this = self.clone();
66        let mut config = self.get_or_create_proximity_keywords_config();
67        config.included_keywords = keywords
68            .into_iter()
69            .map(|x| x.as_ref().to_string())
70            .collect::<Vec<_>>();
71        this.proximity_keywords = Some(config);
72        this
73    }
74
75    pub fn with_validator(&self, validator: Option<SecondaryValidator>) -> Self {
76        let mut this = self.clone();
77        this.validator = validator;
78        this
79    }
80
81    fn get_or_create_proximity_keywords_config(&self) -> ProximityKeywordsConfig {
82        self.proximity_keywords
83            .clone()
84            .unwrap_or_else(|| ProximityKeywordsConfig {
85                look_ahead_character_count: DEFAULT_KEYWORD_LOOKAHEAD,
86                included_keywords: vec![],
87                excluded_keywords: vec![],
88            })
89    }
90}
91
92impl RuleConfig for RegexRuleConfig {
93    fn convert_to_compiled_rule(
94        &self,
95        rule_index: usize,
96        scanner_labels: Labels,
97    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
98        let regex = get_memoized_regex(&self.pattern, validate_and_create_regex)?;
99
100        let rule_labels = scanner_labels.clone_with_labels(self.labels.clone());
101
102        let (included_keywords, excluded_keywords) = self
103            .proximity_keywords
104            .as_ref()
105            .map(|config| compile_keywords_proximity_config(config, &rule_labels))
106            .unwrap_or(Ok((None, None)))?;
107
108        Ok(Box::new(RegexCompiledRule {
109            rule_index,
110            regex,
111            included_keywords,
112            excluded_keywords,
113            validator: self.validator.clone().map(|x| x.compile()),
114            metrics: RuleMetrics::new(&rule_labels),
115        }))
116    }
117}
118
/// Proximity-keyword settings attached to a [`RegexRuleConfig`].
///
/// Only `look_ahead_character_count` is required when deserializing; both
/// keyword lists fall back to an empty list when they are missing or `null`.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct ProximityKeywordsConfig {
    /// Look-ahead size, in characters.
    // NOTE(review): presumably the window searched for keywords near a match —
    // confirm against the proximity-keywords implementation.
    pub look_ahead_character_count: usize,

    /// Keywords to include; a missing or `null` value deserializes to an empty list.
    #[serde_as(deserialize_as = "DefaultOnNull")]
    #[serde(default)]
    pub included_keywords: Vec<String>,

    /// Keywords to exclude; a missing or `null` value deserializes to an empty list.
    #[serde_as(deserialize_as = "DefaultOnNull")]
    #[serde(default)]
    pub excluded_keywords: Vec<String>,
}
132
/// Secondary validators that can be attached to a [`RegexRuleConfig`].
///
/// Serialized with serde's internally tagged representation: the variant name
/// is stored under the `"type"` key.
///
/// Keep the variants in alphabetical order — the
/// `test_secondary_validator_are_sorted` test in this file fails otherwise.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, EnumIter, AsRefStr)]
#[serde(tag = "type")]
pub enum SecondaryValidator {
    AbaRtnChecksum,
    BrazilianCnpjChecksum,
    BrazilianCpfChecksum,
    BtcChecksum,
    BulgarianEGNChecksum,
    ChineseIdChecksum,
    CoordinationNumberChecksum,
    CzechPersonalIdentificationNumberChecksum,
    CzechTaxIdentificationNumberChecksum,
    DutchBsnChecksum,
    DutchPassportChecksum,
    EthereumChecksum,
    FinnishHetuChecksum,
    FranceNifChecksum,
    FranceSsnChecksum,
    GermanIdsChecksum,
    GermanSvnrChecksum,
    GithubTokenChecksum,
    GreekTinChecksum,
    HungarianTinChecksum,
    IbanChecker,
    IrishPpsChecksum,
    ItalianNationalIdChecksum,
    /// The only variant that carries data: its own `config`.
    JwtClaimsValidator { config: JwtClaimsValidatorConfig },
    JwtExpirationChecker,
    LatviaNationalIdChecksum,
    LithuanianPersonalIdentificationNumberChecksum,
    LuhnChecksum,
    LuxembourgIndividualNINChecksum,
    Mod11_10checksum,
    Mod11_2checksum,
    Mod1271_36Checksum,
    Mod27_26checksum,
    Mod37_2checksum,
    Mod37_36checksum,
    Mod661_26checksum,
    Mod97_10checksum,
    MoneroAddress,
    NhsCheckDigit,
    NirChecksum,
    PolishNationalIdChecksum,
    PolishNipChecksum,
    PortugueseTaxIdChecksum,
    RodneCisloNumberChecksum,
    RomanianPersonalNumericCode,
    SlovenianPINChecksum,
    SpanishDniChecksum,
    SpanishNussChecksum,
    SwedenPINChecksum,
}
186
// Unit tests for the regex rule configuration types declared in this file.
#[cfg(test)]
mod test {
    use crate::{AwsType, CustomHttpConfig, MatchValidationType, RootRuleConfig};
    use strum::IntoEnumIterator;

    use super::*;

    // `with_pattern` must return a configuration carrying the new pattern.
    #[test]
    fn should_override_pattern() {
        let rule_config = RegexRuleConfig::new("123").with_pattern("456");
        assert_eq!(rule_config.pattern, "456");
    }

    // `new` must leave every field except `pattern` at its empty value.
    #[test]
    #[allow(deprecated)]
    fn should_have_default() {
        let rule_config = RegexRuleConfig::new("123");
        assert_eq!(
            rule_config,
            RegexRuleConfig {
                pattern: "123".to_string(),
                proximity_keywords: None,
                validator: None,
                labels: Labels::empty(),
            }
        );
    }

    // Keyword lists must deserialize to empty vectors both when the keys are
    // missing and when they are explicitly `null`.
    #[test]
    fn proximity_keywords_should_have_default() {
        // Keys missing entirely.
        let json_config = r#"{"look_ahead_character_count": 0}"#;
        let test: ProximityKeywordsConfig = serde_json::from_str(json_config).unwrap();
        assert_eq!(
            test,
            ProximityKeywordsConfig {
                look_ahead_character_count: 0,
                included_keywords: vec![],
                excluded_keywords: vec![]
            }
        );

        // Keys present but `null`.
        let json_config = r#"{"look_ahead_character_count": 0, "excluded_keywords": null, "included_keywords": null}"#;
        let test: ProximityKeywordsConfig = serde_json::from_str(json_config).unwrap();
        assert_eq!(
            test,
            ProximityKeywordsConfig {
                look_ahead_character_count: 0,
                included_keywords: vec![],
                excluded_keywords: vec![]
            }
        );
    }

    #[test]
    #[allow(deprecated)]
    fn test_third_party_active_checker() {
        // Setting only `third_party_active_checker` must leave the
        // `match_validation_type` field at `None`.
        let http_config = CustomHttpConfig::default().with_endpoint("http://test.com".to_string());
        let validation_type = MatchValidationType::CustomHttp(http_config.clone());
        let rule_config = RootRuleConfig::new(RegexRuleConfig::new("123"))
            .third_party_active_checker(validation_type.clone());

        assert_eq!(
            rule_config.third_party_active_checker,
            Some(validation_type.clone())
        );
        assert_eq!(rule_config.match_validation_type, None);
        assert_eq!(
            rule_config.get_third_party_active_checker(),
            Some(&validation_type)
        );

        // Same check with an AWS validation type.
        // NOTE(review): the previous comment said this section sets the value
        // through the deprecated field, but the code calls
        // `third_party_active_checker` again — confirm which setter was intended.
        let aws_type = AwsType::AwsId;
        let validation_type2 = MatchValidationType::Aws(aws_type);
        let rule_config = RootRuleConfig::new(RegexRuleConfig::new("123"))
            .third_party_active_checker(validation_type2.clone());

        assert_eq!(
            rule_config.third_party_active_checker,
            Some(validation_type2.clone())
        );
        assert_eq!(
            rule_config.get_third_party_active_checker(),
            Some(&validation_type2)
        );

        // `get_third_party_active_checker` must return the value that was set.
        // NOTE(review): the previous comment referred to
        // `get_match_validation_type` prioritizing `third_party_active_checker`,
        // but only `third_party_active_checker` is set here, so no
        // prioritization is exercised.
        let rule_config = RootRuleConfig::new(RegexRuleConfig::new("123"))
            .third_party_active_checker(MatchValidationType::CustomHttp(http_config.clone()));

        assert_eq!(
            rule_config.get_third_party_active_checker(),
            Some(&MatchValidationType::CustomHttp(http_config.clone()))
        );
    }

    #[test]
    fn test_secondary_validator_enum_iter() {
        // Collect every variant yielded by the derived `EnumIter` iterator.
        let validators: Vec<SecondaryValidator> = SecondaryValidator::iter().collect();
        // Spot-check two of the variants.
        assert!(validators.contains(&SecondaryValidator::GithubTokenChecksum));
        assert!(validators.contains(&SecondaryValidator::JwtExpirationChecker));
    }

    // The variant names, in declaration order, must already be sorted.
    #[test]
    fn test_secondary_validator_are_sorted() {
        let validator_names: Vec<String> = SecondaryValidator::iter()
            .map(|a| a.as_ref().to_string())
            .collect();
        let mut sorted_validator_names = validator_names.clone();
        sorted_validator_names.sort();
        // On failure `left` is the expected (sorted) order and `right` the
        // declared one.
        assert_eq!(
            sorted_validator_names, validator_names,
            "Secondary validators should be sorted by alphabetical order, but it's not the case, expected order:"
        );
    }

    // The order has to be stable to pass linter checks. Otherwise, each instantiation will change the file
    #[test]
    fn test_jwt_claims_validator_config_serialization_order() {
        use crate::secondary_validation::jwt_claims_validator::ClaimRequirement;
        use std::collections::BTreeMap;

        // Insert the claims in non-alphabetical order; the `BTreeMap` keeps
        // its keys sorted regardless of insertion order.
        let mut required_claims = BTreeMap::new();
        required_claims.insert("zzz".to_string(), ClaimRequirement::Present);
        required_claims.insert(
            "aaa".to_string(),
            ClaimRequirement::ExactValue("test".to_string()),
        );
        required_claims.insert(
            "mmm".to_string(),
            ClaimRequirement::RegexMatch(r"^test.*".to_string()),
        );

        let config = JwtClaimsValidatorConfig {
            required_claims,
            required_headers: std::collections::BTreeMap::new(),
        };

        // Serialize twice to check that the output does not vary between runs.
        let serialized1 = serde_json::to_string(&config).unwrap();
        let serialized2 = serde_json::to_string(&config).unwrap();

        // Both serializations should be identical
        assert_eq!(serialized1, serialized2, "Serialization should be stable");

        // The claim keys must appear in alphabetical order in the JSON text.
        assert!(serialized1.find("aaa").unwrap() < serialized1.find("mmm").unwrap());
        assert!(serialized1.find("mmm").unwrap() < serialized1.find("zzz").unwrap());
    }
}