dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3
4use crate::match_validation::{
5    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
6    match_validator::MatchValidator,
7};
8use rayon::prelude::*;
9
10use error::{MatchValidationError, MatchValidatorCreationError};
11
12use crate::observability::labels::Labels;
13use crate::rule_match::{InternalRuleMatch, RuleMatch};
14use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
15pub use crate::secondary_validation::Validator;
16use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError};
17use std::ops::Deref;
18use std::sync::Arc;
19
20use self::metrics::ScannerMetrics;
21use crate::scanner::config::RuleConfig;
22use crate::scanner::regex_rule::compiled::RegexCompiledRule;
23use crate::scanner::regex_rule::{access_regex_caches, RegexCaches};
24use crate::scanner::scope::Scope;
25pub use crate::scanner::shared_data::SharedData;
26use crate::stats::GLOBAL_STATS;
27use ahash::{AHashMap, AHashSet};
28use regex_automata::Match;
29use serde::{Deserialize, Serialize};
30use serde_with::serde_as;
31
32pub mod config;
33pub mod error;
34pub mod metrics;
35pub mod regex_rule;
36pub mod scope;
37pub mod shared_data;
38pub mod shared_pool;
39
40#[cfg(test)]
41mod test;
42
/// A span reported by a rule for the string it was asked to scan.
///
/// The scanner copies `start` and `end` into the `utf8_start` / `utf8_end`
/// fields of its internal matches and later slices the content with
/// `start..end`, so `end` is used as an exclusive bound.
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends; the scanner panics if it equals `start`.
    pub end: usize,
}
47
/// Receiver for the matches a rule finds while scanning a string.
///
/// `T` is the value handed back for every emitted match; it defaults to `()`.
pub trait MatchEmitter<T = ()> {
    /// Reports one match and returns the emitter's response for it.
    fn emit(&mut self, string_match: StringMatch) -> T;
}
51
52// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
53// struct that implements MatchEmitter)
54impl<F, T> MatchEmitter<T> for F
55where
56    F: FnMut(StringMatch) -> T,
57{
58    fn emit(&mut self, string_match: StringMatch) -> T {
59        // This just calls the closure (itself)
60        (self)(string_match)
61    }
62}
63
/// Settings common to every rule, wrapped around the rule-specific
/// configuration `T`.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Action attached to the matches of this rule.
    pub match_action: MatchAction,
    /// Scope of the rule; `Scope::default()` is used when the field is absent
    /// from the serialized form.
    #[serde(default)]
    pub scope: Scope,
    /// Legacy setting, only consulted when `third_party_active_checker` is `None`.
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    /// Match validation setting; takes precedence over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    /// Rule-specific configuration; flattened into this struct when (de)serialized.
    #[serde(flatten)]
    pub inner: T,
}
76
77impl<T> RootRuleConfig<T>
78where
79    T: RuleConfig + 'static,
80{
81    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
82        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
83    }
84
85    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
86        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
87    }
88}
89
90impl<T> RootRuleConfig<T> {
91    pub fn new(inner: T) -> Self {
92        #[allow(deprecated)]
93        Self {
94            match_action: MatchAction::None,
95            scope: Scope::all(),
96            match_validation_type: None,
97            third_party_active_checker: None,
98            inner,
99        }
100    }
101
102    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
103        #[allow(deprecated)]
104        RootRuleConfig {
105            match_action: self.match_action,
106            scope: self.scope,
107            match_validation_type: self.match_validation_type,
108            third_party_active_checker: self.third_party_active_checker,
109            inner: func(self.inner),
110        }
111    }
112
113    pub fn match_action(mut self, action: MatchAction) -> Self {
114        self.match_action = action;
115        self
116    }
117
118    pub fn scope(mut self, scope: Scope) -> Self {
119        self.scope = scope;
120        self
121    }
122
123    pub fn third_party_active_checker(
124        mut self,
125        match_validation_type: MatchValidationType,
126    ) -> Self {
127        self.third_party_active_checker = Some(match_validation_type);
128        self
129    }
130
131    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
132        #[allow(deprecated)]
133        self.third_party_active_checker
134            .as_ref()
135            .or(self.match_validation_type.as_ref())
136    }
137}
138
139impl<T> Deref for RootRuleConfig<T> {
140    type Target = T;
141
142    fn deref(&self) -> &Self::Target {
143        &self.inner
144    }
145}
/// A compiled rule together with the root-level settings copied from its
/// `RootRuleConfig` when the scanner is built.
pub struct RootCompiledRule {
    /// The rule-specific compiled implementation.
    pub inner: Box<dyn CompiledRule>,
    /// Scope the rule applies to.
    pub scope: Scope,
    /// Action attached to the matches of this rule.
    pub match_action: MatchAction,
    /// Match validation configured for this rule, if any.
    pub match_validation_type: Option<MatchValidationType>,
}
152
153impl RootCompiledRule {
154    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
155        self.match_validation_type
156            .as_ref()
157            .map(|x| x.get_internal_match_validation_type())
158    }
159}
160
161impl Deref for RootCompiledRule {
162    type Target = dyn CompiledRule;
163
164    fn deref(&self) -> &Self::Target {
165        self.inner.as_ref()
166    }
167}
168
169// This is the public trait that is used to define the behavior of a compiled rule.
170pub trait CompiledRule: Send + Sync {
171    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
172        // by default, no per-scanner data is initialized
173    }
174
175    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
176        // by default, no per-string data is initialized
177    }
178
179    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
180        // by default, no per-event data is initialized
181    }
182
183    #[allow(clippy::too_many_arguments)]
184    fn get_string_matches(
185        &self,
186        content: &str,
187        path: &Path,
188        regex_caches: &mut RegexCaches,
189        per_string_data: &mut SharedData,
190        per_scanner_data: &SharedData,
191        per_event_data: &mut SharedData,
192        exclusion_check: &ExclusionCheck<'_>,
193        excluded_matches: &mut AHashSet<String>,
194        match_emitter: &mut dyn MatchEmitter,
195        wildcard_indices: Option<&Vec<(usize, usize)>>,
196    );
197
198    /// Determines if this rule has a match, without determining the exact position,
199    /// or finding multiple matches. The default implementation just calls
200    /// `get_string_matches`, but this can be overridden with a more efficient
201    /// implementation if applicable
202    #[allow(clippy::too_many_arguments)]
203    fn has_string_match(
204        &self,
205        content: &str,
206        path: &Path,
207        regex_caches: &mut RegexCaches,
208        per_string_data: &mut SharedData,
209        per_scanner_data: &SharedData,
210        per_event_data: &mut SharedData,
211        exclusion_check: &ExclusionCheck<'_>,
212        excluded_matches: &mut AHashSet<String>,
213        wildcard_indices: Option<&Vec<(usize, usize)>>,
214    ) -> bool {
215        let mut found_match = false;
216        let mut match_emitter = |_| found_match = true;
217        self.get_string_matches(
218            content,
219            path,
220            regex_caches,
221            per_string_data,
222            per_scanner_data,
223            per_event_data,
224            exclusion_check,
225            excluded_matches,
226            &mut match_emitter,
227            wildcard_indices,
228        );
229        found_match
230    }
231
232    // Whether a match from this rule should be excluded (marked as a false-positive)
233    // if the content of this match was found in a match from an excluded scope
234    fn should_exclude_multipass_v0(&self) -> bool {
235        // default is to NOT use Multi-pass V0
236        false
237    }
238
239    fn on_excluded_match_multipass_v0(&self) {
240        // default is to do nothing
241    }
242}
243
244impl<T> RuleConfig for Box<T>
245where
246    T: RuleConfig + ?Sized,
247{
248    fn convert_to_compiled_rule(
249        &self,
250        rule_index: usize,
251        labels: Labels,
252    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
253        self.as_ref().convert_to_compiled_rule(rule_index, labels)
254    }
255}
256
/// Feature switches chosen through `ScannerBuilder` and stored on the `Scanner`.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Forwarded to `ScopedRuleSet::with_implicit_index_wildcards` when the
    /// scanner is built.
    pub add_implicit_index_wildcards: bool,
    /// Enables Multi-pass V0: matches whose content equals an excluded match
    /// are dropped during a scan.
    pub multipass_v0_enabled: bool,
    /// When set, the matched text is copied into `RuleMatch::match_value`;
    /// `Scanner::validate_matches` returns an error unless this is enabled.
    pub return_matches: bool,
    // This is a temporary flag to disable failed rules (instead of fail the entire scanner)
    // for regex rules that match an empty string
    pub skip_rules_with_regex_matching_empty_string: bool,
}
266
267impl Default for ScannerFeatures {
268    fn default() -> Self {
269        Self {
270            add_implicit_index_wildcards: false,
271            multipass_v0_enabled: true,
272            return_matches: false,
273            skip_rules_with_regex_matching_empty_string: false,
274        }
275    }
276}
277
/// Per-scan options accepted by `Scanner::scan_with_options`.
pub struct ScanOptions {
    /// Indices of the rules that should be skipped for this scan.
    /// This list is expected to be small (<10): it is searched linearly, so a
    /// longer list will impact performance.
    pub blocked_rules_idx: Vec<usize>,
    /// Per path, a list of `(start, end)` index tuples that should be treated
    /// as wildcards (for the message key only).
    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
285
286impl Default for ScanOptions {
287    fn default() -> Self {
288        Self {
289            blocked_rules_idx: vec![],
290            wildcarded_indices: AHashMap::new(),
291        }
292    }
293}
294
/// Builder for [`ScanOptions`].
pub struct ScanOptionBuilder {
    /// Value for `ScanOptions::blocked_rules_idx`.
    blocked_rules_idx: Vec<usize>,
    /// Value for `ScanOptions::wildcarded_indices`.
    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
299
300impl ScanOptionBuilder {
301    pub fn new() -> Self {
302        Self {
303            blocked_rules_idx: vec![],
304            wildcarded_indices: AHashMap::new(),
305        }
306    }
307
308    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
309        self.blocked_rules_idx = blocked_rules_idx;
310        self
311    }
312
313    pub fn with_wildcarded_indices(
314        mut self,
315        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
316    ) -> Self {
317        self.wildcarded_indices = wildcarded_indices;
318        self
319    }
320
321    pub fn build(self) -> ScanOptions {
322        ScanOptions {
323            blocked_rules_idx: self.blocked_rules_idx,
324            wildcarded_indices: self.wildcarded_indices,
325        }
326    }
327}
328
/// A set of compiled rules that can be run against events.
///
/// Built through [`ScannerBuilder`] (see [`Scanner::builder`]).
pub struct Scanner {
    /// Compiled rules; the `rule_index` of a match is an index into this list.
    rules: Vec<RootCompiledRule>,
    /// Rule scopes, used to visit the string / rule combinations of an event.
    scoped_ruleset: ScopedRuleSet,
    /// Feature switches selected when the scanner was built.
    scanner_features: ScannerFeatures,
    /// Counters updated on every scan (duration, scanned events, matches).
    metrics: ScannerMetrics,
    /// Labels given to the builder; passed to rules when per-string data is initialized.
    labels: Labels,
    /// One validator per internal match validation type used by the rules.
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    /// Data initialized once by the rules at build time and shared, read-only, with every scan.
    per_scanner_data: SharedData,
}
338
339impl Scanner {
    /// Returns a [`ScannerBuilder`] for the given rule configurations.
    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder {
        ScannerBuilder::new(rules)
    }
343
344    pub fn scan_with_options<E: Event>(
345        &self,
346        event: &mut E,
347        options: ScanOptions,
348    ) -> Vec<RuleMatch> {
349        // All matches, after some (but not all) false-positives have been removed.
350        // This is a vec of vecs, where each inner vec is a set of matches for a single path.
351        let mut rule_matches_list = vec![];
352
353        let mut excluded_matches = AHashSet::new();
354
355        // Measure detection time
356        let start = std::time::Instant::now();
357        access_regex_caches(|regex_caches| {
358            self.scoped_ruleset.visit_string_rule_combinations(
359                event,
360                ScannerContentVisitor {
361                    scanner: self,
362                    regex_caches,
363                    rule_matches: &mut rule_matches_list,
364                    blocked_rules: &options.blocked_rules_idx,
365                    excluded_matches: &mut excluded_matches,
366                    per_event_data: SharedData::new(),
367                    wildcarded_indexes: &options.wildcarded_indices,
368                },
369            );
370        });
371
372        let mut output_rule_matches = vec![];
373
374        for (path, rule_matches) in &mut rule_matches_list {
375            // All rule matches in each inner list are for a single path, so they can be processed independently.
376            event.visit_string_mut(path, |content| {
377                if self.scanner_features.multipass_v0_enabled {
378                    // Now that the `excluded_matches` set is fully populated, filter out any matches
379                    // that are the same as excluded matches (also known as "Multi-pass V0")
380                    rule_matches.retain(|rule_match| {
381                        if self.rules[rule_match.rule_index]
382                            .inner
383                            .should_exclude_multipass_v0()
384                        {
385                            let is_false_positive = excluded_matches
386                                .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
387                            if is_false_positive && self.scanner_features.multipass_v0_enabled {
388                                self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
389                            }
390                            !is_false_positive
391                        } else {
392                            true
393                        }
394                    });
395                }
396
397                self.sort_and_remove_overlapping_rules::<E::Encoding>(rule_matches);
398
399                let will_mutate = rule_matches
400                    .iter()
401                    .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
402
403                self.apply_match_actions(content, path, rule_matches, &mut output_rule_matches);
404
405                will_mutate
406            });
407        }
408        // Record detection time
409        self.metrics
410            .duration_ns
411            .increment(start.elapsed().as_nanos() as u64);
412        // Add number of scanned events
413        self.metrics.num_scanned_events.increment(1);
414        // Add number of matches
415        self.metrics
416            .match_count
417            .increment(output_rule_matches.len() as u64);
418
419        output_rule_matches
420    }
421
    /// Scans the given event with the rules configured in the scanner, using
    /// the default [`ScanOptions`].
    ///
    /// `event` is a mutable reference to the event that should be scanned (it
    /// must implement the `Event` trait). The return value is a list of
    /// `RuleMatch` objects, which contain information about the matches that
    /// were found.
    pub fn scan<E: Event>(&self, event: &mut E) -> Vec<RuleMatch> {
        self.scan_with_options(event, ScanOptions::default())
    }
428
429    pub fn validate_matches(
430        &self,
431        rule_matches: &mut Vec<RuleMatch>,
432    ) -> Result<(), MatchValidationError> {
433        if !self.scanner_features.return_matches {
434            return Err(MatchValidationError::NoMatchValidationType);
435        }
436        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
437        let mut match_validator_rule_match_per_type = AHashMap::new();
438
439        let mut validated_rule_matches = vec![];
440
441        for mut rule_match in rule_matches.drain(..) {
442            let rule = &self.rules[rule_match.rule_index];
443            if let Some(match_validation_type) = rule.internal_match_validation_type() {
444                match_validator_rule_match_per_type
445                    .entry(match_validation_type)
446                    .or_insert_with(Vec::new)
447                    .push(rule_match)
448            } else {
449                // There is no match validator for this rule, so mark it as not available.
450                rule_match.match_status.merge(MatchStatus::NotAvailable);
451                validated_rule_matches.push(rule_match);
452            }
453        }
454
455        match_validator_rule_match_per_type.par_iter_mut().for_each(
456            |(match_validation_type, matches_per_type)| {
457                let match_validator = self.match_validators_per_type.get(match_validation_type);
458                if let Some(match_validator) = match_validator {
459                    match_validator
460                        .as_ref()
461                        .validate(matches_per_type, &self.rules)
462                }
463            },
464        );
465
466        // Refill the rule_matches with the validated matches
467        for (_, mut matches) in match_validator_rule_match_per_type {
468            validated_rule_matches.append(&mut matches);
469        }
470
471        // Sort rule_matches by start index
472        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
473        *rule_matches = validated_rule_matches;
474        Ok(())
475    }
476
477    /// Apply mutations from actions, and shift indices to match the mutated values.
478    /// This assumes the matches are all from the content given, and are sorted by start index.
479    fn apply_match_actions<E: Encoding>(
480        &self,
481        content: &mut String,
482        path: &Path<'static>,
483        rule_matches: &mut [InternalRuleMatch<E>],
484        output_rule_matches: &mut Vec<RuleMatch>,
485    ) {
486        let mut utf8_byte_delta: isize = 0;
487        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
488
489        for rule_match in rule_matches {
490            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
491                content,
492                path.clone(),
493                rule_match,
494                &mut utf8_byte_delta,
495                &mut custom_index_delta,
496            ));
497        }
498    }
499
500    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
501    fn apply_match_actions_for_string<E: Encoding>(
502        &self,
503        content: &mut String,
504        path: Path<'static>,
505        rule_match: &InternalRuleMatch<E>,
506        // The current difference in length between the original and mutated string
507        utf8_byte_delta: &mut isize,
508
509        // The difference between the custom index on the original string and the mutated string
510        custom_index_delta: &mut <E>::IndexShift,
511    ) -> RuleMatch {
512        let rule = &self.rules[rule_match.rule_index];
513
514        let custom_start =
515            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
516                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
517
518        let mut matched_content_copy = None;
519
520        if self.scanner_features.return_matches {
521            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
522            let mutated_utf8_match_start =
523                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
524            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
525
526            // Matches for mutating rules must have valid indices
527            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
528            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
529
530            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
531            matched_content_copy = Some(matched_content.to_string());
532        }
533
534        if rule.match_action.is_mutating() {
535            let mutated_utf8_match_start =
536                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
537            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
538
539            // Matches for mutating rules must have valid indices
540            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
541            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
542
543            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
544            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
545                let before_replacement = &matched_content[replacement.start..replacement.end];
546
547                // update indices to match the new mutated content
548                <E>::adjust_shift(
549                    custom_index_delta,
550                    before_replacement,
551                    &replacement.replacement,
552                );
553                *utf8_byte_delta +=
554                    replacement.replacement.len() as isize - before_replacement.len() as isize;
555
556                let replacement_start = mutated_utf8_match_start + replacement.start;
557                let replacement_end = mutated_utf8_match_start + replacement.end;
558                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
559            }
560        }
561
562        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
563        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
564            + shift_offset) as usize;
565
566        let rule = &self.rules[rule_match.rule_index];
567
568        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
569            MatchStatus::NotChecked
570        } else {
571            MatchStatus::NotAvailable
572        };
573
574        RuleMatch {
575            rule_index: rule_match.rule_index,
576            path,
577            replacement_type: rule.match_action.replacement_type(),
578            start_index: custom_start,
579            end_index_exclusive: custom_end,
580            shift_offset,
581            match_value: matched_content_copy,
582            match_status,
583        }
584    }
585
    /// Removes overlapping matches from `rule_matches` and leaves the
    /// survivors sorted by `utf8_start`.
    ///
    /// Matches are considered in priority order: mutating rules first, then
    /// earlier start offset, then longer match, then lower rule index. A match
    /// is dropped when it overlaps a match that was already retained.
    fn sort_and_remove_overlapping_rules<E: Encoding>(
        &self,
        rule_matches: &mut Vec<InternalRuleMatch<E>>,
    ) {
        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
        // Be very careful if this function is modified.

        rule_matches.sort_unstable_by(|a, b| {
            // Mutating rules are a higher priority (earlier in the list)
            let ord = self.rules[a.rule_index]
                .match_action
                .is_mutating()
                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
                .reverse();

            // Earlier start offset
            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));

            // Longer matches
            let ord = ord.then(a.len().cmp(&b.len()).reverse());

            // Matches from earlier rules
            let ord = ord.then(a.rule_index.cmp(&b.rule_index));

            // swap the order of everything so matches can be efficiently popped off the back as they are processed
            ord.reverse()
        });

        // Matches kept so far, in the order they were accepted.
        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];

        // Popping from the back yields the highest-priority remaining match.
        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
            if self.rules[rule_match.rule_index].match_action.is_mutating() {
                // Mutating rules are kept only if they don't overlap with a previous rule.
                if let Some(last) = retained_rules.last() {
                    if last.utf8_end > rule_match.utf8_start {
                        continue;
                    }
                }
            } else {
                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
                // this needs to check all retained matches (instead of just the last one)
                for retained_rule in &retained_rules {
                    if retained_rule.utf8_start < rule_match.utf8_end
                        && retained_rule.utf8_end > rule_match.utf8_start
                    {
                        continue 'rule_matches;
                    }
                }
            };
            retained_rules.push(rule_match);
        }

        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);

        *rule_matches = retained_rules;
    }
643}
644
645impl Drop for Scanner {
646    fn drop(&mut self) {
647        let stats = &*GLOBAL_STATS;
648        stats.scanner_deletions.increment(1);
649        stats.decrement_total_scanners();
650    }
651}
652
/// Builder for [`Scanner`]; borrows the rule configurations for `'a`.
#[derive(Default)]
pub struct ScannerBuilder<'a> {
    /// Configurations of the rules to compile, in rule-index order.
    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
    /// Labels passed to the compiled rules and to the scanner metrics.
    labels: Labels,
    /// Feature switches collected by the `with_*` methods.
    scanner_features: ScannerFeatures,
}
659
660impl ScannerBuilder<'_> {
661    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder {
662        ScannerBuilder {
663            rules,
664            labels: Labels::empty(),
665            scanner_features: ScannerFeatures::default(),
666        }
667    }
668
669    pub fn labels(mut self, labels: Labels) -> Self {
670        self.labels = labels;
671        self
672    }
673
674    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
675        self.scanner_features.add_implicit_index_wildcards = value;
676        self
677    }
678
679    pub fn with_return_matches(mut self, value: bool) -> Self {
680        self.scanner_features.return_matches = value;
681        self
682    }
683
684    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
685    /// Multipass V0 saves matches from excluded scopes, and marks any identical
686    /// matches in included scopes as a false positive.
687    pub fn with_multipass_v0(mut self, value: bool) -> Self {
688        self.scanner_features.multipass_v0_enabled = value;
689        self
690    }
691
692    pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
693        self.scanner_features
694            .skip_rules_with_regex_matching_empty_string = value;
695        self
696    }
697
698    pub fn build(self) -> Result<Scanner, CreateScannerError> {
699        let mut match_validators_per_type = AHashMap::new();
700
701        for rule in self.rules.iter() {
702            if let Some(match_validation_type) = &rule.get_third_party_active_checker() {
703                if match_validation_type.can_create_match_validator() {
704                    let internal_type = match_validation_type.get_internal_match_validation_type();
705                    let match_validator = match_validation_type.into_match_validator();
706                    if let Ok(match_validator) = match_validator {
707                        if !match_validators_per_type.contains_key(&internal_type) {
708                            match_validators_per_type.insert(internal_type, match_validator);
709                        }
710                    } else {
711                        return Err(CreateScannerError::InvalidMatchValidator(
712                            MatchValidatorCreationError::InternalError,
713                        ));
714                    }
715                }
716            }
717        }
718
719        let compiled_rules = self
720            .rules
721            .iter()
722            .enumerate()
723            .filter_map(|(rule_index, config)| {
724                let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
725                    Ok(inner) => Ok(inner),
726                    Err(err) => {
727                        if self
728                            .scanner_features
729                            .skip_rules_with_regex_matching_empty_string
730                            && err
731                                == CreateScannerError::InvalidRegex(
732                                    RegexValidationError::MatchesEmptyString,
733                                )
734                        {
735                            // this is a temporary feature to skip rules that should be considered invalid.
736                            #[allow(clippy::print_stdout)]
737                            {
738                                println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
739                            }
740                            return None;
741                        } else {
742                            Err(err)
743                        }
744                    }
745                };
746                Some((config, inner))
747            })
748            .map(|(config, inner)| {
749                config.match_action.validate()?;
750                Ok(RootCompiledRule {
751                    inner: inner?,
752                    scope: config.scope.clone(),
753                    match_action: config.match_action.clone(),
754                    match_validation_type: config.get_third_party_active_checker().cloned(),
755                })
756            })
757            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
758
759        let mut per_scanner_data = SharedData::new();
760
761        compiled_rules.iter().for_each(|rule| {
762            rule.init_per_scanner_data(&mut per_scanner_data);
763        });
764
765        let scoped_ruleset = ScopedRuleSet::new(
766            &compiled_rules
767                .iter()
768                .map(|rule| rule.scope.clone())
769                .collect::<Vec<_>>(),
770        )
771        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
772
773        {
774            let stats = &*GLOBAL_STATS;
775            stats.scanner_creations.increment(1);
776            stats.increment_total_scanners();
777        }
778
779        Ok(Scanner {
780            rules: compiled_rules,
781            scoped_ruleset,
782            scanner_features: self.scanner_features,
783            metrics: ScannerMetrics::new(&self.labels),
784            match_validators_per_type,
785            labels: self.labels,
786            per_scanner_data,
787        })
788    }
789}
790
/// State carried through a single scan while the scoped ruleset visits the
/// strings of an event.
struct ScannerContentVisitor<'a, E: Encoding> {
    /// Scanner whose rules are being applied.
    scanner: &'a Scanner,
    /// Regex caches handed to every rule.
    regex_caches: &'a mut RegexCaches,
    /// Output: matches collected so far, stored as `(path, matches)` pairs.
    rule_matches: &'a mut Vec<(crate::Path<'static>, Vec<InternalRuleMatch<E>>)>,
    // Rules that shall be skipped for this scan
    // This list shall be small (<10), so a linear search is acceptable
    blocked_rules: &'a Vec<usize>,
    /// Set of excluded match strings, passed mutably to every rule during the scan.
    excluded_matches: &'a mut AHashSet<String>,
    /// Data shared between rules for the duration of one event.
    per_event_data: SharedData,
    /// Wildcarded `(start, end)` indices per path, taken from the scan options.
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
802
803impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
804    fn visit_content<'b>(
805        &'b mut self,
806        path: &Path<'a>,
807        content: &str,
808        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
809        exclusion_check: ExclusionCheck<'b>,
810    ) -> bool {
811        // matches for a single path
812        let mut path_rules_matches = vec![];
813
814        // Create a map of per rule type data that can be shared between rules of the same type
815        let mut per_string_data = SharedData::new();
816        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
817
818        rule_visitor.visit_rule_indices(|rule_index| {
819            if self.blocked_rules.contains(&rule_index) {
820                return;
821            }
822            let rule = &self.scanner.rules[rule_index];
823            {
824                // creating the emitter is basically free, it will get mostly optimized away
825                let mut emitter = |rule_match: StringMatch| {
826                    // This should never happen, but to ensure no empty match is ever generated
827                    // (which may cause an infinite loop), this will panic instead.
828                    assert_ne!(rule_match.start, rule_match.end, "empty match detected");
829
830                    path_rules_matches.push(InternalRuleMatch {
831                        rule_index,
832                        utf8_start: rule_match.start,
833                        utf8_end: rule_match.end,
834                        custom_start: E::zero_index(),
835                        custom_end: E::zero_index(),
836                    });
837                };
838
839                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
840
841                // TODO: move this somewhere higher?
842                rule.init_per_event_data(&mut self.per_event_data);
843
844                rule.get_string_matches(
845                    content,
846                    path,
847                    self.regex_caches,
848                    &mut per_string_data,
849                    &self.scanner.per_scanner_data,
850                    &mut self.per_event_data,
851                    &exclusion_check,
852                    self.excluded_matches,
853                    &mut emitter,
854                    wildcard_indices_per_path,
855                );
856            }
857        });
858
859        // calculate_indices requires that matches are sorted by start index
860        path_rules_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
861
862        E::calculate_indices(
863            content,
864            path_rules_matches
865                .iter_mut()
866                .map(|rule_match: &mut InternalRuleMatch<E>| EncodeIndices {
867                    utf8_start: rule_match.utf8_start,
868                    utf8_end: rule_match.utf8_end,
869                    custom_start: &mut rule_match.custom_start,
870                    custom_end: &mut rule_match.custom_end,
871                }),
872        );
873
874        // If there are any matches, the string will need to be accessed to check for false positives from
875        // excluded matches, any to potentially mutate the string.
876        let has_match = !path_rules_matches.is_empty();
877
878        if has_match {
879            self.rule_matches
880                .push((path.into_static(), path_rules_matches));
881        }
882
883        has_match
884    }
885}
886
887// Calculates the next starting position for a regex match if a the previous match is a false positive
888fn get_next_regex_start(content: &str, regex_match: &Match) -> Option<usize> {
889    // The next valid UTF8 char after the start of the regex match is used
890    if let Some((i, _)) = content[regex_match.start()..].char_indices().nth(1) {
891        Some(regex_match.start() + i)
892    } else {
893        // There are no more chars left in the string to scan
894        None
895    }
896}
897
898fn is_false_positive_match(
899    regex_match: &Match,
900    rule: &RegexCompiledRule,
901    content: &str,
902    check_excluded_keywords: bool,
903) -> bool {
904    if check_excluded_keywords {
905        if let Some(excluded_keywords) = &rule.excluded_keywords {
906            if excluded_keywords.is_false_positive_match(content, regex_match.start()) {
907                return true;
908            }
909        }
910    }
911
912    if let Some(validator) = rule.validator.as_ref() {
913        if !validator.is_valid_match(&content[regex_match.range()]) {
914            return true;
915        };
916    }
917    false
918}