dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3
4use crate::match_validation::{
5    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
6    match_validator::MatchValidator,
7};
8use rayon::prelude::*;
9
10use error::{MatchValidationError, MatchValidatorCreationError};
11
12use crate::observability::labels::Labels;
13use crate::rule_match::{InternalRuleMatch, RuleMatch};
14use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
15pub use crate::secondary_validation::Validator;
16use crate::{CreateScannerError, EncodeIndices, MatchAction, Path};
17use std::ops::Deref;
18use std::sync::Arc;
19
20use self::metrics::ScannerMetrics;
21use crate::scanner::config::RuleConfig;
22use crate::scanner::regex_rule::compiled::RegexCompiledRule;
23use crate::scanner::regex_rule::{access_regex_caches, RegexCaches};
24use crate::scanner::scope::Scope;
25pub use crate::scanner::shared_data::SharedData;
26use crate::stats::GLOBAL_STATS;
27use ahash::{AHashMap, AHashSet};
28use regex_automata::Match;
29use serde::{Deserialize, Serialize};
30use serde_with::serde_as;
31
32pub mod config;
33pub mod error;
34pub mod metrics;
35pub mod regex_rule;
36pub mod scope;
37pub mod shared_data;
38pub mod shared_pool;
39
40#[cfg(test)]
41mod test;
42
/// Position of a single match inside the scanned string.
///
/// Rules report matches to a [`MatchEmitter`] as a `StringMatch`; the scanner then uses
/// `start` / `end` as UTF-8 byte offsets into the content (`content[start..end]`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset where the match begins.
    pub start: usize,
    /// Offset where the match ends (used as the exclusive end of the slice).
    pub end: usize,
}
47
48pub trait MatchEmitter<T = ()> {
49    fn emit(&mut self, string_match: StringMatch) -> T;
50}
51
52// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
53// struct that implements MatchEmitter)
54impl<F, T> MatchEmitter<T> for F
55where
56    F: FnMut(StringMatch) -> T,
57{
58    fn emit(&mut self, string_match: StringMatch) -> T {
59        // This just calls the closure (itself)
60        (self)(string_match)
61    }
62}
63
/// Settings common to every rule, wrapped around the rule-specific configuration `inner`.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Action attached to the rule's matches.
    pub match_action: MatchAction,
    /// Scope of the rule; `Scope::default()` is used when the field is absent from the
    /// serialized form.
    #[serde(default)]
    pub scope: Scope,
    // Legacy name of `third_party_active_checker`. It is only read as a fallback when
    // `third_party_active_checker` is `None` (see `get_third_party_active_checker`).
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    // Optional match validation configuration; takes precedence over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    /// Rule-specific configuration; its fields are flattened into this struct when
    /// (de)serialized.
    #[serde(flatten)]
    pub inner: T,
}
76
77impl<T> RootRuleConfig<T>
78where
79    T: RuleConfig + 'static,
80{
81    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
82        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
83    }
84
85    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
86        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
87    }
88}
89
90impl<T> RootRuleConfig<T> {
91    pub fn new(inner: T) -> Self {
92        #[allow(deprecated)]
93        Self {
94            match_action: MatchAction::None,
95            scope: Scope::all(),
96            match_validation_type: None,
97            third_party_active_checker: None,
98            inner,
99        }
100    }
101
102    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
103        #[allow(deprecated)]
104        RootRuleConfig {
105            match_action: self.match_action,
106            scope: self.scope,
107            match_validation_type: self.match_validation_type,
108            third_party_active_checker: self.third_party_active_checker,
109            inner: func(self.inner),
110        }
111    }
112
113    pub fn match_action(mut self, action: MatchAction) -> Self {
114        self.match_action = action;
115        self
116    }
117
118    pub fn scope(mut self, scope: Scope) -> Self {
119        self.scope = scope;
120        self
121    }
122
123    pub fn third_party_active_checker(
124        mut self,
125        match_validation_type: MatchValidationType,
126    ) -> Self {
127        self.third_party_active_checker = Some(match_validation_type);
128        self
129    }
130
131    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
132        #[allow(deprecated)]
133        self.third_party_active_checker
134            .as_ref()
135            .or(self.match_validation_type.as_ref())
136    }
137}
138
// Dereferencing a `RootRuleConfig<T>` yields the wrapped rule-specific configuration, so the
// methods of `T` can be called directly on the root configuration.
impl<T> Deref for RootRuleConfig<T> {
    type Target = T;

    /// Returns a reference to the wrapped `inner` configuration.
    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}
/// A compiled rule together with the root-level settings copied from its `RootRuleConfig`
/// when the scanner is built.
pub struct RootCompiledRule {
    /// The rule-specific compiled implementation.
    pub inner: Box<dyn CompiledRule>,
    /// Scope copied from the rule configuration.
    pub scope: Scope,
    /// Action applied to matches of this rule.
    pub match_action: MatchAction,
    /// Match validation configured for this rule, if any.
    pub match_validation_type: Option<MatchValidationType>,
}
152
153impl RootCompiledRule {
154    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
155        self.match_validation_type
156            .as_ref()
157            .map(|x| x.get_internal_match_validation_type())
158    }
159}
160
161impl Deref for RootCompiledRule {
162    type Target = dyn CompiledRule;
163
164    fn deref(&self) -> &Self::Target {
165        self.inner.as_ref()
166    }
167}
168
169// This is the public trait that is used to define the behavior of a compiled rule.
170pub trait CompiledRule: Send + Sync {
171    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
172        // by default, no per-scanner data is initialized
173    }
174
175    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
176        // by default, no per-string data is initialized
177    }
178
179    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
180        // by default, no per-event data is initialized
181    }
182
183    #[allow(clippy::too_many_arguments)]
184    fn get_string_matches(
185        &self,
186        content: &str,
187        path: &Path,
188        regex_caches: &mut RegexCaches,
189        per_string_data: &mut SharedData,
190        per_scanner_data: &SharedData,
191        per_event_data: &mut SharedData,
192        exclusion_check: &ExclusionCheck<'_>,
193        excluded_matches: &mut AHashSet<String>,
194        match_emitter: &mut dyn MatchEmitter,
195        wildcard_indices: Option<&Vec<(usize, usize)>>,
196    );
197
198    /// Determines if this rule has a match, without determining the exact position,
199    /// or finding multiple matches. The default implementation just calls
200    /// `get_string_matches`, but this can be overridden with a more efficient
201    /// implementation if applicable
202    #[allow(clippy::too_many_arguments)]
203    fn has_string_match(
204        &self,
205        content: &str,
206        path: &Path,
207        regex_caches: &mut RegexCaches,
208        per_string_data: &mut SharedData,
209        per_scanner_data: &SharedData,
210        per_event_data: &mut SharedData,
211        exclusion_check: &ExclusionCheck<'_>,
212        excluded_matches: &mut AHashSet<String>,
213        wildcard_indices: Option<&Vec<(usize, usize)>>,
214    ) -> bool {
215        let mut found_match = false;
216        let mut match_emitter = |_| found_match = true;
217        self.get_string_matches(
218            content,
219            path,
220            regex_caches,
221            per_string_data,
222            per_scanner_data,
223            per_event_data,
224            exclusion_check,
225            excluded_matches,
226            &mut match_emitter,
227            wildcard_indices,
228        );
229        found_match
230    }
231
232    // Whether a match from this rule should be excluded (marked as a false-positive)
233    // if the content of this match was found in a match from an excluded scope
234    fn should_exclude_multipass_v0(&self) -> bool {
235        // default is to NOT use Multi-pass V0
236        false
237    }
238
239    fn on_excluded_match_multipass_v0(&self) {
240        // default is to do nothing
241    }
242}
243
244impl<T> RuleConfig for Box<T>
245where
246    T: RuleConfig + ?Sized,
247{
248    fn convert_to_compiled_rule(
249        &self,
250        rule_index: usize,
251        labels: Labels,
252    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
253        self.as_ref().convert_to_compiled_rule(rule_index, labels)
254    }
255}
256
// Feature switches of a scanner, configured through `ScannerBuilder`.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    // Forwarded to `ScopedRuleSet::with_implicit_index_wildcards` when the scanner is built.
    pub add_implicit_index_wildcards: bool,
    // When true, matches of rules that opt in (`should_exclude_multipass_v0`) are dropped if
    // their content equals a match found in an excluded scope ("Multi-pass V0").
    pub multipass_v0_enabled: bool,
    // When true, the matched text is copied into `RuleMatch::match_value`. Also required by
    // `Scanner::validate_matches`, which returns an error otherwise.
    pub return_matches: bool,
}

impl Default for ScannerFeatures {
    // Everything is disabled by default except Multi-pass V0.
    fn default() -> Self {
        Self {
            add_implicit_index_wildcards: false,
            multipass_v0_enabled: true,
            return_matches: false,
        }
    }
}
273
274pub struct ScanOptions {
275    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
276    // this list shall be small (<10), so a linear search is acceptable otherwise performance will be impacted.
277    pub blocked_rules_idx: Vec<usize>,
278    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
279    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
280}
281
282impl Default for ScanOptions {
283    fn default() -> Self {
284        Self {
285            blocked_rules_idx: vec![],
286            wildcarded_indices: AHashMap::new(),
287        }
288    }
289}
290
291pub struct ScanOptionBuilder {
292    blocked_rules_idx: Vec<usize>,
293    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
294}
295
296impl ScanOptionBuilder {
297    pub fn new() -> Self {
298        Self {
299            blocked_rules_idx: vec![],
300            wildcarded_indices: AHashMap::new(),
301        }
302    }
303
304    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
305        self.blocked_rules_idx = blocked_rules_idx;
306        self
307    }
308
309    pub fn with_wildcarded_indices(
310        mut self,
311        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
312    ) -> Self {
313        self.wildcarded_indices = wildcarded_indices;
314        self
315    }
316
317    pub fn build(self) -> ScanOptions {
318        ScanOptions {
319            blocked_rules_idx: self.blocked_rules_idx,
320            wildcarded_indices: self.wildcarded_indices,
321        }
322    }
323}
324
/// A scanner built from a list of rule configurations (see `ScannerBuilder::build`).
pub struct Scanner {
    // Compiled rules; `rule_index` values in matches index into this list.
    rules: Vec<RootCompiledRule>,
    // Built from the scopes of `rules`; drives which rule is applied to which string.
    scoped_ruleset: ScopedRuleSet,
    // Feature switches selected at build time.
    scanner_features: ScannerFeatures,
    // Counters updated by every scan (duration, scanned events, match count).
    metrics: ScannerMetrics,
    // Labels passed to the rules when per-string data is initialized.
    labels: Labels,
    // Validators created at build time, keyed by internal validation type; used by
    // `validate_matches`.
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    // Data initialized once by the rules at build time and passed read-only to every scan.
    per_scanner_data: SharedData,
}
334
335impl Scanner {
336    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder {
337        ScannerBuilder::new(rules)
338    }
339
340    pub fn scan_with_options<E: Event>(
341        &self,
342        event: &mut E,
343        options: ScanOptions,
344    ) -> Vec<RuleMatch> {
345        // All matches, after some (but not all) false-positives have been removed.
346        // This is a vec of vecs, where each inner vec is a set of matches for a single path.
347        let mut rule_matches_list = vec![];
348
349        let mut excluded_matches = AHashSet::new();
350
351        // Measure detection time
352        let start = std::time::Instant::now();
353        access_regex_caches(|regex_caches| {
354            self.scoped_ruleset.visit_string_rule_combinations(
355                event,
356                ScannerContentVisitor {
357                    scanner: self,
358                    regex_caches,
359                    rule_matches: &mut rule_matches_list,
360                    blocked_rules: &options.blocked_rules_idx,
361                    excluded_matches: &mut excluded_matches,
362                    per_event_data: SharedData::new(),
363                    wildcarded_indexes: &options.wildcarded_indices,
364                },
365            );
366        });
367
368        let mut output_rule_matches = vec![];
369
370        for (path, rule_matches) in &mut rule_matches_list {
371            // All rule matches in each inner list are for a single path, so they can be processed independently.
372            event.visit_string_mut(path, |content| {
373                if self.scanner_features.multipass_v0_enabled {
374                    // Now that the `excluded_matches` set is fully populated, filter out any matches
375                    // that are the same as excluded matches (also known as "Multi-pass V0")
376                    rule_matches.retain(|rule_match| {
377                        if self.rules[rule_match.rule_index]
378                            .inner
379                            .should_exclude_multipass_v0()
380                        {
381                            let is_false_positive = excluded_matches
382                                .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
383                            if is_false_positive && self.scanner_features.multipass_v0_enabled {
384                                self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
385                            }
386                            !is_false_positive
387                        } else {
388                            true
389                        }
390                    });
391                }
392
393                self.sort_and_remove_overlapping_rules::<E::Encoding>(rule_matches);
394
395                let will_mutate = rule_matches
396                    .iter()
397                    .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
398
399                self.apply_match_actions(content, path, rule_matches, &mut output_rule_matches);
400
401                will_mutate
402            });
403        }
404        // Record detection time
405        self.metrics
406            .duration_ns
407            .increment(start.elapsed().as_nanos() as u64);
408        // Add number of scanned events
409        self.metrics.num_scanned_events.increment(1);
410        // Add number of matches
411        self.metrics
412            .match_count
413            .increment(output_rule_matches.len() as u64);
414
415        output_rule_matches
416    }
417
    /// Scans the given event with the rules configured in the scanner, using the default
    /// `ScanOptions` (no blocked rules, no wildcarded indices).
    ///
    /// `event` is a mutable reference to the event that should be scanned (it implements the
    /// `Event` trait). The return value is a list of `RuleMatch` objects, which contain
    /// information about the matches that were found.
    pub fn scan<E: Event>(&self, event: &mut E) -> Vec<RuleMatch> {
        self.scan_with_options(event, ScanOptions::default())
    }
424
425    pub fn validate_matches(
426        &self,
427        rule_matches: &mut Vec<RuleMatch>,
428    ) -> Result<(), MatchValidationError> {
429        if !self.scanner_features.return_matches {
430            return Err(MatchValidationError::NoMatchValidationType);
431        }
432        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
433        let mut match_validator_rule_match_per_type = AHashMap::new();
434
435        let mut validated_rule_matches = vec![];
436
437        for mut rule_match in rule_matches.drain(..) {
438            let rule = &self.rules[rule_match.rule_index];
439            if let Some(match_validation_type) = rule.internal_match_validation_type() {
440                match_validator_rule_match_per_type
441                    .entry(match_validation_type)
442                    .or_insert_with(Vec::new)
443                    .push(rule_match)
444            } else {
445                // There is no match validator for this rule, so mark it as not available.
446                rule_match.match_status.merge(MatchStatus::NotAvailable);
447                validated_rule_matches.push(rule_match);
448            }
449        }
450
451        match_validator_rule_match_per_type.par_iter_mut().for_each(
452            |(match_validation_type, matches_per_type)| {
453                let match_validator = self.match_validators_per_type.get(match_validation_type);
454                if let Some(match_validator) = match_validator {
455                    match_validator
456                        .as_ref()
457                        .validate(matches_per_type, &self.rules)
458                }
459            },
460        );
461
462        // Refill the rule_matches with the validated matches
463        for (_, mut matches) in match_validator_rule_match_per_type {
464            validated_rule_matches.append(&mut matches);
465        }
466
467        // Sort rule_matches by start index
468        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
469        *rule_matches = validated_rule_matches;
470        Ok(())
471    }
472
473    /// Apply mutations from actions, and shift indices to match the mutated values.
474    /// This assumes the matches are all from the content given, and are sorted by start index.
475    fn apply_match_actions<E: Encoding>(
476        &self,
477        content: &mut String,
478        path: &Path<'static>,
479        rule_matches: &mut [InternalRuleMatch<E>],
480        output_rule_matches: &mut Vec<RuleMatch>,
481    ) {
482        let mut utf8_byte_delta: isize = 0;
483        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
484
485        for rule_match in rule_matches {
486            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
487                content,
488                path.clone(),
489                rule_match,
490                &mut utf8_byte_delta,
491                &mut custom_index_delta,
492            ));
493        }
494    }
495
496    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
497    fn apply_match_actions_for_string<E: Encoding>(
498        &self,
499        content: &mut String,
500        path: Path<'static>,
501        rule_match: &InternalRuleMatch<E>,
502        // The current difference in length between the original and mutated string
503        utf8_byte_delta: &mut isize,
504
505        // The difference between the custom index on the original string and the mutated string
506        custom_index_delta: &mut <E>::IndexShift,
507    ) -> RuleMatch {
508        let rule = &self.rules[rule_match.rule_index];
509
510        let custom_start =
511            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
512                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
513
514        let mut matched_content_copy = None;
515
516        if self.scanner_features.return_matches {
517            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
518            let mutated_utf8_match_start =
519                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
520            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
521
522            // Matches for mutating rules must have valid indices
523            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
524            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
525
526            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
527            matched_content_copy = Some(matched_content.to_string());
528        }
529
530        if rule.match_action.is_mutating() {
531            let mutated_utf8_match_start =
532                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
533            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
534
535            // Matches for mutating rules must have valid indices
536            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
537            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
538
539            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
540            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
541                let before_replacement = &matched_content[replacement.start..replacement.end];
542
543                // update indices to match the new mutated content
544                <E>::adjust_shift(
545                    custom_index_delta,
546                    before_replacement,
547                    &replacement.replacement,
548                );
549                *utf8_byte_delta +=
550                    replacement.replacement.len() as isize - before_replacement.len() as isize;
551
552                let replacement_start = mutated_utf8_match_start + replacement.start;
553                let replacement_end = mutated_utf8_match_start + replacement.end;
554                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
555            }
556        }
557
558        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
559        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
560            + shift_offset) as usize;
561
562        let rule = &self.rules[rule_match.rule_index];
563
564        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
565            MatchStatus::NotChecked
566        } else {
567            MatchStatus::NotAvailable
568        };
569
570        RuleMatch {
571            rule_index: rule_match.rule_index,
572            path,
573            replacement_type: rule.match_action.replacement_type(),
574            start_index: custom_start,
575            end_index_exclusive: custom_end,
576            shift_offset,
577            match_value: matched_content_copy,
578            match_status,
579        }
580    }
581
582    fn sort_and_remove_overlapping_rules<E: Encoding>(
583        &self,
584        rule_matches: &mut Vec<InternalRuleMatch<E>>,
585    ) {
586        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
587        // Be very careful if this function is modified.
588
589        rule_matches.sort_unstable_by(|a, b| {
590            // Mutating rules are a higher priority (earlier in the list)
591            let ord = self.rules[a.rule_index]
592                .match_action
593                .is_mutating()
594                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
595                .reverse();
596
597            // Earlier start offset
598            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
599
600            // Longer matches
601            let ord = ord.then(a.len().cmp(&b.len()).reverse());
602
603            // Matches from earlier rules
604            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
605
606            // swap the order of everything so matches can be efficiently popped off the back as they are processed
607            ord.reverse()
608        });
609
610        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
611
612        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
613            if self.rules[rule_match.rule_index].match_action.is_mutating() {
614                // Mutating rules are kept only if they don't overlap with a previous rule.
615                if let Some(last) = retained_rules.last() {
616                    if last.utf8_end > rule_match.utf8_start {
617                        continue;
618                    }
619                }
620            } else {
621                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
622                // this needs to check all retained matches (instead of just the last one)
623                for retained_rule in &retained_rules {
624                    if retained_rule.utf8_start < rule_match.utf8_end
625                        && retained_rule.utf8_end > rule_match.utf8_start
626                    {
627                        continue 'rule_matches;
628                    }
629                }
630            };
631            retained_rules.push(rule_match);
632        }
633
634        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
635        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
636
637        *rule_matches = retained_rules;
638    }
639}
640
641impl Drop for Scanner {
642    fn drop(&mut self) {
643        let stats = &*GLOBAL_STATS;
644        stats.scanner_deletions.increment(1);
645        stats.decrement_total_scanners();
646    }
647}
648
/// Builder for a [`Scanner`]; created with `Scanner::builder` or `ScannerBuilder::new`.
#[derive(Default)]
pub struct ScannerBuilder<'a> {
    // Rule configurations the scanner is built from.
    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
    // Labels handed to the compiled rules and to the scanner metrics.
    labels: Labels,
    // Feature switches selected through the `with_*` methods.
    scanner_features: ScannerFeatures,
}
655
656impl ScannerBuilder<'_> {
657    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder {
658        ScannerBuilder {
659            rules,
660            labels: Labels::empty(),
661            scanner_features: ScannerFeatures::default(),
662        }
663    }
664
665    pub fn labels(mut self, labels: Labels) -> Self {
666        self.labels = labels;
667        self
668    }
669
670    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
671        self.scanner_features.add_implicit_index_wildcards = value;
672        self
673    }
674
675    pub fn with_return_matches(mut self, value: bool) -> Self {
676        self.scanner_features.return_matches = value;
677        self
678    }
679
680    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
681    /// Multipass V0 saves matches from excluded scopes, and marks any identical
682    /// matches in included scopes as a false positive.
683    pub fn with_multipass_v0(mut self, value: bool) -> Self {
684        self.scanner_features.multipass_v0_enabled = value;
685        self
686    }
687
688    pub fn build(self) -> Result<Scanner, CreateScannerError> {
689        let mut scanner_features = self.scanner_features.clone();
690        let mut match_validators_per_type = AHashMap::new();
691
692        for rule in self.rules.iter() {
693            if let Some(match_validation_type) = &rule.get_third_party_active_checker() {
694                if match_validation_type.can_create_match_validator() {
695                    let internal_type = match_validation_type.get_internal_match_validation_type();
696                    let match_validator = match_validation_type.into_match_validator();
697                    if let Ok(match_validator) = match_validator {
698                        if !match_validators_per_type.contains_key(&internal_type) {
699                            match_validators_per_type.insert(internal_type, match_validator);
700                            // Let's add return_matches to the scanner features
701                            // TODO Fixme, this implicit behavior could cause issue in case the config is reloaded.
702                            // The scanner features should only be enabled at build time and not based on custom rules.
703                            scanner_features.return_matches = true;
704                        }
705                    } else {
706                        return Err(CreateScannerError::InvalidMatchValidator(
707                            MatchValidatorCreationError::InternalError,
708                        ));
709                    }
710                }
711            }
712        }
713
714        let compiled_rules = self
715            .rules
716            .iter()
717            .enumerate()
718            .map(|(rule_index, config)| {
719                let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
720                config.match_action.validate()?;
721                Ok(RootCompiledRule {
722                    inner,
723                    scope: config.scope.clone(),
724                    match_action: config.match_action.clone(),
725                    match_validation_type: config.get_third_party_active_checker().cloned(),
726                })
727            })
728            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
729
730        let mut per_scanner_data = SharedData::new();
731
732        compiled_rules.iter().for_each(|rule| {
733            rule.init_per_scanner_data(&mut per_scanner_data);
734        });
735
736        let scoped_ruleset = ScopedRuleSet::new(
737            &compiled_rules
738                .iter()
739                .map(|rule| rule.scope.clone())
740                .collect::<Vec<_>>(),
741        )
742        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
743
744        {
745            let stats = &*GLOBAL_STATS;
746            stats.scanner_creations.increment(1);
747            stats.increment_total_scanners();
748        }
749
750        Ok(Scanner {
751            rules: compiled_rules,
752            scoped_ruleset,
753            scanner_features,
754            metrics: ScannerMetrics::new(&self.labels),
755            match_validators_per_type,
756            labels: self.labels,
757            per_scanner_data,
758        })
759    }
760}
761
// Visitor used by `Scanner::scan_with_options` to collect the matches of every visited string.
struct ScannerContentVisitor<'a, E: Encoding> {
    // Scanner whose rules are being applied.
    scanner: &'a Scanner,
    // Regex caches handed to the rules.
    regex_caches: &'a mut RegexCaches,
    // Output: one entry per path that produced at least one match.
    rule_matches: &'a mut Vec<(crate::Path<'static>, Vec<InternalRuleMatch<E>>)>,
    // Rules that shall be skipped for this scan
    // This list shall be small (<10), so a linear search is acceptable
    blocked_rules: &'a Vec<usize>,
    // Set handed to the rules as their `excluded_matches` argument.
    excluded_matches: &'a mut AHashSet<String>,
    // Data shared between rules for the event being scanned.
    per_event_data: SharedData,
    // Per path, the wildcarded (start, end) indices taken from the scan options.
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
773
774impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
775    fn visit_content<'b>(
776        &'b mut self,
777        path: &Path<'a>,
778        content: &str,
779        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
780        exclusion_check: ExclusionCheck<'b>,
781    ) -> bool {
782        // matches for a single path
783        let mut path_rules_matches = vec![];
784
785        // Create a map of per rule type data that can be shared between rules of the same type
786        let mut per_string_data = SharedData::new();
787        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
788
789        rule_visitor.visit_rule_indices(|rule_index| {
790            if self.blocked_rules.contains(&rule_index) {
791                return;
792            }
793            let rule = &self.scanner.rules[rule_index];
794            {
795                // creating the emitter is basically free, it will get mostly optimized away
796                let mut emitter = |rule_match: StringMatch| {
797                    path_rules_matches.push(InternalRuleMatch {
798                        rule_index,
799                        utf8_start: rule_match.start,
800                        utf8_end: rule_match.end,
801                        custom_start: E::zero_index(),
802                        custom_end: E::zero_index(),
803                    });
804                };
805
806                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
807
808                // TODO: move this somewhere higher?
809                rule.init_per_event_data(&mut self.per_event_data);
810
811                rule.get_string_matches(
812                    content,
813                    path,
814                    self.regex_caches,
815                    &mut per_string_data,
816                    &self.scanner.per_scanner_data,
817                    &mut self.per_event_data,
818                    &exclusion_check,
819                    self.excluded_matches,
820                    &mut emitter,
821                    wildcard_indices_per_path,
822                );
823            }
824        });
825
826        // calculate_indices requires that matches are sorted by start index
827        path_rules_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
828
829        E::calculate_indices(
830            content,
831            path_rules_matches
832                .iter_mut()
833                .map(|rule_match: &mut InternalRuleMatch<E>| EncodeIndices {
834                    utf8_start: rule_match.utf8_start,
835                    utf8_end: rule_match.utf8_end,
836                    custom_start: &mut rule_match.custom_start,
837                    custom_end: &mut rule_match.custom_end,
838                }),
839        );
840
841        // If there are any matches, the string will need to be accessed to check for false positives from
842        // excluded matches, any to potentially mutate the string.
843        let has_match = !path_rules_matches.is_empty();
844
845        if has_match {
846            self.rule_matches
847                .push((path.into_static(), path_rules_matches));
848        }
849
850        has_match
851    }
852}
853
854// Calculates the next starting position for a regex match if a the previous match is a false positive
855fn get_next_regex_start(content: &str, regex_match: &Match) -> Option<usize> {
856    // The next valid UTF8 char after the start of the regex match is used
857    if let Some((i, _)) = content[regex_match.start()..].char_indices().nth(1) {
858        Some(regex_match.start() + i)
859    } else {
860        // There are no more chars left in the string to scan
861        None
862    }
863}
864
865fn is_false_positive_match(
866    regex_match: &Match,
867    rule: &RegexCompiledRule,
868    content: &str,
869    check_excluded_keywords: bool,
870) -> bool {
871    if check_excluded_keywords {
872        if let Some(excluded_keywords) = &rule.excluded_keywords {
873            if excluded_keywords.is_false_positive_match(content, regex_match.start()) {
874                return true;
875            }
876        }
877    }
878
879    if let Some(validator) = rule.validator.as_ref() {
880        if !validator.is_valid_match(&content[regex_match.range()]) {
881            return true;
882        };
883    }
884    false
885}