dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3
4use crate::match_validation::{
5    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
6    match_validator::MatchValidator,
7};
8
9use error::{MatchValidationError, MatchValidatorCreationError};
10
11use crate::observability::labels::Labels;
12use crate::rule_match::{InternalRuleMatch, RuleMatch};
13use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
14pub use crate::secondary_validation::Validator;
15use crate::{
16    CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
17};
18use std::ops::Deref;
19use std::sync::Arc;
20
21use self::metrics::ScannerMetrics;
22use crate::match_validation::match_validator::RAYON_THREAD_POOL;
23use crate::scanner::config::RuleConfig;
24use crate::scanner::regex_rule::compiled::RegexCompiledRule;
25use crate::scanner::regex_rule::{access_regex_caches, RegexCaches};
26use crate::scanner::scope::Scope;
27pub use crate::scanner::shared_data::SharedData;
28use crate::stats::GLOBAL_STATS;
29use ahash::{AHashMap, AHashSet};
30use regex_automata::Match;
31use serde::{Deserialize, Serialize};
32use serde_with::serde_as;
33
34pub mod config;
35pub mod error;
36pub mod metrics;
37pub mod regex_rule;
38pub mod scope;
39pub mod shared_data;
40pub mod shared_pool;
41
42#[cfg(test)]
43mod test;
44
/// Location of one match inside a scanned string.
///
/// The scanner stores `start` / `end` as the UTF-8 start and end offsets of the match and
/// slices the content with `start..end`, so `end` is exclusive. A match with `start == end`
/// (an empty match) is rejected by the scanner.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset of the first byte of the match.
    pub start: usize,
    /// Offset one past the last byte of the match.
    pub end: usize,
}
49
/// Sink that receives the matches a rule finds while scanning a string.
///
/// `T` is the value returned for every emitted match; it defaults to `()`.
pub trait MatchEmitter<T = ()> {
    /// Reports a single match.
    fn emit(&mut self, string_match: StringMatch) -> T;
}
53
54// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
55// struct that implements MatchEmitter)
56impl<F, T> MatchEmitter<T> for F
57where
58    F: FnMut(StringMatch) -> T,
59{
60    fn emit(&mut self, string_match: StringMatch) -> T {
61        // This just calls the closure (itself)
62        (self)(string_match)
63    }
64}
65
/// Settings shared by every rule type, wrapped around the rule-specific configuration `T`.
///
/// `inner` is flattened during (de)serialization, so its fields sit at the same level as the
/// fields declared here.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Action applied to the matches of this rule.
    pub match_action: MatchAction,
    /// Scope of the rule; uses `Scope`'s `Default` value when missing from the input.
    #[serde(default)]
    pub scope: Scope,
    /// Legacy spelling of `third_party_active_checker`; only consulted when that field is unset.
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    /// Match validation settings for this rule; takes precedence over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    /// Rule-specific configuration.
    #[serde(flatten)]
    pub inner: T,
}
78
79impl<T> RootRuleConfig<T>
80where
81    T: RuleConfig + 'static,
82{
83    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
84        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
85    }
86
87    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
88        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
89    }
90}
91
92impl<T> RootRuleConfig<T> {
93    pub fn new(inner: T) -> Self {
94        #[allow(deprecated)]
95        Self {
96            match_action: MatchAction::None,
97            scope: Scope::all(),
98            match_validation_type: None,
99            third_party_active_checker: None,
100            inner,
101        }
102    }
103
104    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
105        #[allow(deprecated)]
106        RootRuleConfig {
107            match_action: self.match_action,
108            scope: self.scope,
109            match_validation_type: self.match_validation_type,
110            third_party_active_checker: self.third_party_active_checker,
111            inner: func(self.inner),
112        }
113    }
114
115    pub fn match_action(mut self, action: MatchAction) -> Self {
116        self.match_action = action;
117        self
118    }
119
120    pub fn scope(mut self, scope: Scope) -> Self {
121        self.scope = scope;
122        self
123    }
124
125    pub fn third_party_active_checker(
126        mut self,
127        match_validation_type: MatchValidationType,
128    ) -> Self {
129        self.third_party_active_checker = Some(match_validation_type);
130        self
131    }
132
133    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
134        #[allow(deprecated)]
135        self.third_party_active_checker
136            .as_ref()
137            .or(self.match_validation_type.as_ref())
138    }
139}
140
141impl<T> Deref for RootRuleConfig<T> {
142    type Target = T;
143
144    fn deref(&self) -> &Self::Target {
145        &self.inner
146    }
147}
/// A compiled rule together with the root-level settings copied from its `RootRuleConfig`.
pub struct RootCompiledRule {
    /// The rule-specific compiled implementation.
    pub inner: Box<dyn CompiledRule>,
    /// Scope the rule applies to.
    pub scope: Scope,
    /// Action applied to the matches of this rule.
    pub match_action: MatchAction,
    /// Match validation settings, if the rule has any.
    pub match_validation_type: Option<MatchValidationType>,
}
154
155impl RootCompiledRule {
156    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
157        self.match_validation_type
158            .as_ref()
159            .map(|x| x.get_internal_match_validation_type())
160    }
161}
162
163impl Deref for RootCompiledRule {
164    type Target = dyn CompiledRule;
165
166    fn deref(&self) -> &Self::Target {
167        self.inner.as_ref()
168    }
169}
170
/// Context handed to a [`CompiledRule`] while it looks for matches in one string.
pub struct StringMatchesCtx<'a> {
    pub regex_caches: &'a mut RegexCaches,
    pub exclusion_check: &'a ExclusionCheck<'a>,
    // NOTE(review): presumably the text of matches found in excluded scopes; the scanner later
    // drops matches whose content is in this set (Multi-pass V0) -- confirm against the rule impls
    pub excluded_matches: &'a mut AHashSet<String>,
    /// Sink the rule calls for every match it finds.
    pub match_emitter: &'a mut dyn MatchEmitter,
    // NOTE(review): presumably the `(start, end)` wildcard index pairs supplied for the current
    // path through `ScanOptions::wildcarded_indices` -- confirm
    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,

    // Shared Data
    pub per_string_data: &'a mut SharedData,
    pub per_scanner_data: &'a SharedData,
    pub per_event_data: &'a mut SharedData,
}
183
184// This is the public trait that is used to define the behavior of a compiled rule.
185pub trait CompiledRule: Send + Sync {
186    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
187        // by default, no per-scanner data is initialized
188    }
189
190    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
191        // by default, no per-string data is initialized
192    }
193
194    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
195        // by default, no per-event data is initialized
196    }
197
198    fn get_string_matches(
199        &self,
200        content: &str,
201        path: &Path,
202        ctx: &mut StringMatchesCtx<'_>,
203    ) -> Result<(), ScannerError>;
204
205    /// Determines if this rule has a match, without determining the exact position,
206    /// or finding multiple matches. The default implementation just calls
207    /// `get_string_matches`, but this can be overridden with a more efficient
208    /// implementation if applicable
209    #[allow(clippy::too_many_arguments)]
210    fn has_string_match(
211        &self,
212        content: &str,
213        path: &Path,
214        ctx: &mut StringMatchesCtx<'_>,
215    ) -> Result<bool, ScannerError> {
216        let mut found_match = false;
217
218        let mut match_emitter = |_| found_match = true;
219
220        let mut new_ctx = StringMatchesCtx {
221            match_emitter: &mut match_emitter,
222            regex_caches: ctx.regex_caches,
223            exclusion_check: ctx.exclusion_check,
224            excluded_matches: ctx.excluded_matches,
225            wildcard_indices: ctx.wildcard_indices,
226            per_string_data: ctx.per_string_data,
227            per_scanner_data: ctx.per_scanner_data,
228            per_event_data: ctx.per_event_data,
229        };
230
231        self.get_string_matches(content, path, &mut new_ctx)
232            .map(|_| found_match)
233    }
234
235    // Whether a match from this rule should be excluded (marked as a false-positive)
236    // if the content of this match was found in a match from an excluded scope
237    fn should_exclude_multipass_v0(&self) -> bool {
238        // default is to NOT use Multi-pass V0
239        false
240    }
241
242    fn on_excluded_match_multipass_v0(&self) {
243        // default is to do nothing
244    }
245}
246
247impl<T> RuleConfig for Box<T>
248where
249    T: RuleConfig + ?Sized,
250{
251    fn convert_to_compiled_rule(
252        &self,
253        rule_index: usize,
254        labels: Labels,
255    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
256        self.as_ref().convert_to_compiled_rule(rule_index, labels)
257    }
258}
259
/// Feature switches of a scanner, set through the scanner builder.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    // Forwarded to `ScopedRuleSet::with_implicit_index_wildcards` when the scanner is built
    pub add_implicit_index_wildcards: bool,
    // Enables the Multi-pass V0 filtering of matches that equal an excluded match
    pub multipass_v0_enabled: bool,
    // When set, each returned match carries a copy of the matched text
    pub return_matches: bool,
    // This is a temporary flag to disable failed rules (instead of fail the entire scanner)
    // for regex rules that match an empty string
    pub skip_rules_with_regex_matching_empty_string: bool,
}
269
270impl Default for ScannerFeatures {
271    fn default() -> Self {
272        Self {
273            add_implicit_index_wildcards: false,
274            multipass_v0_enabled: true,
275            return_matches: false,
276            skip_rules_with_regex_matching_empty_string: false,
277        }
278    }
279}
280
/// Per-scan options accepted by `Scanner::scan_with_options`.
pub struct ScanOptions {
    /// Indices of the rules that should be skipped for this scan.
    /// The list is searched linearly, so it should stay small (<10); a longer list will
    /// impact performance.
    pub blocked_rules_idx: Vec<usize>,
    /// For each path, the list of `(start, end)` index tuples that should be treated as
    /// wildcards (for the message key only).
    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
288
289impl Default for ScanOptions {
290    fn default() -> Self {
291        Self {
292            blocked_rules_idx: vec![],
293            wildcarded_indices: AHashMap::new(),
294        }
295    }
296}
297
/// Builder for [`ScanOptions`]; its fields mirror the fields of the options it produces.
pub struct ScanOptionBuilder {
    // Becomes `ScanOptions::blocked_rules_idx`
    blocked_rules_idx: Vec<usize>,
    // Becomes `ScanOptions::wildcarded_indices`
    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
302
303impl ScanOptionBuilder {
304    pub fn new() -> Self {
305        Self {
306            blocked_rules_idx: vec![],
307            wildcarded_indices: AHashMap::new(),
308        }
309    }
310
311    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
312        self.blocked_rules_idx = blocked_rules_idx;
313        self
314    }
315
316    pub fn with_wildcarded_indices(
317        mut self,
318        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
319    ) -> Self {
320        self.wildcarded_indices = wildcarded_indices;
321        self
322    }
323
324    pub fn build(self) -> ScanOptions {
325        ScanOptions {
326            blocked_rules_idx: self.blocked_rules_idx,
327            wildcarded_indices: self.wildcarded_indices,
328        }
329    }
330}
331
/// A compiled set of rules that can scan events.
pub struct Scanner {
    // Compiled rules; the `rule_index` of a match indexes into this list
    rules: Vec<RootCompiledRule>,
    // Built from the scopes of `rules`; drives which rules visit which strings of an event
    scoped_ruleset: ScopedRuleSet,
    scanner_features: ScannerFeatures,
    // Scan duration, number of scanned events and number of matches
    metrics: ScannerMetrics,
    labels: Labels,
    // At most one validator per internal validation type, created when the scanner is built
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    // Filled once at build time by `CompiledRule::init_per_scanner_data`
    per_scanner_data: SharedData,
}
341
342impl Scanner {
343    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder {
344        ScannerBuilder::new(rules)
345    }
346
347    fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: std::time::Instant) {
348        // Record detection time
349        self.metrics
350            .duration_ns
351            .increment(start.elapsed().as_nanos() as u64);
352        // Add number of scanned events
353        self.metrics.num_scanned_events.increment(1);
354        // Add number of matches
355        self.metrics
356            .match_count
357            .increment(output_rule_matches.len() as u64);
358    }
359
360    pub fn scan_with_options<E: Event>(
361        &self,
362        event: &mut E,
363        options: ScanOptions,
364    ) -> Result<Vec<RuleMatch>, ScannerError> {
365        // All matches, after some (but not all) false-positives have been removed.
366        // This is a vec of vecs, where each inner vec is a set of matches for a single path.
367        let mut rule_matches_list = vec![];
368
369        let mut excluded_matches = AHashSet::new();
370
371        // Measure detection time
372        let start = std::time::Instant::now();
373        let result = access_regex_caches(|regex_caches| {
374            self.scoped_ruleset.visit_string_rule_combinations(
375                event,
376                ScannerContentVisitor {
377                    scanner: self,
378                    regex_caches,
379                    rule_matches: &mut rule_matches_list,
380                    blocked_rules: &options.blocked_rules_idx,
381                    excluded_matches: &mut excluded_matches,
382                    per_event_data: SharedData::new(),
383                    wildcarded_indexes: &options.wildcarded_indices,
384                },
385            )
386        });
387
388        // If we were not able to scan, no need to go any further.
389        // Don't forget to record the metrics though!
390        if let Err(e) = result {
391            self.record_metrics(&[], start);
392            return Err(e);
393        }
394
395        let mut output_rule_matches = vec![];
396
397        for (path, rule_matches) in &mut rule_matches_list {
398            // All rule matches in each inner list are for a single path, so they can be processed independently.
399            event.visit_string_mut(path, |content| {
400                if self.scanner_features.multipass_v0_enabled {
401                    // Now that the `excluded_matches` set is fully populated, filter out any matches
402                    // that are the same as excluded matches (also known as "Multi-pass V0")
403                    rule_matches.retain(|rule_match| {
404                        if self.rules[rule_match.rule_index]
405                            .inner
406                            .should_exclude_multipass_v0()
407                        {
408                            let is_false_positive = excluded_matches
409                                .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
410                            if is_false_positive && self.scanner_features.multipass_v0_enabled {
411                                self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
412                            }
413                            !is_false_positive
414                        } else {
415                            true
416                        }
417                    });
418                }
419
420                self.sort_and_remove_overlapping_rules::<E::Encoding>(rule_matches);
421
422                let will_mutate = rule_matches
423                    .iter()
424                    .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
425
426                self.apply_match_actions(content, path, rule_matches, &mut output_rule_matches);
427
428                will_mutate
429            });
430        }
431
432        self.record_metrics(&output_rule_matches, start);
433
434        Ok(output_rule_matches)
435    }
436
437    // This function scans the given event with the rules configured in the scanner.
438    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
439    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
440    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
441        self.scan_with_options(event, ScanOptions::default())
442    }
443
444    pub fn validate_matches(
445        &self,
446        rule_matches: &mut Vec<RuleMatch>,
447    ) -> Result<(), MatchValidationError> {
448        if !self.scanner_features.return_matches {
449            return Err(MatchValidationError::NoMatchValidationType);
450        }
451        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
452        let mut match_validator_rule_match_per_type = AHashMap::new();
453
454        let mut validated_rule_matches = vec![];
455
456        for mut rule_match in rule_matches.drain(..) {
457            let rule = &self.rules[rule_match.rule_index];
458            if let Some(match_validation_type) = rule.internal_match_validation_type() {
459                match_validator_rule_match_per_type
460                    .entry(match_validation_type)
461                    .or_insert_with(Vec::new)
462                    .push(rule_match)
463            } else {
464                // There is no match validator for this rule, so mark it as not available.
465                rule_match.match_status.merge(MatchStatus::NotAvailable);
466                validated_rule_matches.push(rule_match);
467            }
468        }
469
470        RAYON_THREAD_POOL.install(|| {
471            use rayon::prelude::*;
472
473            match_validator_rule_match_per_type.par_iter_mut().for_each(
474                |(match_validation_type, matches_per_type)| {
475                    let match_validator = self.match_validators_per_type.get(match_validation_type);
476                    if let Some(match_validator) = match_validator {
477                        match_validator
478                            .as_ref()
479                            .validate(matches_per_type, &self.rules)
480                    }
481                },
482            );
483        });
484
485        // Refill the rule_matches with the validated matches
486        for (_, mut matches) in match_validator_rule_match_per_type {
487            validated_rule_matches.append(&mut matches);
488        }
489
490        // Sort rule_matches by start index
491        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
492        *rule_matches = validated_rule_matches;
493        Ok(())
494    }
495
496    /// Apply mutations from actions, and shift indices to match the mutated values.
497    /// This assumes the matches are all from the content given, and are sorted by start index.
498    fn apply_match_actions<E: Encoding>(
499        &self,
500        content: &mut String,
501        path: &Path<'static>,
502        rule_matches: &mut [InternalRuleMatch<E>],
503        output_rule_matches: &mut Vec<RuleMatch>,
504    ) {
505        let mut utf8_byte_delta: isize = 0;
506        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
507
508        for rule_match in rule_matches {
509            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
510                content,
511                path.clone(),
512                rule_match,
513                &mut utf8_byte_delta,
514                &mut custom_index_delta,
515            ));
516        }
517    }
518
519    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
520    fn apply_match_actions_for_string<E: Encoding>(
521        &self,
522        content: &mut String,
523        path: Path<'static>,
524        rule_match: &InternalRuleMatch<E>,
525        // The current difference in length between the original and mutated string
526        utf8_byte_delta: &mut isize,
527
528        // The difference between the custom index on the original string and the mutated string
529        custom_index_delta: &mut <E>::IndexShift,
530    ) -> RuleMatch {
531        let rule = &self.rules[rule_match.rule_index];
532
533        let custom_start =
534            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
535                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
536
537        let mut matched_content_copy = None;
538
539        if self.scanner_features.return_matches {
540            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
541            let mutated_utf8_match_start =
542                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
543            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
544
545            // Matches for mutating rules must have valid indices
546            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
547            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
548
549            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
550            matched_content_copy = Some(matched_content.to_string());
551        }
552
553        if rule.match_action.is_mutating() {
554            let mutated_utf8_match_start =
555                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
556            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
557
558            // Matches for mutating rules must have valid indices
559            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
560            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
561
562            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
563            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
564                let before_replacement = &matched_content[replacement.start..replacement.end];
565
566                // update indices to match the new mutated content
567                <E>::adjust_shift(
568                    custom_index_delta,
569                    before_replacement,
570                    &replacement.replacement,
571                );
572                *utf8_byte_delta +=
573                    replacement.replacement.len() as isize - before_replacement.len() as isize;
574
575                let replacement_start = mutated_utf8_match_start + replacement.start;
576                let replacement_end = mutated_utf8_match_start + replacement.end;
577                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
578            }
579        }
580
581        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
582        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
583            + shift_offset) as usize;
584
585        let rule = &self.rules[rule_match.rule_index];
586
587        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
588            MatchStatus::NotChecked
589        } else {
590            MatchStatus::NotAvailable
591        };
592
593        RuleMatch {
594            rule_index: rule_match.rule_index,
595            path,
596            replacement_type: rule.match_action.replacement_type(),
597            start_index: custom_start,
598            end_index_exclusive: custom_end,
599            shift_offset,
600            match_value: matched_content_copy,
601            match_status,
602        }
603    }
604
605    fn sort_and_remove_overlapping_rules<E: Encoding>(
606        &self,
607        rule_matches: &mut Vec<InternalRuleMatch<E>>,
608    ) {
609        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
610        // Be very careful if this function is modified.
611
612        rule_matches.sort_unstable_by(|a, b| {
613            // Mutating rules are a higher priority (earlier in the list)
614            let ord = self.rules[a.rule_index]
615                .match_action
616                .is_mutating()
617                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
618                .reverse();
619
620            // Earlier start offset
621            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
622
623            // Longer matches
624            let ord = ord.then(a.len().cmp(&b.len()).reverse());
625
626            // Matches from earlier rules
627            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
628
629            // swap the order of everything so matches can be efficiently popped off the back as they are processed
630            ord.reverse()
631        });
632
633        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
634
635        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
636            if self.rules[rule_match.rule_index].match_action.is_mutating() {
637                // Mutating rules are kept only if they don't overlap with a previous rule.
638                if let Some(last) = retained_rules.last() {
639                    if last.utf8_end > rule_match.utf8_start {
640                        continue;
641                    }
642                }
643            } else {
644                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
645                // this needs to check all retained matches (instead of just the last one)
646                for retained_rule in &retained_rules {
647                    if retained_rule.utf8_start < rule_match.utf8_end
648                        && retained_rule.utf8_end > rule_match.utf8_start
649                    {
650                        continue 'rule_matches;
651                    }
652                }
653            };
654            retained_rules.push(rule_match);
655        }
656
657        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
658        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
659
660        *rule_matches = retained_rules;
661    }
662}
663
664impl Drop for Scanner {
665    fn drop(&mut self) {
666        let stats = &*GLOBAL_STATS;
667        stats.scanner_deletions.increment(1);
668        stats.decrement_total_scanners();
669    }
670}
671
/// Builder for [`Scanner`]: collects the rule configurations, the labels and the feature
/// switches before the rules are compiled.
#[derive(Default)]
pub struct ScannerBuilder<'a> {
    // Rule configurations to compile, in the order they were given
    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
    // Passed to every rule when it is compiled and used to create the scanner metrics
    labels: Labels,
    scanner_features: ScannerFeatures,
}
678
679impl ScannerBuilder<'_> {
680    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder {
681        ScannerBuilder {
682            rules,
683            labels: Labels::empty(),
684            scanner_features: ScannerFeatures::default(),
685        }
686    }
687
688    pub fn labels(mut self, labels: Labels) -> Self {
689        self.labels = labels;
690        self
691    }
692
693    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
694        self.scanner_features.add_implicit_index_wildcards = value;
695        self
696    }
697
698    pub fn with_return_matches(mut self, value: bool) -> Self {
699        self.scanner_features.return_matches = value;
700        self
701    }
702
703    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
704    /// Multipass V0 saves matches from excluded scopes, and marks any identical
705    /// matches in included scopes as a false positive.
706    pub fn with_multipass_v0(mut self, value: bool) -> Self {
707        self.scanner_features.multipass_v0_enabled = value;
708        self
709    }
710
711    pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
712        self.scanner_features
713            .skip_rules_with_regex_matching_empty_string = value;
714        self
715    }
716
717    pub fn build(self) -> Result<Scanner, CreateScannerError> {
718        let mut match_validators_per_type = AHashMap::new();
719
720        for rule in self.rules.iter() {
721            if let Some(match_validation_type) = &rule.get_third_party_active_checker() {
722                if match_validation_type.can_create_match_validator() {
723                    let internal_type = match_validation_type.get_internal_match_validation_type();
724                    let match_validator = match_validation_type.into_match_validator();
725                    if let Ok(match_validator) = match_validator {
726                        if !match_validators_per_type.contains_key(&internal_type) {
727                            match_validators_per_type.insert(internal_type, match_validator);
728                        }
729                    } else {
730                        return Err(CreateScannerError::InvalidMatchValidator(
731                            MatchValidatorCreationError::InternalError,
732                        ));
733                    }
734                }
735            }
736        }
737
738        let compiled_rules = self
739            .rules
740            .iter()
741            .enumerate()
742            .filter_map(|(rule_index, config)| {
743                let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
744                    Ok(inner) => Ok(inner),
745                    Err(err) => {
746                        if self
747                            .scanner_features
748                            .skip_rules_with_regex_matching_empty_string
749                            && err
750                                == CreateScannerError::InvalidRegex(
751                                    RegexValidationError::MatchesEmptyString,
752                                )
753                        {
754                            // this is a temporary feature to skip rules that should be considered invalid.
755                            #[allow(clippy::print_stdout)]
756                            {
757                                println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
758                            }
759                            return None;
760                        } else {
761                            Err(err)
762                        }
763                    }
764                };
765                Some((config, inner))
766            })
767            .map(|(config, inner)| {
768                config.match_action.validate()?;
769                Ok(RootCompiledRule {
770                    inner: inner?,
771                    scope: config.scope.clone(),
772                    match_action: config.match_action.clone(),
773                    match_validation_type: config.get_third_party_active_checker().cloned(),
774                })
775            })
776            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
777
778        let mut per_scanner_data = SharedData::new();
779
780        compiled_rules.iter().for_each(|rule| {
781            rule.init_per_scanner_data(&mut per_scanner_data);
782        });
783
784        let scoped_ruleset = ScopedRuleSet::new(
785            &compiled_rules
786                .iter()
787                .map(|rule| rule.scope.clone())
788                .collect::<Vec<_>>(),
789        )
790        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
791
792        {
793            let stats = &*GLOBAL_STATS;
794            stats.scanner_creations.increment(1);
795            stats.increment_total_scanners();
796        }
797
798        Ok(Scanner {
799            rules: compiled_rules,
800            scoped_ruleset,
801            scanner_features: self.scanner_features,
802            metrics: ScannerMetrics::new(&self.labels),
803            match_validators_per_type,
804            labels: self.labels,
805            per_scanner_data,
806        })
807    }
808}
809
/// Visitor handed to the scoped ruleset for one scan; it carries the state the scan reads and
/// fills while the strings of an event are visited.
struct ScannerContentVisitor<'a, E: Encoding> {
    scanner: &'a Scanner,
    regex_caches: &'a mut RegexCaches,
    // Output of the visit: one entry per path, with the matches found for that path
    rule_matches: &'a mut Vec<(crate::Path<'static>, Vec<InternalRuleMatch<E>>)>,
    // Rules that shall be skipped for this scan
    // This list shall be small (<10), so a linear search is acceptable
    blocked_rules: &'a Vec<usize>,
    // Consulted after the visit to drop matches equal to an excluded match (Multi-pass V0)
    excluded_matches: &'a mut AHashSet<String>,
    // Created fresh for every scan
    per_event_data: SharedData,
    // Per-path `(start, end)` wildcard index ranges taken from the scan options
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
821
822impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
823    fn visit_content<'b>(
824        &'b mut self,
825        path: &Path<'a>,
826        content: &str,
827        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
828        exclusion_check: ExclusionCheck<'b>,
829    ) -> Result<bool, ScannerError> {
830        // matches for a single path
831        let mut path_rules_matches = vec![];
832
833        // Create a map of per rule type data that can be shared between rules of the same type
834        let mut per_string_data = SharedData::new();
835        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
836
837        rule_visitor.visit_rule_indices(|rule_index| {
838            if self.blocked_rules.contains(&rule_index) {
839                return Ok(());
840            }
841            let rule = &self.scanner.rules[rule_index];
842            {
843                // creating the emitter is basically free, it will get mostly optimized away
844                let mut emitter = |rule_match: StringMatch| {
845                    // This should never happen, but to ensure no empty match is ever generated
846                    // (which may cause an infinite loop), this will panic instead.
847                    assert_ne!(rule_match.start, rule_match.end, "empty match detected");
848
849                    path_rules_matches.push(InternalRuleMatch {
850                        rule_index,
851                        utf8_start: rule_match.start,
852                        utf8_end: rule_match.end,
853                        custom_start: E::zero_index(),
854                        custom_end: E::zero_index(),
855                    });
856                };
857
858                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
859
860                // TODO: move this somewhere higher?
861                rule.init_per_event_data(&mut self.per_event_data);
862
863                let mut ctx = StringMatchesCtx {
864                    regex_caches: self.regex_caches,
865                    exclusion_check: &exclusion_check,
866                    excluded_matches: self.excluded_matches,
867                    match_emitter: &mut emitter,
868                    wildcard_indices: wildcard_indices_per_path,
869                    per_string_data: &mut per_string_data,
870                    per_scanner_data: &self.scanner.per_scanner_data,
871                    per_event_data: &mut self.per_event_data,
872                };
873
874                rule.get_string_matches(content, path, &mut ctx)?;
875            }
876            Ok(())
877        })?;
878
879        // calculate_indices requires that matches are sorted by start index
880        path_rules_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
881
882        E::calculate_indices(
883            content,
884            path_rules_matches
885                .iter_mut()
886                .map(|rule_match: &mut InternalRuleMatch<E>| EncodeIndices {
887                    utf8_start: rule_match.utf8_start,
888                    utf8_end: rule_match.utf8_end,
889                    custom_start: &mut rule_match.custom_start,
890                    custom_end: &mut rule_match.custom_end,
891                }),
892        );
893
894        // If there are any matches, the string will need to be accessed to check for false positives from
895        // excluded matches, any to potentially mutate the string.
896        let has_match = !path_rules_matches.is_empty();
897
898        if has_match {
899            self.rule_matches
900                .push((path.into_static(), path_rules_matches));
901        }
902
903        Ok(has_match)
904    }
905}
906
907// Calculates the next starting position for a regex match if a the previous match is a false positive
908fn get_next_regex_start(content: &str, regex_match: &Match) -> Option<usize> {
909    // The next valid UTF8 char after the start of the regex match is used
910    if let Some((i, _)) = content[regex_match.start()..].char_indices().nth(1) {
911        Some(regex_match.start() + i)
912    } else {
913        // There are no more chars left in the string to scan
914        None
915    }
916}
917
918fn is_false_positive_match(
919    regex_match: &Match,
920    rule: &RegexCompiledRule,
921    content: &str,
922    check_excluded_keywords: bool,
923) -> bool {
924    if check_excluded_keywords {
925        if let Some(excluded_keywords) = &rule.excluded_keywords {
926            if excluded_keywords.is_false_positive_match(content, regex_match.start()) {
927                return true;
928            }
929        }
930    }
931
932    if let Some(validator) = rule.validator.as_ref() {
933        if !validator.is_valid_match(&content[regex_match.range()]) {
934            return true;
935        };
936    }
937    false
938}