dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{
28    CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
29};
30use ahash::{AHashMap, AHashSet};
31use futures::executor::block_on;
32use serde::{Deserialize, Serialize};
33use serde_with::serde_as;
34use std::ops::Deref;
35use std::pin::Pin;
36use std::sync::Arc;
37use std::time::{Duration, Instant};
38use tokio::task::JoinHandle;
39use tokio::time::timeout;
40
41pub mod config;
42pub mod error;
43pub mod metrics;
44pub mod regex_rule;
45pub mod scope;
46pub mod shared_data;
47pub mod shared_pool;
48pub mod suppression;
49
50mod internal_rule_match_set;
51#[cfg(test)]
52mod test;
53
/// The location of a single match inside a string, given as a `start` / `end` pair.
// NOTE(review): presumably UTF-8 byte offsets with an exclusive `end` — confirm against callers.
#[derive(Copy, Clone)]
pub struct StringMatch {
    /// Position where the match begins.
    pub start: usize,
    /// Position where the match ends.
    pub end: usize,
}
59
/// Receiver for matches found while scanning a string.
///
/// `T` is the value handed back to the caller for every emitted match (`()` by default).
pub trait MatchEmitter<T = ()> {
    /// Reports one match and returns the emitter's response of type `T`.
    fn emit(&mut self, string_match: StringMatch) -> T;
}
63
64// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
65// struct that implements MatchEmitter)
66impl<F, T> MatchEmitter<T> for F
67where
68    F: FnMut(StringMatch) -> T,
69{
70    fn emit(&mut self, string_match: StringMatch) -> T {
71        // This just calls the closure (itself)
72        (self)(string_match)
73    }
74}
75
/// Settings shared by every kind of rule, wrapped around the rule-specific configuration
/// stored in `inner`.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Match action configured for this rule.
    pub match_action: MatchAction,
    /// Scope configured for this rule; uses `Scope::default()` when missing from the
    /// serialized form.
    #[serde(default)]
    pub scope: Scope,
    /// Deprecated predecessor of `third_party_active_checker`; only consulted when that
    /// field is `None`.
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    /// Match validation configuration; takes precedence over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    /// Optional suppressions configured for this rule.
    suppressions: Option<Suppressions>,
    /// Rule-specific configuration, flattened into the same serialized object.
    #[serde(flatten)]
    pub inner: T,
}
89
90impl<T> RootRuleConfig<T>
91where
92    T: RuleConfig + 'static,
93{
94    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
95        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
96    }
97
98    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
99        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
100    }
101}
102
103impl<T> RootRuleConfig<T> {
104    pub fn new(inner: T) -> Self {
105        #[allow(deprecated)]
106        Self {
107            match_action: MatchAction::None,
108            scope: Scope::all(),
109            match_validation_type: None,
110            third_party_active_checker: None,
111            suppressions: None,
112            inner,
113        }
114    }
115
116    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
117        #[allow(deprecated)]
118        RootRuleConfig {
119            match_action: self.match_action,
120            scope: self.scope,
121            match_validation_type: self.match_validation_type,
122            third_party_active_checker: self.third_party_active_checker,
123            suppressions: self.suppressions,
124            inner: func(self.inner),
125        }
126    }
127
128    pub fn match_action(mut self, action: MatchAction) -> Self {
129        self.match_action = action;
130        self
131    }
132
133    pub fn scope(mut self, scope: Scope) -> Self {
134        self.scope = scope;
135        self
136    }
137
138    pub fn third_party_active_checker(
139        mut self,
140        match_validation_type: MatchValidationType,
141    ) -> Self {
142        self.third_party_active_checker = Some(match_validation_type);
143        self
144    }
145
146    pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
147        self.suppressions = Some(suppressions);
148        self
149    }
150
151    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
152        #[allow(deprecated)]
153        self.third_party_active_checker
154            .as_ref()
155            .or(self.match_validation_type.as_ref())
156    }
157}
158
159impl<T> Deref for RootRuleConfig<T> {
160    type Target = T;
161
162    fn deref(&self) -> &Self::Target {
163        &self.inner
164    }
165}
/// A compiled rule together with the settings shared by every rule type.
pub struct RootCompiledRule {
    /// The rule-specific compiled implementation.
    pub inner: Box<dyn CompiledRule>,
    /// Scope configured for this rule.
    pub scope: Scope,
    /// Action applied to the matches of this rule.
    pub match_action: MatchAction,
    /// Validation configuration; when `None`, matches of this rule are reported with
    /// `MatchStatus::NotAvailable`.
    pub match_validation_type: Option<MatchValidationType>,
    /// Compiled suppressions; matches they flag are dropped by `Scanner::suppress_matches`.
    pub suppressions: Option<CompiledSuppressions>,
}
173
174impl RootCompiledRule {
175    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
176        self.match_validation_type
177            .as_ref()
178            .map(|x| x.get_internal_match_validation_type())
179    }
180}
181
182impl Deref for RootCompiledRule {
183    type Target = dyn CompiledRule;
184
185    fn deref(&self) -> &Self::Target {
186        self.inner.as_ref()
187    }
188}
189
/// Context handed to `CompiledRule::get_string_matches` for one rule / string combination.
pub struct StringMatchesCtx<'a> {
    // Index of the rule being evaluated; copied into the result of async jobs.
    rule_index: usize,
    /// Regex caches available to the rule.
    pub regex_caches: &'a mut RegexCaches,
    /// Exclusion check for the string being scanned.
    pub exclusion_check: &'a ExclusionCheck<'a>,
    /// Set of excluded match contents; the scanner later drops matches whose content is in
    /// this set for rules that opt into Multi-pass V0.
    pub excluded_matches: &'a mut AHashSet<String>,
    /// Emitter used to report matches found synchronously.
    pub match_emitter: &'a mut dyn MatchEmitter,
    /// `(start, end)` index pairs to treat as wildcards for this path, if any were given in
    /// the scan options.
    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,

    // Shared Data
    // NOTE(review): lifetimes of the three stores are inferred from their names
    // (per string / per scanner / per event) — confirm against the content visitor.
    pub per_string_data: &'a mut SharedData,
    pub per_scanner_data: &'a SharedData,
    pub per_event_data: &'a mut SharedData,
}
203
204impl StringMatchesCtx<'_> {
205    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
206    /// this function can be used to return an "async job" to find matches. The return value
207    /// of `process_async` should be returned from the `get_string_matches` function. The future
208    /// passed into this function will be spawned and executed immediately without blocking
209    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
210    ///
211    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
212    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
213    /// it should be accessed before `process_async` is called.
214    pub fn process_async(
215        &self,
216        func: impl for<'a> FnOnce(
217            &'a mut AsyncStringMatchesCtx,
218        )
219            -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
220        + Send
221        + 'static,
222    ) -> RuleResult {
223        let rule_index = self.rule_index;
224
225        // The future is spawned onto the tokio runtime immediately so it starts running
226        // in the background
227        let fut = TOKIO_RUNTIME.spawn(async move {
228            let mut ctx = AsyncStringMatchesCtx {
229                rule_matches: vec![],
230            };
231            (func)(&mut ctx).await?;
232
233            Ok(AsyncRuleInfo {
234                rule_index,
235                rule_matches: ctx.rule_matches,
236            })
237        });
238
239        Ok(RuleStatus::Pending(fut))
240    }
241}
242
/// Context available to async jobs started with `StringMatchesCtx::process_async`; it only
/// collects the matches emitted by the job.
pub struct AsyncStringMatchesCtx {
    // Matches emitted so far, in emission order.
    rule_matches: Vec<StringMatch>,
}
246
247impl AsyncStringMatchesCtx {
248    pub fn emit_match(&mut self, string_match: StringMatch) {
249        self.rule_matches.push(string_match);
250    }
251}
252
/// Outcome of evaluating a rule against a string.
#[must_use]
pub enum RuleStatus {
    /// The rule finished synchronously.
    Done,
    /// The rule started an async job; its matches are collected once the job is awaited.
    Pending(PendingRuleResult),
}
258
/// Handle to an async rule job spawned on the tokio runtime; awaiting it yields the matches
/// found by the job or the error it failed with.
pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
261
/// An async rule job that is still running, paired with the path its matches belong to.
pub struct PendingRuleJob {
    // Handle of the spawned job.
    fut: PendingRuleResult,
    // Path the job's matches are registered under once the job completes.
    path: Path<'static>,
}
266
/// Result of a completed async rule job.
pub struct AsyncRuleInfo {
    // Index of the rule that produced the matches.
    rule_index: usize,
    // Matches emitted by the job.
    rule_matches: Vec<StringMatch>,
}
271
/// Result of `CompiledRule::get_string_matches`: on success the rule is either done or has a
/// pending async job (see [`RuleStatus`]); on failure a [`ScannerError`] is returned.
pub type RuleResult = Result<RuleStatus, ScannerError>;
274
/// Public trait that defines the behavior of a compiled rule.
///
/// Only `get_string_matches` is required; every other method has a default implementation.
pub trait CompiledRule: Send + Sync {
    /// Hook to initialize data stored in the per-scanner [`SharedData`].
    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
        // by default, no per-scanner data is initialized
    }

    /// Hook to initialize data stored in the per-string [`SharedData`].
    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
        // by default, no per-string data is initialized
    }

    /// Hook to initialize data stored in the per-event [`SharedData`].
    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
        // by default, no per-event data is initialized
    }

    /// Finds the matches of this rule in `content`, located at `path`.
    ///
    /// Returns `RuleStatus::Done` when finished synchronously, or `RuleStatus::Pending` when
    /// an async job was started (see `StringMatchesCtx::process_async`).
    fn get_string_matches(
        &self,
        content: &str,
        path: &Path,
        ctx: &mut StringMatchesCtx<'_>,
    ) -> RuleResult;

    /// Whether a match from this rule should be excluded (marked as a false-positive)
    /// if the content of this match was found in a match from an excluded scope
    fn should_exclude_multipass_v0(&self) -> bool {
        // default is to NOT use Multi-pass V0
        false
    }

    /// Called by the scanner each time a match of this rule is dropped by Multi-pass V0.
    fn on_excluded_match_multipass_v0(&self) {
        // default is to do nothing
    }
}
307
308impl<T> RuleConfig for Box<T>
309where
310    T: RuleConfig + ?Sized,
311{
312    fn convert_to_compiled_rule(
313        &self,
314        rule_index: usize,
315        labels: Labels,
316    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
317        self.as_ref().convert_to_compiled_rule(rule_index, labels)
318    }
319}
320
/// Feature flags of a scanner.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    // NOTE(review): presumably adds implicit index wildcards to scope paths — confirm in the builder.
    pub add_implicit_index_wildcards: bool,
    // When enabled, matches whose content also appears in `excluded_matches` are dropped for
    // rules that opt into Multi-pass V0.
    pub multipass_v0_enabled: bool,
    // When enabled, the matched content is copied into each returned `RuleMatch`.
    pub return_matches: bool,
    // This is a temporary flag to disable failed rules (instead of fail the entire scanner)
    // for regex rules that match an empty string
    pub skip_rules_with_regex_matching_empty_string: bool,
}
330
331impl Default for ScannerFeatures {
332    fn default() -> Self {
333        Self {
334            add_implicit_index_wildcards: false,
335            multipass_v0_enabled: true,
336            return_matches: false,
337            skip_rules_with_regex_matching_empty_string: false,
338        }
339    }
340}
341
342pub struct ScanOptions {
343    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
344    // this list shall be small (<10), so a linear search is acceptable otherwise performance will be impacted.
345    pub blocked_rules_idx: Vec<usize>,
346    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
347    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
348    // Whether to validate matches using third-party validators (e.g., checksum validation for credit cards).
349    // When enabled, the scanner automatically collects match content needed for validation.
350    pub validate_matches: bool,
351}
352
353impl Default for ScanOptions {
354    fn default() -> Self {
355        Self {
356            blocked_rules_idx: vec![],
357            wildcarded_indices: AHashMap::new(),
358            validate_matches: false,
359        }
360    }
361}
362
/// Builder for [`ScanOptions`]; its fields mirror the options one-to-one.
pub struct ScanOptionBuilder {
    // Rule indices to skip during the scan.
    blocked_rules_idx: Vec<usize>,
    // Per-path `(start, end)` index pairs to treat as wildcards.
    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
    // Whether matches should be validated after the scan.
    validate_matches: bool,
}
368
369impl ScanOptionBuilder {
370    pub fn new() -> Self {
371        Self {
372            blocked_rules_idx: vec![],
373            wildcarded_indices: AHashMap::new(),
374            validate_matches: false,
375        }
376    }
377
378    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
379        self.blocked_rules_idx = blocked_rules_idx;
380        self
381    }
382
383    pub fn with_wildcarded_indices(
384        mut self,
385        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
386    ) -> Self {
387        self.wildcarded_indices = wildcarded_indices;
388        self
389    }
390
391    pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
392        self.validate_matches = validate_matches;
393        self
394    }
395
396    pub fn build(self) -> ScanOptions {
397        ScanOptions {
398            blocked_rules_idx: self.blocked_rules_idx,
399            wildcarded_indices: self.wildcarded_indices,
400            validate_matches: self.validate_matches,
401        }
402    }
403}
404
/// Scans events with a fixed set of compiled rules.
pub struct Scanner {
    // Compiled rules; `rule_index` values in matches index into this list.
    rules: Vec<RootCompiledRule>,
    // Drives the visit of every string / rule combination of an event.
    scoped_ruleset: ScopedRuleSet,
    // Feature flags of this scanner.
    scanner_features: ScannerFeatures,
    // Counters updated after every scan (duration, scanned events, matches, suppressed matches).
    metrics: ScannerMetrics,
    // Labels attached to this scanner.
    labels: Labels,
    // Validator to use for each internal match validation type.
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    // Data shared by all scans of this scanner.
    per_scanner_data: SharedData,
    // Maximum duration of an async scan before it fails with a transient error.
    async_scan_timeout: Duration,
}
415
416impl Scanner {
    /// Creates a [`ScannerBuilder`] for the given rule configurations.
    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
        ScannerBuilder::new(rules)
    }
420
421    // This function scans the given event with the rules configured in the scanner.
422    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
423    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
424    // This version uses default scan options (no validation, no blocked rules, no wildcarded indices).
425    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
426        self.scan_with_options(event, ScanOptions::default())
427    }
428
429    // This function scans the given event with the rules configured in the scanner.
430    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
431    // The options parameter allows customizing the scan behavior (validation, blocked rules, etc.).
432    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
433    pub fn scan_with_options<E: Event>(
434        &self,
435        event: &mut E,
436        options: ScanOptions,
437    ) -> Result<Vec<RuleMatch>, ScannerError> {
438        block_on(self.internal_scan_with_metrics(event, options))
439    }
440
441    // This function scans the given event with the rules configured in the scanner.
442    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
443    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
444    pub async fn scan_async<E: Event>(
445        &self,
446        event: &mut E,
447    ) -> Result<Vec<RuleMatch>, ScannerError> {
448        self.scan_async_with_options(event, ScanOptions::default())
449            .await
450    }
451
452    pub async fn scan_async_with_options<E: Event>(
453        &self,
454        event: &mut E,
455        options: ScanOptions,
456    ) -> Result<Vec<RuleMatch>, ScannerError> {
457        let fut = self.internal_scan_with_metrics(event, options);
458
459        // The sleep from the timeout requires being in a tokio context
460        // The guard needs to be dropped before await since the guard is !Send
461        let timeout = {
462            let _tokio_guard = TOKIO_RUNTIME.enter();
463            timeout(self.async_scan_timeout, fut)
464        };
465
466        timeout.await.unwrap_or(Err(ScannerError::Transient(
467            "Async scan timeout".to_string(),
468        )))
469    }
470
471    fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
472        // Record detection time
473        self.metrics
474            .duration_ns
475            .increment(start.elapsed().as_nanos() as u64);
476        // Add number of scanned events
477        self.metrics.num_scanned_events.increment(1);
478        // Add number of matches
479        self.metrics
480            .match_count
481            .increment(output_rule_matches.len() as u64);
482    }
483
484    async fn internal_scan_with_metrics<E: Event>(
485        &self,
486        event: &mut E,
487        options: ScanOptions,
488    ) -> Result<Vec<RuleMatch>, ScannerError> {
489        let start = Instant::now();
490        let result = self.internal_scan(event, options).await;
491        match &result {
492            Ok(rule_matches) => {
493                self.record_metrics(rule_matches, start);
494            }
495            Err(_) => {
496                self.record_metrics(&[], start);
497            }
498        }
499        result
500    }
501
502    async fn internal_scan<E: Event>(
503        &self,
504        event: &mut E,
505        options: ScanOptions,
506    ) -> Result<Vec<RuleMatch>, ScannerError> {
507        // If validation is requested, we need to collect match content even if the scanner
508        // wasn't originally configured to return matches
509        let need_match_content = self.scanner_features.return_matches || options.validate_matches;
510        // All matches, after some (but not all) false-positives have been removed.
511        let mut rule_matches = InternalRuleMatchSet::new();
512        let mut excluded_matches = AHashSet::new();
513        let mut async_jobs = vec![];
514
515        access_regex_caches(|regex_caches| {
516            self.scoped_ruleset.visit_string_rule_combinations(
517                event,
518                ScannerContentVisitor {
519                    scanner: self,
520                    regex_caches,
521                    rule_matches: &mut rule_matches,
522                    blocked_rules: &options.blocked_rules_idx,
523                    excluded_matches: &mut excluded_matches,
524                    per_event_data: SharedData::new(),
525                    wildcarded_indexes: &options.wildcarded_indices,
526                    async_jobs: &mut async_jobs,
527                },
528            )
529        })?;
530
531        // The async jobs were already spawned on the tokio runtime, so the
532        // results just need to be collected
533        for job in async_jobs {
534            let rule_info = job.fut.await.unwrap()?;
535            rule_matches.push_async_matches(
536                &job.path,
537                rule_info
538                    .rule_matches
539                    .into_iter()
540                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
541            );
542        }
543
544        let mut output_rule_matches = vec![];
545
546        for (path, mut rule_matches) in rule_matches.into_iter() {
547            // All rule matches in each inner list are for a single path, so they can be processed independently.
548            event.visit_string_mut(&path, |content| {
549                // calculate_indices requires that matches are sorted by start index
550                rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
551
552                <<E as Event>::Encoding>::calculate_indices(
553                    content,
554                    rule_matches.iter_mut().map(
555                        |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
556                            utf8_start: rule_match.utf8_start,
557                            utf8_end: rule_match.utf8_end,
558                            custom_start: &mut rule_match.custom_start,
559                            custom_end: &mut rule_match.custom_end,
560                        },
561                    ),
562                );
563
564                if self.scanner_features.multipass_v0_enabled {
565                    // Now that the `excluded_matches` set is fully populated, filter out any matches
566                    // that are the same as excluded matches (also known as "Multi-pass V0")
567                    rule_matches.retain(|rule_match| {
568                        if self.rules[rule_match.rule_index]
569                            .inner
570                            .should_exclude_multipass_v0()
571                        {
572                            let is_false_positive = excluded_matches
573                                .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
574                            if is_false_positive && self.scanner_features.multipass_v0_enabled {
575                                self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
576                            }
577                            !is_false_positive
578                        } else {
579                            true
580                        }
581                    });
582                }
583
584                self.suppress_matches::<E::Encoding>(&mut rule_matches, content);
585
586                self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
587
588                let will_mutate = rule_matches
589                    .iter()
590                    .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
591
592                self.apply_match_actions(
593                    content,
594                    &path,
595                    &mut rule_matches,
596                    &mut output_rule_matches,
597                    need_match_content,
598                );
599
600                will_mutate
601            });
602        }
603
604        if options.validate_matches {
605            self.validate_matches(&mut output_rule_matches);
606        }
607
608        Ok(output_rule_matches)
609    }
610
611    pub fn suppress_matches<E: Encoding>(
612        &self,
613        rule_matches: &mut Vec<InternalRuleMatch<E>>,
614        content: &str,
615    ) {
616        rule_matches.retain(|rule_match| {
617            if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
618                let match_should_be_suppressed = suppressions.should_match_be_suppressed(
619                    &content[rule_match.utf8_start..rule_match.utf8_end],
620                );
621                if match_should_be_suppressed {
622                    self.metrics.suppressed_match_count.increment(1);
623                }
624                !match_should_be_suppressed
625            } else {
626                true
627            }
628        });
629    }
630
631    pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
632        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
633        let mut match_validator_rule_match_per_type = AHashMap::new();
634
635        let mut validated_rule_matches = vec![];
636
637        for mut rule_match in rule_matches.drain(..) {
638            let rule = &self.rules[rule_match.rule_index];
639            if let Some(match_validation_type) = rule.internal_match_validation_type() {
640                match_validator_rule_match_per_type
641                    .entry(match_validation_type)
642                    .or_insert_with(Vec::new)
643                    .push(rule_match)
644            } else {
645                // There is no match validator for this rule, so mark it as not available.
646                rule_match.match_status.merge(MatchStatus::NotAvailable);
647                validated_rule_matches.push(rule_match);
648            }
649        }
650
651        RAYON_THREAD_POOL.install(|| {
652            use rayon::prelude::*;
653
654            match_validator_rule_match_per_type.par_iter_mut().for_each(
655                |(match_validation_type, matches_per_type)| {
656                    let match_validator = self.match_validators_per_type.get(match_validation_type);
657                    if let Some(match_validator) = match_validator {
658                        match_validator
659                            .as_ref()
660                            .validate(matches_per_type, &self.rules)
661                    }
662                },
663            );
664        });
665
666        // Refill the rule_matches with the validated matches
667        for (_, mut matches) in match_validator_rule_match_per_type {
668            validated_rule_matches.append(&mut matches);
669        }
670
671        // Sort rule_matches by start index
672        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
673        *rule_matches = validated_rule_matches;
674    }
675
676    /// Apply mutations from actions, and shift indices to match the mutated values.
677    /// This assumes the matches are all from the content given, and are sorted by start index.
678    fn apply_match_actions<E: Encoding>(
679        &self,
680        content: &mut String,
681        path: &Path<'static>,
682        rule_matches: &mut [InternalRuleMatch<E>],
683        output_rule_matches: &mut Vec<RuleMatch>,
684        need_match_content: bool,
685    ) {
686        let mut utf8_byte_delta: isize = 0;
687        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
688
689        for rule_match in rule_matches {
690            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
691                content,
692                path.clone(),
693                rule_match,
694                &mut utf8_byte_delta,
695                &mut custom_index_delta,
696                need_match_content,
697            ));
698        }
699    }
700
701    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
702    fn apply_match_actions_for_string<E: Encoding>(
703        &self,
704        content: &mut String,
705        path: Path<'static>,
706        rule_match: &InternalRuleMatch<E>,
707        // The current difference in length between the original and mutated string
708        utf8_byte_delta: &mut isize,
709
710        // The difference between the custom index on the original string and the mutated string
711        custom_index_delta: &mut <E>::IndexShift,
712        need_match_content: bool,
713    ) -> RuleMatch {
714        let rule = &self.rules[rule_match.rule_index];
715
716        let custom_start =
717            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
718                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
719
720        let mut matched_content_copy = None;
721
722        if need_match_content {
723            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
724            let mutated_utf8_match_start =
725                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
726            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
727
728            // Matches for mutating rules must have valid indices
729            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
730            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
731
732            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
733            matched_content_copy = Some(matched_content.to_string());
734        }
735
736        if rule.match_action.is_mutating() {
737            let mutated_utf8_match_start =
738                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
739            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
740
741            // Matches for mutating rules must have valid indices
742            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
743            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
744
745            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
746            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
747                let before_replacement = &matched_content[replacement.start..replacement.end];
748
749                // update indices to match the new mutated content
750                <E>::adjust_shift(
751                    custom_index_delta,
752                    before_replacement,
753                    &replacement.replacement,
754                );
755                *utf8_byte_delta +=
756                    replacement.replacement.len() as isize - before_replacement.len() as isize;
757
758                let replacement_start = mutated_utf8_match_start + replacement.start;
759                let replacement_end = mutated_utf8_match_start + replacement.end;
760                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
761            }
762        }
763
764        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
765        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
766            + shift_offset) as usize;
767
768        let rule = &self.rules[rule_match.rule_index];
769
770        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
771            MatchStatus::NotChecked
772        } else {
773            MatchStatus::NotAvailable
774        };
775
776        RuleMatch {
777            rule_index: rule_match.rule_index,
778            path,
779            replacement_type: rule.match_action.replacement_type(),
780            start_index: custom_start,
781            end_index_exclusive: custom_end,
782            shift_offset,
783            match_value: matched_content_copy,
784            match_status,
785        }
786    }
787
    /// Removes overlapping matches from `rule_matches` and leaves the survivors sorted by
    /// start index.
    ///
    /// Matches are considered by priority: mutating rules first, then earlier start offset,
    /// then longer matches, then lower rule index. A mutating match is dropped when it
    /// overlaps the previously retained match; a non-mutating match is dropped when it
    /// overlaps any retained match.
    fn sort_and_remove_overlapping_rules<E: Encoding>(
        &self,
        rule_matches: &mut Vec<InternalRuleMatch<E>>,
    ) {
        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
        // Be very careful if this function is modified.

        rule_matches.sort_unstable_by(|a, b| {
            // Mutating rules are a higher priority (earlier in the list)
            let ord = self.rules[a.rule_index]
                .match_action
                .is_mutating()
                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
                .reverse();

            // Earlier start offset
            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));

            // Longer matches
            let ord = ord.then(a.len().cmp(&b.len()).reverse());

            // Matches from earlier rules
            let ord = ord.then(a.rule_index.cmp(&b.rule_index));

            // swap the order of everything so matches can be efficiently popped off the back as they are processed
            ord.reverse()
        });

        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];

        // Popping from the back yields the matches from highest to lowest priority
        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
            if self.rules[rule_match.rule_index].match_action.is_mutating() {
                // Mutating rules are kept only if they don't overlap with a previous rule.
                // Mutating matches are processed in start order, so only the last retained
                // match needs to be checked.
                if let Some(last) = retained_rules.last()
                    && last.utf8_end > rule_match.utf8_start
                {
                    continue;
                }
            } else {
                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
                // this needs to check all retained matches (instead of just the last one)
                for retained_rule in &retained_rules {
                    if retained_rule.utf8_start < rule_match.utf8_end
                        && retained_rule.utf8_end > rule_match.utf8_start
                    {
                        continue 'rule_matches;
                    }
                }
            };
            retained_rules.push(rule_match);
        }

        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);

        *rule_matches = retained_rules;
    }
845}
846
847impl Drop for Scanner {
848    fn drop(&mut self) {
849        let stats = &*GLOBAL_STATS;
850        stats.scanner_deletions.increment(1);
851        stats.decrement_total_scanners();
852    }
853}
854
855#[derive(Default)]
856pub struct ScannerBuilder<'a> {
857    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
858    labels: Labels,
859    scanner_features: ScannerFeatures,
860    async_scan_timeout: Duration,
861}
862
863impl ScannerBuilder<'_> {
864    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
865        ScannerBuilder {
866            rules,
867            labels: Labels::empty(),
868            scanner_features: ScannerFeatures::default(),
869            async_scan_timeout: Duration::from_secs(60 * 5),
870        }
871    }
872
873    pub fn labels(mut self, labels: Labels) -> Self {
874        self.labels = labels;
875        self
876    }
877
878    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
879        self.async_scan_timeout = duration;
880        self
881    }
882
883    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
884        self.scanner_features.add_implicit_index_wildcards = value;
885        self
886    }
887
888    pub fn with_return_matches(mut self, value: bool) -> Self {
889        self.scanner_features.return_matches = value;
890        self
891    }
892
893    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
894    /// Multipass V0 saves matches from excluded scopes, and marks any identical
895    /// matches in included scopes as a false positive.
896    pub fn with_multipass_v0(mut self, value: bool) -> Self {
897        self.scanner_features.multipass_v0_enabled = value;
898        self
899    }
900
901    pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
902        self.scanner_features
903            .skip_rules_with_regex_matching_empty_string = value;
904        self
905    }
906
907    pub fn build(self) -> Result<Scanner, CreateScannerError> {
908        let mut match_validators_per_type = AHashMap::new();
909
910        for rule in self.rules.iter() {
911            if let Some(match_validation_type) = &rule.get_third_party_active_checker()
912                && match_validation_type.can_create_match_validator()
913            {
914                let internal_type = match_validation_type.get_internal_match_validation_type();
915                let match_validator = match_validation_type.into_match_validator();
916                if let Ok(match_validator) = match_validator {
917                    if !match_validators_per_type.contains_key(&internal_type) {
918                        match_validators_per_type.insert(internal_type, match_validator);
919                    }
920                } else {
921                    return Err(CreateScannerError::InvalidMatchValidator(
922                        MatchValidatorCreationError::InternalError,
923                    ));
924                }
925            }
926        }
927
928        let compiled_rules = self
929            .rules
930            .iter()
931            .enumerate()
932            .filter_map(|(rule_index, config)| {
933                let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
934                    Ok(inner) => Ok(inner),
935                    Err(err) => {
936                        if self
937                            .scanner_features
938                            .skip_rules_with_regex_matching_empty_string
939                            && err
940                            == CreateScannerError::InvalidRegex(
941                            RegexValidationError::MatchesEmptyString,
942                        )
943                        {
944                            // this is a temporary feature to skip rules that should be considered invalid.
945                            #[allow(clippy::print_stdout)]
946                            {
947                                println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
948                            }
949                            return None;
950                        } else {
951                            Err(err)
952                        }
953                    }
954                };
955                Some((config, inner))
956            })
957            .map(|(config, inner)| {
958                config.match_action.validate()?;
959                let compiled_suppressions = match &config.suppressions {
960                    Some(s) => Some(s.clone().try_into()?),
961                    None => None,
962                };
963                Ok(RootCompiledRule {
964                    inner: inner?,
965                    scope: config.scope.clone(),
966                    match_action: config.match_action.clone(),
967                    match_validation_type: config.get_third_party_active_checker().cloned(),
968                    suppressions: compiled_suppressions,
969                })
970            })
971            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
972
973        let mut per_scanner_data = SharedData::new();
974
975        compiled_rules.iter().for_each(|rule| {
976            rule.init_per_scanner_data(&mut per_scanner_data);
977        });
978
979        let scoped_ruleset = ScopedRuleSet::new(
980            &compiled_rules
981                .iter()
982                .map(|rule| rule.scope.clone())
983                .collect::<Vec<_>>(),
984        )
985        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
986
987        {
988            let stats = &*GLOBAL_STATS;
989            stats.scanner_creations.increment(1);
990            stats.increment_total_scanners();
991        }
992
993        Ok(Scanner {
994            rules: compiled_rules,
995            scoped_ruleset,
996            scanner_features: self.scanner_features,
997            metrics: ScannerMetrics::new(&self.labels),
998            match_validators_per_type,
999            labels: self.labels,
1000            per_scanner_data,
1001            async_scan_timeout: self.async_scan_timeout,
1002        })
1003    }
1004}
1005
1006struct ScannerContentVisitor<'a, E: Encoding> {
1007    scanner: &'a Scanner,
1008    regex_caches: &'a mut RegexCaches,
1009    rule_matches: &'a mut InternalRuleMatchSet<E>,
1010    // Rules that shall be skipped for this scan
1011    // This list shall be small (<10), so a linear search is acceptable
1012    blocked_rules: &'a Vec<usize>,
1013    excluded_matches: &'a mut AHashSet<String>,
1014    per_event_data: SharedData,
1015    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1016    async_jobs: &'a mut Vec<PendingRuleJob>,
1017}
1018
1019impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1020    fn visit_content<'b>(
1021        &'b mut self,
1022        path: &Path<'a>,
1023        content: &str,
1024        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1025        exclusion_check: ExclusionCheck<'b>,
1026    ) -> Result<bool, ScannerError> {
1027        // matches for a single path
1028        let mut path_rules_matches = vec![];
1029
1030        // Create a map of per rule type data that can be shared between rules of the same type
1031        let mut per_string_data = SharedData::new();
1032        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1033
1034        rule_visitor.visit_rule_indices(|rule_index| {
1035            if self.blocked_rules.contains(&rule_index) {
1036                return Ok(());
1037            }
1038            let rule = &self.scanner.rules[rule_index];
1039            {
1040                // creating the emitter is basically free, it will get mostly optimized away
1041                let mut emitter = |rule_match: StringMatch| {
1042                    // This should never happen, but to ensure no empty match is ever generated
1043                    // (which may cause an infinite loop), this will panic instead.
1044                    assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1045                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1046                };
1047
1048                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1049
1050                // TODO: move this somewhere higher?
1051                rule.init_per_event_data(&mut self.per_event_data);
1052
1053                let mut ctx = StringMatchesCtx {
1054                    rule_index,
1055                    regex_caches: self.regex_caches,
1056                    exclusion_check: &exclusion_check,
1057                    excluded_matches: self.excluded_matches,
1058                    match_emitter: &mut emitter,
1059                    wildcard_indices: wildcard_indices_per_path,
1060                    per_string_data: &mut per_string_data,
1061                    per_scanner_data: &self.scanner.per_scanner_data,
1062                    per_event_data: &mut self.per_event_data,
1063                };
1064
1065                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1066
1067                match async_status {
1068                    RuleStatus::Done => {
1069                        // nothing to do
1070                    }
1071                    RuleStatus::Pending(fut) => {
1072                        self.async_jobs.push(PendingRuleJob {
1073                            fut,
1074                            path: path.into_static(),
1075                        });
1076                    }
1077                }
1078            }
1079            Ok(())
1080        })?;
1081
1082        // If there are any matches, the string will need to be accessed to check for false positives from
1083        // excluded matches, any to potentially mutate the string.
1084        // If there are any async jobs, this is also true since it's not known yet whether there
1085        // will be a match
1086        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1087
1088        self.rule_matches
1089            .push_sync_matches(path, path_rules_matches);
1090
1091        Ok(needs_to_access_content)
1092    }
1093}
1094
/// Calculates the next starting position for a regex search when the previous match
/// turned out to be a false positive.
///
/// The result is the byte offset of the character that follows the first character of
/// the match, or `None` when no character follows it in `content`.
///
/// # Panics
///
/// Panics if `regex_match.0` is past the end of `content` or not on a UTF-8 character
/// boundary.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;

    // `nth(1)` is the second character counted from the match start; its offset is
    // relative to the slice, so it is rebased onto `content`.
    content[match_start..]
        .char_indices()
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1105
1106fn is_false_positive_match(
1107    regex_match_range: (usize, usize),
1108    rule: &RegexCompiledRule,
1109    content: &str,
1110    check_excluded_keywords: bool,
1111) -> bool {
1112    if check_excluded_keywords
1113        && let Some(excluded_keywords) = &rule.excluded_keywords
1114        && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1115    {
1116        return true;
1117    }
1118
1119    if let Some(validator) = rule.validator.as_ref()
1120        && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1121    {
1122        return true;
1123    }
1124    false
1125}