dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::{AHashMap, AHashSet};
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// A single match reported by a rule for one scanned string.
///
/// `start` and `end` delimit the matched span (presumably UTF-8 byte offsets into the scanned
/// content, with `end` exclusive — TODO confirm against the rule implementations).
#[derive(Clone, Debug)]
pub struct StringMatch {
    /// Start of the matched span.
    pub start: usize,
    /// End of the matched span.
    pub end: usize,
    /// The keyword that was used to match this rule. Optional, only some rules may set this value.
    pub keyword: Option<String>,
}
60
61pub trait MatchEmitter<T = ()> {
62    fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
66// struct that implements MatchEmitter)
67impl<F, T> MatchEmitter<T> for F
68where
69    F: FnMut(StringMatch) -> T,
70{
71    fn emit(&mut self, string_match: StringMatch) -> T {
72        // This just calls the closure (itself)
73        (self)(string_match)
74    }
75}
76
77/// The precedence of a rule. Catchall is the lowest precedence, Specific is the highest precedence.
78/// The default precedence is Specific.
79/// For rules that:
80/// - Have the same mutation priority
81/// - Match at the same index
82/// - Match the same number of characters
83///
84/// Then the rule with the highest precedence will be used.
// The derived `Ord`/`PartialOrd` follow declaration order, so
// `Catchall < Generic < Specific`; do not reorder the variants.
85#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Default)]
86pub enum Precedence {
    // Lowest precedence.
87    Catchall,
    // Middle precedence, between `Catchall` and `Specific`.
88    Generic,
    // Highest precedence; also the value produced by `Precedence::default()`.
89    #[default]
90    Specific,
91}
92
// Settings shared by every rule kind, wrapped around a rule-specific configuration `inner`.
// Because `inner` is marked `#[serde(flatten)]`, its fields are (de)serialized at the same
// level as the fields declared here.
93#[serde_as]
94#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
95pub struct RootRuleConfig<T> {
    // Action associated with matches of this rule.
96    pub match_action: MatchAction,
    // Scope of the rule; falls back to `Scope::default()` when absent from the input.
97    #[serde(default)]
98    pub scope: Scope,
    // Legacy name of `third_party_active_checker`. It is only consulted when
    // `third_party_active_checker` is `None` (see `get_third_party_active_checker`).
99    #[deprecated(note = "Use `third_party_active_checker` instead")]
100    match_validation_type: Option<MatchValidationType>,
    // Match validation configuration; takes priority over the deprecated field above.
101    third_party_active_checker: Option<MatchValidationType>,
    // Optional suppressions configured for this rule.
102    suppressions: Option<Suppressions>,
    // Rule precedence; falls back to `Precedence::default()` when absent from the input.
103    #[serde(default)]
104    precedence: Precedence,
    // The rule-specific configuration. `RootRuleConfig<T>` derefs to this value.
105    #[serde(flatten)]
106    pub inner: T,
107}
108
109impl<T> RootRuleConfig<T>
110where
111    T: RuleConfig + 'static,
112{
113    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
114        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
115    }
116
117    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
118        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
119    }
120}
121
122impl<T> RootRuleConfig<T> {
123    pub fn new(inner: T) -> Self {
124        #[allow(deprecated)]
125        Self {
126            match_action: MatchAction::None,
127            scope: Scope::all(),
128            match_validation_type: None,
129            third_party_active_checker: None,
130            suppressions: None,
131            precedence: Precedence::default(),
132            inner,
133        }
134    }
135
136    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
137        #[allow(deprecated)]
138        RootRuleConfig {
139            match_action: self.match_action,
140            scope: self.scope,
141            match_validation_type: self.match_validation_type,
142            third_party_active_checker: self.third_party_active_checker,
143            suppressions: self.suppressions,
144            precedence: self.precedence,
145            inner: func(self.inner),
146        }
147    }
148
149    pub fn match_action(mut self, action: MatchAction) -> Self {
150        self.match_action = action;
151        self
152    }
153
154    pub fn precedence(mut self, precedence: Precedence) -> Self {
155        self.precedence = precedence;
156        self
157    }
158
159    pub fn scope(mut self, scope: Scope) -> Self {
160        self.scope = scope;
161        self
162    }
163
164    pub fn third_party_active_checker(
165        mut self,
166        match_validation_type: MatchValidationType,
167    ) -> Self {
168        self.third_party_active_checker = Some(match_validation_type);
169        self
170    }
171
172    pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
173        self.suppressions = Some(suppressions);
174        self
175    }
176
177    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
178        #[allow(deprecated)]
179        self.third_party_active_checker
180            .as_ref()
181            .or(self.match_validation_type.as_ref())
182    }
183}
184
185impl<T> Deref for RootRuleConfig<T> {
186    type Target = T;
187
188    fn deref(&self) -> &Self::Target {
189        &self.inner
190    }
191}
// A compiled rule together with the root-level settings that apply to it.
// It derefs to `dyn CompiledRule` (through `inner`).
192pub struct RootCompiledRule {
    // The rule-specific compiled implementation.
193    pub inner: Box<dyn CompiledRule>,
    // Scope of the rule.
194    pub scope: Scope,
    // Action associated with matches of this rule.
195    pub match_action: MatchAction,
    // Match validation configuration, if any. When this is `Some`, matches of the rule start
    // with `MatchStatus::NotChecked`, otherwise with `MatchStatus::NotAvailable`.
196    pub match_validation_type: Option<MatchValidationType>,
    // Compiled suppressions, if any; consulted by `Scanner::suppress_matches`.
197    pub suppressions: Option<CompiledSuppressions>,
    // Precedence of the rule.
198    pub precedence: Precedence,
199}
200
201impl RootCompiledRule {
202    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
203        self.match_validation_type
204            .as_ref()
205            .map(|x| x.get_internal_match_validation_type())
206    }
207}
208
209impl Deref for RootCompiledRule {
210    type Target = dyn CompiledRule;
211
212    fn deref(&self) -> &Self::Target {
213        self.inner.as_ref()
214    }
215}
216
// Context handed to `CompiledRule::get_string_matches` for a single (rule, string) pair.
217pub struct StringMatchesCtx<'a> {
    // Index of the rule being evaluated; copied into async jobs by `process_async`.
218    rule_index: usize,
219    pub regex_caches: &'a mut RegexCaches,
220    pub exclusion_check: &'a ExclusionCheck<'a>,
    // Set of excluded match contents; the scanner later filters matches against it
    // ("Multi-pass V0"). NOTE(review): presumably filled in by rules when a match falls in an
    // excluded scope — confirm against the rule implementations.
221    pub excluded_matches: &'a mut AHashSet<String>,
    // Sink that receives the matches found synchronously.
222    pub match_emitter: &'a mut dyn MatchEmitter,
    // Optional list of (start, end) index pairs to treat as wildcards (see
    // `ScanOptions::wildcarded_indices`).
223    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
224
225    // Shared Data
226    pub per_string_data: &'a mut SharedData,
227    pub per_scanner_data: &'a SharedData,
228    pub per_event_data: &'a mut SharedData,
    // Identifier of the event being scanned, when the event provides one.
229    pub event_id: Option<&'a str>,
230}
231
232impl StringMatchesCtx<'_> {
233    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
234    /// this function can be used to return an "async job" to find matches. The return value
235    /// of `process_async` should be returned from the `get_string_matches` function. The future
236    /// passed into this function will be spawned and executed immediately without blocking
237    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
238    ///
239    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
240    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
241    /// it should be accessed before `process_async` is called.
242    pub fn process_async(
243        &self,
244        func: impl for<'a> FnOnce(
245            &'a mut AsyncStringMatchesCtx,
246        )
247            -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
248        + Send
249        + 'static,
250    ) -> RuleResult {
251        let rule_index = self.rule_index;
252
253        // The future is spawned onto the tokio runtime immediately so it starts running
254        // in the background
255        let fut = TOKIO_RUNTIME.spawn(async move {
256            let start = Instant::now();
257            let mut ctx = AsyncStringMatchesCtx {
258                rule_matches: vec![],
259            };
260            (func)(&mut ctx).await?;
261            let io_duration = start.elapsed();
262
263            Ok(AsyncRuleInfo {
264                rule_index,
265                rule_matches: ctx.rule_matches,
266                io_duration,
267            })
268        });
269
270        Ok(RuleStatus::Pending(fut))
271    }
272}
273
274pub struct AsyncStringMatchesCtx {
275    rule_matches: Vec<StringMatch>,
276}
277
278impl AsyncStringMatchesCtx {
279    pub fn emit_match(&mut self, string_match: StringMatch) {
280        self.rule_matches.push(string_match);
281    }
282}
283
// Outcome of a `get_string_matches` call.
284#[must_use]
285pub enum RuleStatus {
    // The rule has finished for this string.
286    Done,
    // The rule spawned an async job (see `StringMatchesCtx::process_async`); its matches are
    // only available once the contained handle has been awaited.
287    Pending(PendingRuleResult),
288}
289
// Handle to an async rule job spawned on the tokio runtime. Awaiting it yields the job's
// `AsyncRuleInfo` (or its `ScannerError`), wrapped in tokio's join result.
290// pub type PendingRuleResult = BoxFuture<'static, Result<AsyncRuleInfo, ScannerError>>;
291pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
292
// An async rule job still in flight, paired with the path its matches belong to.
293pub struct PendingRuleJob {
    // Handle of the spawned job.
294    fut: PendingRuleResult,
    // Path under which the job's matches are recorded once the job completes.
295    path: Path<'static>,
296}
297
// Result of a completed async rule job.
298pub struct AsyncRuleInfo {
    // Index of the rule that created the job.
299    rule_index: usize,
    // Matches emitted by the job through `AsyncStringMatchesCtx::emit_match`.
300    rule_matches: Vec<StringMatch>,
    // Wall-clock time spent inside the job; the scanner subtracts it from the total scan time
    // to estimate CPU time.
301    io_duration: Duration,
302}
303
// Return type of `CompiledRule::get_string_matches`: the rule status, or a `ScannerError`.
// NOTE(review): the comment below says this "cannot be async", yet `RuleStatus::Pending`
// carries an async job handle — the wording looks stale; confirm and update.
304/// A rule result that cannot be async
305pub type RuleResult = Result<RuleStatus, ScannerError>;
306
307// This is the public trait that is used to define the behavior of a compiled rule.
// Only `get_string_matches` is required; every other method has a default implementation.
308pub trait CompiledRule: Send + Sync {
    // Hook to initialize data shared for the lifetime of a scanner.
309    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
310        // by default, no per-scanner data is initialized
311    }
312
    // Hook to initialize data shared while scanning a single string.
313    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
314        // by default, no per-string data is initialized
315    }
316
    // Hook to initialize data shared while scanning a single event.
317    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
318        // by default, no per-event data is initialized
319    }
320
    // Finds the matches of this rule in `content` (the string located at `path`).
    // Matches are reported through `ctx`; async work can be delegated with
    // `ctx.process_async`, whose return value should then be returned from here.
321    fn get_string_matches(
322        &self,
323        content: &str,
324        path: &Path,
325        ctx: &mut StringMatchesCtx<'_>,
326    ) -> RuleResult;
327
328    // Whether a match from this rule should be excluded (marked as a false-positive)
329    // if the content of this match was found in a match from an excluded scope
330    fn should_exclude_multipass_v0(&self) -> bool {
331        // default is to NOT use Multi-pass V0
332        false
333    }
334
    // Called by the scanner each time a match of this rule is dropped by Multi-pass V0.
335    fn on_excluded_match_multipass_v0(&self) {
336        // default is to do nothing
337    }
338
    // Downcast helper: returns `Some` when this rule is a `RegexCompiledRule`.
339    fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
340        None
341    }
342
    // Mutable variant of `as_regex_rule`.
343    fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
344        None
345    }
346
    // NOTE(review): the name suggests the scanner may skip this rule for excluded
    // namespaces when this returns true (the default) — the call site is not visible here;
    // confirm before relying on it.
347    fn allow_scanner_to_exclude_namespace(&self) -> bool {
348        true
349    }
350}
351
352impl<T> RuleConfig for Box<T>
353where
354    T: RuleConfig + ?Sized,
355{
356    fn convert_to_compiled_rule(
357        &self,
358        rule_index: usize,
359        labels: Labels,
360    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
361        self.as_ref().convert_to_compiled_rule(rule_index, labels)
362    }
363}
364
// Feature switches of a scanner (defaults are set in `impl Default` below).
365#[derive(Debug, PartialEq, Clone)]
366struct ScannerFeatures {
367    pub add_implicit_index_wildcards: bool,
    // When true, matches whose content equals an excluded match are filtered out
    // ("Multi-pass V0").
368    pub multipass_v0_enabled: bool,
    // When true, the matched content is copied into each returned `RuleMatch`.
369    pub return_matches: bool,
370}
371
372impl Default for ScannerFeatures {
373    fn default() -> Self {
374        Self {
375            add_implicit_index_wildcards: false,
376            multipass_v0_enabled: true,
377            return_matches: false,
378        }
379    }
380}
381
// Per-scan options; `ScanOptions::default()` disables all of them. `ScanOptionBuilder`
// offers a builder-style way to create one.
382pub struct ScanOptions {
383    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
384    // this list shall be small (<10), so a linear search is acceptable otherwise performance will be impacted.
385    pub blocked_rules_idx: Vec<usize>,
386    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
387    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
388    // Whether to validate matches using third-party validators (e.g., checksum validation for credit cards).
389    // When enabled, the scanner automatically collects match content needed for validation.
390    pub validate_matches: bool,
391}
392
393impl Default for ScanOptions {
394    fn default() -> Self {
395        Self {
396            blocked_rules_idx: vec![],
397            wildcarded_indices: AHashMap::new(),
398            validate_matches: false,
399        }
400    }
401}
402
// Builder for `ScanOptions`; its fields mirror those of `ScanOptions` one to one.
403pub struct ScanOptionBuilder {
    // Rule indices to skip for the scan.
404    blocked_rules_idx: Vec<usize>,
    // Per-path (start, end) index pairs to treat as wildcards.
405    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
    // Whether matches should be validated after the scan.
406    validate_matches: bool,
407}
408
409impl ScanOptionBuilder {
410    pub fn new() -> Self {
411        Self {
412            blocked_rules_idx: vec![],
413            wildcarded_indices: AHashMap::new(),
414            validate_matches: false,
415        }
416    }
417
418    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
419        self.blocked_rules_idx = blocked_rules_idx;
420        self
421    }
422
423    pub fn with_wildcarded_indices(
424        mut self,
425        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
426    ) -> Self {
427        self.wildcarded_indices = wildcarded_indices;
428        self
429    }
430
431    pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
432        self.validate_matches = validate_matches;
433        self
434    }
435
436    pub fn build(self) -> ScanOptions {
437        ScanOptions {
438            blocked_rules_idx: self.blocked_rules_idx,
439            wildcarded_indices: self.wildcarded_indices,
440            validate_matches: self.validate_matches,
441        }
442    }
443}
444
// A scanner: a set of compiled rules plus everything needed to scan events with them.
445pub struct Scanner {
    // Compiled rules; `rule_index` values found in matches index into this vector.
446    rules: Vec<RootCompiledRule>,
    // Drives the visit of every (string, rule) combination of an event.
447    scoped_ruleset: ScopedRuleSet,
448    scanner_features: ScannerFeatures,
449    metrics: ScannerMetrics,
450    labels: Labels,
    // One validator per internal validation type, used by `validate_matches`.
451    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
452    per_scanner_data: SharedData,
    // Upper bound on the duration of `scan_async*`; exceeding it yields a transient error.
453    async_scan_timeout: Duration,
454}
455
456impl Scanner {
    // Returns a `ScannerBuilder` initialized with the given rule configurations.
457    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
458        ScannerBuilder::new(rules)
459    }
460
461    // This function scans the given event with the rules configured in the scanner.
462    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
463    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
464    // This version uses default scan options (no validation, no blocked rules, no wildcarded indices).
465    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
466        self.scan_with_options(event, ScanOptions::default())
467    }
468
469    // This function scans the given event with the rules configured in the scanner.
470    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
471    // The options parameter allows customizing the scan behavior (validation, blocked rules, etc.).
472    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
473    pub fn scan_with_options<E: Event>(
474        &self,
475        event: &mut E,
476        options: ScanOptions,
477    ) -> Result<Vec<RuleMatch>, ScannerError> {
478        block_on(self.internal_scan_with_metrics(event, options))
479    }
480
481    // This function scans the given event with the rules configured in the scanner.
482    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
483    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
484    pub async fn scan_async<E: Event>(
485        &self,
486        event: &mut E,
487    ) -> Result<Vec<RuleMatch>, ScannerError> {
488        self.scan_async_with_options(event, ScanOptions::default())
489            .await
490    }
491
492    pub async fn scan_async_with_options<E: Event>(
493        &self,
494        event: &mut E,
495        options: ScanOptions,
496    ) -> Result<Vec<RuleMatch>, ScannerError> {
497        let fut = self.internal_scan_with_metrics(event, options);
498
499        // The sleep from the timeout requires being in a tokio context
500        // The guard needs to be dropped before await since the guard is !Send
501        let timeout = {
502            let _tokio_guard = TOKIO_RUNTIME.enter();
503            timeout(self.async_scan_timeout, fut)
504        };
505
506        timeout.await.unwrap_or(Err(ScannerError::Transient(
507            "Async scan timeout".to_string(),
508        )))
509    }
510
511    fn record_metrics(
512        &self,
513        output_rule_matches: &[RuleMatch],
514        start: Instant,
515        io_duration: Option<Duration>,
516    ) {
517        // Add number of scanned events
518        self.metrics.num_scanned_events.increment(1);
519        // Add number of matches
520        self.metrics
521            .match_count
522            .increment(output_rule_matches.len() as u64);
523
524        if let Some(io_duration) = io_duration {
525            let total_duration = start.elapsed();
526            let cpu_duration = total_duration.saturating_sub(io_duration);
527            self.metrics
528                .cpu_duration
529                .increment(cpu_duration.as_nanos() as u64);
530        }
531    }
532
533    async fn internal_scan_with_metrics<E: Event>(
534        &self,
535        event: &mut E,
536        options: ScanOptions,
537    ) -> Result<Vec<RuleMatch>, ScannerError> {
538        let start = Instant::now();
539        let result = self.internal_scan(event, options).await;
540        match result {
541            Ok((rule_matches, io_duration)) => {
542                self.record_metrics(&rule_matches, start, Some(io_duration));
543                Ok(rule_matches)
544            }
545            Err(e) => {
546                self.record_metrics(&[], start, None);
547                Err(e)
548            }
549        }
550    }
551
552    fn process_rule_matches<E: Event>(
553        &self,
554        event: &mut E,
555        rule_matches: InternalRuleMatchSet<E::Encoding>,
556        excluded_matches: AHashSet<String>,
557        output_rule_matches: &mut Vec<RuleMatch>,
558        need_match_content: bool,
559    ) {
560        if rule_matches.is_empty() {
561            return;
562        }
563        access_regex_caches(|regex_caches| {
564            for (path, mut rule_matches) in rule_matches.into_iter() {
565                // All rule matches in each inner list are for a single path, so they can be processed independently.
566                event.visit_string_mut(&path, |content| {
567                    // calculate_indices requires that matches are sorted by start index
568                    rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
569
570                    <<E as Event>::Encoding>::calculate_indices(
571                        content,
572                        rule_matches.iter_mut().map(
573                            |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
574                                utf8_start: rule_match.utf8_start,
575                                utf8_end: rule_match.utf8_end,
576                                custom_start: &mut rule_match.custom_start,
577                                custom_end: &mut rule_match.custom_end,
578                            },
579                        ),
580                    );
581
582                    if self.scanner_features.multipass_v0_enabled {
583                        // Now that the `excluded_matches` set is fully populated, filter out any matches
584                        // that are the same as excluded matches (also known as "Multi-pass V0")
585                        rule_matches.retain(|rule_match| {
586                            if self.rules[rule_match.rule_index]
587                                .inner
588                                .should_exclude_multipass_v0()
589                            {
590                                let is_false_positive = excluded_matches
591                                    .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
592                                if is_false_positive && self.scanner_features.multipass_v0_enabled {
593                                    self.rules[rule_match.rule_index]
594                                        .on_excluded_match_multipass_v0();
595                                }
596                                !is_false_positive
597                            } else {
598                                true
599                            }
600                        });
601                    }
602
603                    self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
604
605                    self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
606
607                    let will_mutate = rule_matches.iter().any(|rule_match| {
608                        self.rules[rule_match.rule_index].match_action.is_mutating()
609                    });
610
611                    self.apply_match_actions(
612                        content,
613                        &path,
614                        rule_matches,
615                        output_rule_matches,
616                        need_match_content,
617                    );
618
619                    will_mutate
620                });
621            }
622        });
623    }
624
625    async fn internal_scan<E: Event>(
626        &self,
627        event: &mut E,
628        options: ScanOptions,
629    ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
630        // If validation is requested, we need to collect match content even if the scanner
631        // wasn't originally configured to return matches
632        let need_match_content = self.scanner_features.return_matches || options.validate_matches;
633        // All matches, after some (but not all) false-positives have been removed.
634        let mut rule_matches = InternalRuleMatchSet::new();
635        let mut excluded_matches = AHashSet::new();
636        let mut async_jobs = vec![];
637
638        access_regex_caches(|regex_caches| {
639            self.scoped_ruleset.visit_string_rule_combinations(
640                event,
641                ScannerContentVisitor {
642                    scanner: self,
643                    regex_caches,
644                    rule_matches: &mut rule_matches,
645                    blocked_rules: &options.blocked_rules_idx,
646                    excluded_matches: &mut excluded_matches,
647                    per_event_data: SharedData::new(),
648                    wildcarded_indexes: &options.wildcarded_indices,
649                    async_jobs: &mut async_jobs,
650                    event_id: event.get_id().map(|s| s.to_string()),
651                },
652            )
653        })?;
654
655        // The async jobs were already spawned on the tokio runtime, so the
656        // results just need to be collected
657        let mut total_io_duration = Duration::ZERO;
658        for job in async_jobs {
659            let rule_info = job.fut.await.unwrap()?;
660            total_io_duration += rule_info.io_duration;
661            rule_matches.push_async_matches(
662                &job.path,
663                rule_info
664                    .rule_matches
665                    .into_iter()
666                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
667            );
668        }
669
670        let mut output_rule_matches = vec![];
671
672        self.process_rule_matches(
673            event,
674            rule_matches,
675            excluded_matches,
676            &mut output_rule_matches,
677            need_match_content,
678        );
679
680        if options.validate_matches {
681            self.validate_matches(&mut output_rule_matches);
682        }
683
684        Ok((output_rule_matches, total_io_duration))
685    }
686
687    pub fn suppress_matches<E: Encoding>(
688        &self,
689        rule_matches: &mut Vec<InternalRuleMatch<E>>,
690        content: &str,
691        regex_caches: &mut RegexCaches,
692    ) {
693        rule_matches.retain(|rule_match| {
694            if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
695                let match_should_be_suppressed = suppressions.should_match_be_suppressed(
696                    &content[rule_match.utf8_start..rule_match.utf8_end],
697                    regex_caches,
698                );
699
700                if match_should_be_suppressed {
701                    self.metrics.suppressed_match_count.increment(1);
702                }
703                !match_should_be_suppressed
704            } else {
705                true
706            }
707        });
708    }
709
710    pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
711        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
712        let mut match_validator_rule_match_per_type = AHashMap::new();
713
714        let mut validated_rule_matches = vec![];
715
716        for mut rule_match in rule_matches.drain(..) {
717            let rule = &self.rules[rule_match.rule_index];
718            if let Some(match_validation_type) = rule.internal_match_validation_type() {
719                match_validator_rule_match_per_type
720                    .entry(match_validation_type)
721                    .or_insert_with(Vec::new)
722                    .push(rule_match)
723            } else {
724                // There is no match validator for this rule, so mark it as not available.
725                rule_match.match_status.merge(MatchStatus::NotAvailable);
726                validated_rule_matches.push(rule_match);
727            }
728        }
729
730        RAYON_THREAD_POOL.install(|| {
731            use rayon::prelude::*;
732
733            match_validator_rule_match_per_type.par_iter_mut().for_each(
734                |(match_validation_type, matches_per_type)| {
735                    let match_validator = self.match_validators_per_type.get(match_validation_type);
736                    if let Some(match_validator) = match_validator {
737                        match_validator
738                            .as_ref()
739                            .validate(matches_per_type, &self.rules)
740                    }
741                },
742            );
743        });
744
745        // Refill the rule_matches with the validated matches
746        for (_, mut matches) in match_validator_rule_match_per_type {
747            validated_rule_matches.append(&mut matches);
748        }
749
750        // Sort rule_matches by start index
751        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
752        *rule_matches = validated_rule_matches;
753    }
754
755    /// Apply mutations from actions, and shift indices to match the mutated values.
756    /// This assumes the matches are all from the content given, and are sorted by start index.
757    fn apply_match_actions<E: Encoding>(
758        &self,
759        content: &mut String,
760        path: &Path<'static>,
761        rule_matches: Vec<InternalRuleMatch<E>>,
762        output_rule_matches: &mut Vec<RuleMatch>,
763        need_match_content: bool,
764    ) {
765        let mut utf8_byte_delta: isize = 0;
766        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
767
768        for rule_match in rule_matches {
769            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
770                content,
771                path.clone(),
772                rule_match,
773                &mut utf8_byte_delta,
774                &mut custom_index_delta,
775                need_match_content,
776            ));
777        }
778    }
779
780    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
781    fn apply_match_actions_for_string<E: Encoding>(
782        &self,
783        content: &mut String,
784        path: Path<'static>,
785        rule_match: InternalRuleMatch<E>,
786        // The current difference in length between the original and mutated string
787        utf8_byte_delta: &mut isize,
788
789        // The difference between the custom index on the original string and the mutated string
790        custom_index_delta: &mut <E>::IndexShift,
791        need_match_content: bool,
792    ) -> RuleMatch {
793        let rule = &self.rules[rule_match.rule_index];
794
795        let custom_start =
796            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
797                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
798
799        let mut matched_content_copy = None;
800
801        if need_match_content {
802            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
803            let mutated_utf8_match_start =
804                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
805            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
806
807            // Matches for mutating rules must have valid indices
808            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
809            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
810
811            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
812            matched_content_copy = Some(matched_content.to_string());
813        }
814
815        if rule.match_action.is_mutating() {
816            let mutated_utf8_match_start =
817                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
818            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
819
820            // Matches for mutating rules must have valid indices
821            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
822            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
823
824            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
825            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
826                let before_replacement = &matched_content[replacement.start..replacement.end];
827
828                // update indices to match the new mutated content
829                <E>::adjust_shift(
830                    custom_index_delta,
831                    before_replacement,
832                    &replacement.replacement,
833                );
834                *utf8_byte_delta +=
835                    replacement.replacement.len() as isize - before_replacement.len() as isize;
836
837                let replacement_start = mutated_utf8_match_start + replacement.start;
838                let replacement_end = mutated_utf8_match_start + replacement.end;
839                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
840            }
841        }
842
843        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
844        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
845            + shift_offset) as usize;
846
847        let rule = &self.rules[rule_match.rule_index];
848
849        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
850            MatchStatus::NotChecked
851        } else {
852            MatchStatus::NotAvailable
853        };
854
855        RuleMatch {
856            rule_index: rule_match.rule_index,
857            path,
858            replacement_type: rule.match_action.replacement_type(),
859            start_index: custom_start,
860            end_index_exclusive: custom_end,
861            shift_offset,
862            match_value: matched_content_copy,
863            match_status,
864            keyword: rule_match.keyword,
865        }
866    }
867
868    fn sort_and_remove_overlapping_rules<E: Encoding>(
869        &self,
870        rule_matches: &mut Vec<InternalRuleMatch<E>>,
871    ) {
872        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
873        // Be very careful if this function is modified.
874
875        rule_matches.sort_unstable_by(|a, b| {
876            // Mutating rules are a higher priority (earlier in the list)
877            let ord = self.rules[a.rule_index]
878                .match_action
879                .is_mutating()
880                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
881                .reverse();
882
883            // Earlier start offset
884            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
885
886            // Longer matches
887            let ord = ord.then(a.len().cmp(&b.len()).reverse());
888
889            // Matches with higher precedence come first
890            let ord = ord.then(
891                self.rules[a.rule_index]
892                    .precedence
893                    .cmp(&self.rules[b.rule_index].precedence)
894                    .reverse(),
895            );
896
897            // Matches from earlier rules
898            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
899
900            // swap the order of everything so matches can be efficiently popped off the back as they are processed
901            ord.reverse()
902        });
903
904        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
905
906        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
907            if self.rules[rule_match.rule_index].match_action.is_mutating() {
908                // Mutating rules are kept only if they don't overlap with a previous rule.
909                if let Some(last) = retained_rules.last()
910                    && last.utf8_end > rule_match.utf8_start
911                {
912                    continue;
913                }
914            } else {
915                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
916                // this needs to check all retained matches (instead of just the last one)
917                for retained_rule in &retained_rules {
918                    if retained_rule.utf8_start < rule_match.utf8_end
919                        && retained_rule.utf8_end > rule_match.utf8_start
920                    {
921                        continue 'rule_matches;
922                    }
923                }
924            };
925            retained_rules.push(rule_match);
926        }
927
928        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
929        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
930
931        *rule_matches = retained_rules;
932    }
933}
934
935impl Drop for Scanner {
936    fn drop(&mut self) {
937        let stats = &*GLOBAL_STATS;
938        stats.scanner_deletions.increment(1);
939        stats.decrement_total_scanners();
940    }
941}
942
#[derive(Default)]
/// Builder used to configure and create a [`Scanner`].
///
/// Create it with [`ScannerBuilder::new`], adjust it with the chained setters and
/// finish with [`ScannerBuilder::build`].
///
/// NOTE(review): the derived `Default` leaves `async_scan_timeout` at
/// `Duration::default()` (zero), whereas `new` sets it to 5 minutes — confirm
/// whether `ScannerBuilder::default()` is meant to be used directly.
pub struct ScannerBuilder<'a> {
    /// Rule configurations that `build` compiles into the scanner's rules.
    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
    /// Labels passed to every compiled rule and to the scanner metrics.
    labels: Labels,
    /// Feature flags copied into the built scanner.
    scanner_features: ScannerFeatures,
    /// Timeout copied into the built scanner's `async_scan_timeout`.
    async_scan_timeout: Duration,
}
950
951impl ScannerBuilder<'_> {
952    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
953        ScannerBuilder {
954            rules,
955            labels: Labels::empty(),
956            scanner_features: ScannerFeatures::default(),
957            async_scan_timeout: Duration::from_secs(60 * 5),
958        }
959    }
960
961    pub fn labels(mut self, labels: Labels) -> Self {
962        self.labels = labels;
963        self
964    }
965
966    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
967        self.async_scan_timeout = duration;
968        self
969    }
970
971    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
972        self.scanner_features.add_implicit_index_wildcards = value;
973        self
974    }
975
976    pub fn with_return_matches(mut self, value: bool) -> Self {
977        self.scanner_features.return_matches = value;
978        self
979    }
980
981    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
982    /// Multipass V0 saves matches from excluded scopes, and marks any identical
983    /// matches in included scopes as a false positive.
984    pub fn with_multipass_v0(mut self, value: bool) -> Self {
985        self.scanner_features.multipass_v0_enabled = value;
986        self
987    }
988
989    pub fn build(self) -> Result<Scanner, CreateScannerError> {
990        let mut match_validators_per_type = AHashMap::new();
991
992        for rule in self.rules.iter() {
993            if let Some(match_validation_type) = &rule.get_third_party_active_checker()
994                && match_validation_type.can_create_match_validator()
995            {
996                let internal_type = match_validation_type.get_internal_match_validation_type();
997                let match_validator = match_validation_type.into_match_validator();
998                if let Ok(match_validator) = match_validator {
999                    if !match_validators_per_type.contains_key(&internal_type) {
1000                        match_validators_per_type.insert(internal_type, match_validator);
1001                    }
1002                } else {
1003                    return Err(CreateScannerError::InvalidMatchValidator(
1004                        MatchValidatorCreationError::InternalError,
1005                    ));
1006                }
1007            }
1008        }
1009
1010        let compiled_rules = self
1011            .rules
1012            .iter()
1013            .enumerate()
1014            .map(|(rule_index, config)| {
1015                let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1016                config.match_action.validate()?;
1017                let compiled_suppressions = match &config.suppressions {
1018                    Some(s) => s.compile()?,
1019                    None => None,
1020                };
1021                Ok(RootCompiledRule {
1022                    inner,
1023                    scope: config.scope.clone(),
1024                    match_action: config.match_action.clone(),
1025                    match_validation_type: config.get_third_party_active_checker().cloned(),
1026                    suppressions: compiled_suppressions,
1027                    precedence: config.precedence,
1028                })
1029            })
1030            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1031
1032        let mut per_scanner_data = SharedData::new();
1033
1034        compiled_rules.iter().for_each(|rule| {
1035            rule.init_per_scanner_data(&mut per_scanner_data);
1036        });
1037
1038        let scoped_ruleset = ScopedRuleSet::new(
1039            &compiled_rules
1040                .iter()
1041                .map(|rule| rule.scope.clone())
1042                .collect::<Vec<_>>(),
1043        )
1044        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1045
1046        {
1047            let stats = &*GLOBAL_STATS;
1048            stats.scanner_creations.increment(1);
1049            stats.increment_total_scanners();
1050        }
1051
1052        Ok(Scanner {
1053            rules: compiled_rules,
1054            scoped_ruleset,
1055            scanner_features: self.scanner_features,
1056            metrics: ScannerMetrics::new(&self.labels),
1057            match_validators_per_type,
1058            labels: self.labels,
1059            per_scanner_data,
1060            async_scan_timeout: self.async_scan_timeout,
1061        })
1062    }
1063}
1064
/// Content visitor used while scanning an event: for every visited string it runs
/// the applicable rules and records their synchronous matches and pending async jobs.
struct ScannerContentVisitor<'a, E: Encoding> {
    // Scanner whose compiled rules, labels and per-scanner data are used
    scanner: &'a Scanner,
    // Handed to each rule through `StringMatchesCtx`
    regex_caches: &'a mut RegexCaches,
    // Receives the synchronous matches found for each visited path
    rule_matches: &'a mut InternalRuleMatchSet<E>,
    // Rules that shall be skipped for this scan
    // This list shall be small (<10), so a linear search is acceptable
    blocked_rules: &'a Vec<usize>,
    // Handed to each rule through `StringMatchesCtx`
    // NOTE(review): presumably the match values collected from excluded scopes (multipass V0) - confirm
    excluded_matches: &'a mut AHashSet<String>,
    // Shared data initialized by the rules (`init_per_event_data`) and handed to them while matching
    per_event_data: SharedData,
    // Looked up by path; the entry found (if any) is given to the rules as `wildcard_indices`
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
    // Collects a job for every rule that reports `RuleStatus::Pending`
    async_jobs: &'a mut Vec<PendingRuleJob>,
    // Forwarded to the rules as `StringMatchesCtx::event_id`
    event_id: Option<String>,
}
1078
1079impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1080    fn visit_content<'b>(
1081        &'b mut self,
1082        path: &Path<'a>,
1083        content: &str,
1084        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1085        exclusion_check: ExclusionCheck<'b>,
1086    ) -> Result<bool, ScannerError> {
1087        // matches for a single path
1088        let mut path_rules_matches = vec![];
1089
1090        // Create a map of per rule type data that can be shared between rules of the same type
1091        let mut per_string_data = SharedData::new();
1092        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1093
1094        rule_visitor.visit_rule_indices(|rule_index| {
1095            if self.blocked_rules.contains(&rule_index) {
1096                return Ok(());
1097            }
1098            let rule = &self.scanner.rules[rule_index];
1099            {
1100                if rule.inner.allow_scanner_to_exclude_namespace() {
1101                    // check if the path is excluded
1102                    if exclusion_check.is_excluded(rule_index) {
1103                        return Ok(());
1104                    }
1105                }
1106                // creating the emitter is basically free, it will get mostly optimized away
1107                let mut emitter = |rule_match: StringMatch| {
1108                    // This should never happen, but to ensure no empty match is ever generated
1109                    // (which may cause an infinite loop), this will panic instead.
1110                    assert_ne!(
1111                        rule_match.start, rule_match.end,
1112                        "empty match detected on rule with index {rule_index}"
1113                    );
1114                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1115                };
1116
1117                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1118
1119                // TODO: move this somewhere higher?
1120                rule.init_per_event_data(&mut self.per_event_data);
1121
1122                let mut ctx = StringMatchesCtx {
1123                    rule_index,
1124                    regex_caches: self.regex_caches,
1125                    exclusion_check: &exclusion_check,
1126                    excluded_matches: self.excluded_matches,
1127                    match_emitter: &mut emitter,
1128                    wildcard_indices: wildcard_indices_per_path,
1129                    per_string_data: &mut per_string_data,
1130                    per_scanner_data: &self.scanner.per_scanner_data,
1131                    per_event_data: &mut self.per_event_data,
1132                    event_id: self.event_id.as_deref(),
1133                };
1134
1135                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1136
1137                match async_status {
1138                    RuleStatus::Done => {
1139                        // nothing to do
1140                    }
1141                    RuleStatus::Pending(fut) => {
1142                        self.async_jobs.push(PendingRuleJob {
1143                            fut,
1144                            path: path.into_static(),
1145                        });
1146                    }
1147                }
1148            }
1149            Ok(())
1150        })?;
1151
1152        // If there are any matches, the string will need to be accessed to check for false positives from
1153        // excluded matches, any to potentially mutate the string.
1154        // If there are any async jobs, this is also true since it's not known yet whether there
1155        // will be a match
1156        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1157
1158        self.rule_matches
1159            .push_sync_matches(path, path_rules_matches);
1160
1161        Ok(needs_to_access_content)
1162    }
1163}
1164
/// Calculates the next starting position for a regex search when the previous
/// match was a false positive.
///
/// The result is the byte offset of the first UTF-8 char that begins strictly
/// after the start of `regex_match` (only `regex_match.0` is used). `None` is
/// returned when no char begins after that offset, i.e. there is nothing left in
/// `content` to scan.
///
/// This never panics: a start offset that lies past the end of `content` yields
/// `None`, and a start offset that is not on a char boundary yields the next
/// boundary after it (if one exists before the end of the string).
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;

    // `content.len()` is excluded on purpose: an offset at the very end of the
    // string leaves no char to scan.
    (match_start.saturating_add(1)..content.len())
        .find(|&offset| content.is_char_boundary(offset))
}
1175
1176fn is_false_positive_match(
1177    regex_match_range: (usize, usize),
1178    rule: &RegexCompiledRule,
1179    content: &str,
1180    check_excluded_keywords: bool,
1181) -> bool {
1182    if check_excluded_keywords
1183        && let Some(excluded_keywords) = &rule.excluded_keywords
1184        && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1185    {
1186        return true;
1187    }
1188
1189    if let Some(validator) = rule.validator.as_ref()
1190        && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1191    {
1192        return true;
1193    }
1194    false
1195}