dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::{AHashMap, AHashSet};
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// A span of a scanned string reported by a rule as a match.
///
/// NOTE(review): the scanner slices content with UTF-8 byte offsets, so `start`/`end` are
/// presumably byte offsets with `end` exclusive — confirm against `InternalRuleMatch::new`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends.
    pub end: usize,
}
58
59pub trait MatchEmitter<T = ()> {
60    fn emit(&mut self, string_match: StringMatch) -> T;
61}
62
63// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
64// struct that implements MatchEmitter)
65impl<F, T> MatchEmitter<T> for F
66where
67    F: FnMut(StringMatch) -> T,
68{
69    fn emit(&mut self, string_match: StringMatch) -> T {
70        // This just calls the closure (itself)
71        (self)(string_match)
72    }
73}
74
75/// The precedence of a rule. Catchall is the lowest precedence, Specific is the highest precedence.
76/// The default precedence is Specific.
77/// For rules that:
78/// - Have the same mutation priority
79/// - Match at the same index
80/// - Match the same number of characters
81///
82/// Then the rule with the highest precedence will be used.
83#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy)]
84pub enum Precedence {
85    Catchall,
86    Generic,
87    Specific,
88}
89
90impl Default for Precedence {
91    fn default() -> Self {
92        Self::Specific
93    }
94}
95
96#[serde_as]
97#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
98pub struct RootRuleConfig<T> {
99    pub match_action: MatchAction,
100    #[serde(default)]
101    pub scope: Scope,
102    #[deprecated(note = "Use `third_party_active_checker` instead")]
103    match_validation_type: Option<MatchValidationType>,
104    third_party_active_checker: Option<MatchValidationType>,
105    suppressions: Option<Suppressions>,
106    #[serde(default)]
107    precedence: Precedence,
108    #[serde(flatten)]
109    pub inner: T,
110}
111
112impl<T> RootRuleConfig<T>
113where
114    T: RuleConfig + 'static,
115{
116    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
117        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
118    }
119
120    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
121        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
122    }
123}
124
125impl<T> RootRuleConfig<T> {
126    pub fn new(inner: T) -> Self {
127        #[allow(deprecated)]
128        Self {
129            match_action: MatchAction::None,
130            scope: Scope::all(),
131            match_validation_type: None,
132            third_party_active_checker: None,
133            suppressions: None,
134            precedence: Precedence::default(),
135            inner,
136        }
137    }
138
139    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
140        #[allow(deprecated)]
141        RootRuleConfig {
142            match_action: self.match_action,
143            scope: self.scope,
144            match_validation_type: self.match_validation_type,
145            third_party_active_checker: self.third_party_active_checker,
146            suppressions: self.suppressions,
147            precedence: self.precedence,
148            inner: func(self.inner),
149        }
150    }
151
152    pub fn match_action(mut self, action: MatchAction) -> Self {
153        self.match_action = action;
154        self
155    }
156
157    pub fn precedence(mut self, precedence: Precedence) -> Self {
158        self.precedence = precedence;
159        self
160    }
161
162    pub fn scope(mut self, scope: Scope) -> Self {
163        self.scope = scope;
164        self
165    }
166
167    pub fn third_party_active_checker(
168        mut self,
169        match_validation_type: MatchValidationType,
170    ) -> Self {
171        self.third_party_active_checker = Some(match_validation_type);
172        self
173    }
174
175    pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
176        self.suppressions = Some(suppressions);
177        self
178    }
179
180    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
181        #[allow(deprecated)]
182        self.third_party_active_checker
183            .as_ref()
184            .or(self.match_validation_type.as_ref())
185    }
186}
187
188impl<T> Deref for RootRuleConfig<T> {
189    type Target = T;
190
191    fn deref(&self) -> &Self::Target {
192        &self.inner
193    }
194}
195pub struct RootCompiledRule {
196    pub inner: Box<dyn CompiledRule>,
197    pub scope: Scope,
198    pub match_action: MatchAction,
199    pub match_validation_type: Option<MatchValidationType>,
200    pub suppressions: Option<CompiledSuppressions>,
201    pub precedence: Precedence,
202}
203
204impl RootCompiledRule {
205    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
206        self.match_validation_type
207            .as_ref()
208            .map(|x| x.get_internal_match_validation_type())
209    }
210}
211
212impl Deref for RootCompiledRule {
213    type Target = dyn CompiledRule;
214
215    fn deref(&self) -> &Self::Target {
216        self.inner.as_ref()
217    }
218}
219
220pub struct StringMatchesCtx<'a> {
221    rule_index: usize,
222    pub regex_caches: &'a mut RegexCaches,
223    pub exclusion_check: &'a ExclusionCheck<'a>,
224    pub excluded_matches: &'a mut AHashSet<String>,
225    pub match_emitter: &'a mut dyn MatchEmitter,
226    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
227
228    // Shared Data
229    pub per_string_data: &'a mut SharedData,
230    pub per_scanner_data: &'a SharedData,
231    pub per_event_data: &'a mut SharedData,
232    pub event_id: Option<&'a str>,
233}
234
235impl StringMatchesCtx<'_> {
236    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
237    /// this function can be used to return an "async job" to find matches. The return value
238    /// of `process_async` should be returned from the `get_string_matches` function. The future
239    /// passed into this function will be spawned and executed immediately without blocking
240    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
241    ///
242    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
243    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
244    /// it should be accessed before `process_async` is called.
245    pub fn process_async(
246        &self,
247        func: impl for<'a> FnOnce(
248            &'a mut AsyncStringMatchesCtx,
249        )
250            -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
251        + Send
252        + 'static,
253    ) -> RuleResult {
254        let rule_index = self.rule_index;
255
256        // The future is spawned onto the tokio runtime immediately so it starts running
257        // in the background
258        let fut = TOKIO_RUNTIME.spawn(async move {
259            let start = Instant::now();
260            let mut ctx = AsyncStringMatchesCtx {
261                rule_matches: vec![],
262            };
263            (func)(&mut ctx).await?;
264            let io_duration = start.elapsed();
265
266            Ok(AsyncRuleInfo {
267                rule_index,
268                rule_matches: ctx.rule_matches,
269                io_duration,
270            })
271        });
272
273        Ok(RuleStatus::Pending(fut))
274    }
275}
276
277pub struct AsyncStringMatchesCtx {
278    rule_matches: Vec<StringMatch>,
279}
280
281impl AsyncStringMatchesCtx {
282    pub fn emit_match(&mut self, string_match: StringMatch) {
283        self.rule_matches.push(string_match);
284    }
285}
286
287#[must_use]
288pub enum RuleStatus {
289    Done,
290    Pending(PendingRuleResult),
291}
292
293// pub type PendingRuleResult = BoxFuture<'static, Result<AsyncRuleInfo, ScannerError>>;
294pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
295
296pub struct PendingRuleJob {
297    fut: PendingRuleResult,
298    path: Path<'static>,
299}
300
301pub struct AsyncRuleInfo {
302    rule_index: usize,
303    rule_matches: Vec<StringMatch>,
304    io_duration: Duration,
305}
306
307/// A rule result that cannot be async
308pub type RuleResult = Result<RuleStatus, ScannerError>;
309
310// This is the public trait that is used to define the behavior of a compiled rule.
311pub trait CompiledRule: Send + Sync {
312    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
313        // by default, no per-scanner data is initialized
314    }
315
316    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
317        // by default, no per-string data is initialized
318    }
319
320    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
321        // by default, no per-event data is initialized
322    }
323
324    fn get_string_matches(
325        &self,
326        content: &str,
327        path: &Path,
328        ctx: &mut StringMatchesCtx<'_>,
329    ) -> RuleResult;
330
331    // Whether a match from this rule should be excluded (marked as a false-positive)
332    // if the content of this match was found in a match from an excluded scope
333    fn should_exclude_multipass_v0(&self) -> bool {
334        // default is to NOT use Multi-pass V0
335        false
336    }
337
338    fn on_excluded_match_multipass_v0(&self) {
339        // default is to do nothing
340    }
341
342    fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
343        None
344    }
345
346    fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
347        None
348    }
349
350    fn allow_scanner_to_exclude_namespace(&self) -> bool {
351        true
352    }
353}
354
355impl<T> RuleConfig for Box<T>
356where
357    T: RuleConfig + ?Sized,
358{
359    fn convert_to_compiled_rule(
360        &self,
361        rule_index: usize,
362        labels: Labels,
363    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
364        self.as_ref().convert_to_compiled_rule(rule_index, labels)
365    }
366}
367
/// Feature switches of a scanner.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// NOTE(review): presumably adds implicit index wildcards to scopes — confirm in the builder.
    pub add_implicit_index_wildcards: bool,
    /// Enables the "Multi-pass V0" filtering of matches equal to excluded matches.
    pub multipass_v0_enabled: bool,
    /// When set, the matched content is copied into the returned rule matches.
    pub return_matches: bool,
}
374
375impl Default for ScannerFeatures {
376    fn default() -> Self {
377        Self {
378            add_implicit_index_wildcards: false,
379            multipass_v0_enabled: true,
380            return_matches: false,
381        }
382    }
383}
384
385pub struct ScanOptions {
386    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
387    // this list shall be small (<10), so a linear search is acceptable otherwise performance will be impacted.
388    pub blocked_rules_idx: Vec<usize>,
389    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
390    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
391    // Whether to validate matches using third-party validators (e.g., checksum validation for credit cards).
392    // When enabled, the scanner automatically collects match content needed for validation.
393    pub validate_matches: bool,
394}
395
396impl Default for ScanOptions {
397    fn default() -> Self {
398        Self {
399            blocked_rules_idx: vec![],
400            wildcarded_indices: AHashMap::new(),
401            validate_matches: false,
402        }
403    }
404}
405
406pub struct ScanOptionBuilder {
407    blocked_rules_idx: Vec<usize>,
408    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
409    validate_matches: bool,
410}
411
412impl ScanOptionBuilder {
413    pub fn new() -> Self {
414        Self {
415            blocked_rules_idx: vec![],
416            wildcarded_indices: AHashMap::new(),
417            validate_matches: false,
418        }
419    }
420
421    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
422        self.blocked_rules_idx = blocked_rules_idx;
423        self
424    }
425
426    pub fn with_wildcarded_indices(
427        mut self,
428        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
429    ) -> Self {
430        self.wildcarded_indices = wildcarded_indices;
431        self
432    }
433
434    pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
435        self.validate_matches = validate_matches;
436        self
437    }
438
439    pub fn build(self) -> ScanOptions {
440        ScanOptions {
441            blocked_rules_idx: self.blocked_rules_idx,
442            wildcarded_indices: self.wildcarded_indices,
443            validate_matches: self.validate_matches,
444        }
445    }
446}
447
448pub struct Scanner {
449    rules: Vec<RootCompiledRule>,
450    scoped_ruleset: ScopedRuleSet,
451    scanner_features: ScannerFeatures,
452    metrics: ScannerMetrics,
453    labels: Labels,
454    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
455    per_scanner_data: SharedData,
456    async_scan_timeout: Duration,
457}
458
459impl Scanner {
460    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
461        ScannerBuilder::new(rules)
462    }
463
464    // This function scans the given event with the rules configured in the scanner.
465    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
466    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
467    // This version uses default scan options (no validation, no blocked rules, no wildcarded indices).
468    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
469        self.scan_with_options(event, ScanOptions::default())
470    }
471
472    // This function scans the given event with the rules configured in the scanner.
473    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
474    // The options parameter allows customizing the scan behavior (validation, blocked rules, etc.).
475    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
476    pub fn scan_with_options<E: Event>(
477        &self,
478        event: &mut E,
479        options: ScanOptions,
480    ) -> Result<Vec<RuleMatch>, ScannerError> {
481        block_on(self.internal_scan_with_metrics(event, options))
482    }
483
484    // This function scans the given event with the rules configured in the scanner.
485    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
486    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
487    pub async fn scan_async<E: Event>(
488        &self,
489        event: &mut E,
490    ) -> Result<Vec<RuleMatch>, ScannerError> {
491        self.scan_async_with_options(event, ScanOptions::default())
492            .await
493    }
494
495    pub async fn scan_async_with_options<E: Event>(
496        &self,
497        event: &mut E,
498        options: ScanOptions,
499    ) -> Result<Vec<RuleMatch>, ScannerError> {
500        let fut = self.internal_scan_with_metrics(event, options);
501
502        // The sleep from the timeout requires being in a tokio context
503        // The guard needs to be dropped before await since the guard is !Send
504        let timeout = {
505            let _tokio_guard = TOKIO_RUNTIME.enter();
506            timeout(self.async_scan_timeout, fut)
507        };
508
509        timeout.await.unwrap_or(Err(ScannerError::Transient(
510            "Async scan timeout".to_string(),
511        )))
512    }
513
514    fn record_metrics(
515        &self,
516        output_rule_matches: &[RuleMatch],
517        start: Instant,
518        io_duration: Option<Duration>,
519    ) {
520        // Add number of scanned events
521        self.metrics.num_scanned_events.increment(1);
522        // Add number of matches
523        self.metrics
524            .match_count
525            .increment(output_rule_matches.len() as u64);
526
527        if let Some(io_duration) = io_duration {
528            let total_duration = start.elapsed();
529            let cpu_duration = total_duration.saturating_sub(io_duration);
530            self.metrics
531                .cpu_duration
532                .increment(cpu_duration.as_nanos() as u64);
533        }
534    }
535
536    async fn internal_scan_with_metrics<E: Event>(
537        &self,
538        event: &mut E,
539        options: ScanOptions,
540    ) -> Result<Vec<RuleMatch>, ScannerError> {
541        let start = Instant::now();
542        let result = self.internal_scan(event, options).await;
543        match result {
544            Ok((rule_matches, io_duration)) => {
545                self.record_metrics(&rule_matches, start, Some(io_duration));
546                Ok(rule_matches)
547            }
548            Err(e) => {
549                self.record_metrics(&[], start, None);
550                Err(e)
551            }
552        }
553    }
554
555    fn process_rule_matches<E: Event>(
556        &self,
557        event: &mut E,
558        rule_matches: InternalRuleMatchSet<E::Encoding>,
559        excluded_matches: AHashSet<String>,
560        output_rule_matches: &mut Vec<RuleMatch>,
561        need_match_content: bool,
562    ) {
563        if rule_matches.is_empty() {
564            return;
565        }
566        access_regex_caches(|regex_caches| {
567            for (path, mut rule_matches) in rule_matches.into_iter() {
568                // All rule matches in each inner list are for a single path, so they can be processed independently.
569                event.visit_string_mut(&path, |content| {
570                    // calculate_indices requires that matches are sorted by start index
571                    rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
572
573                    <<E as Event>::Encoding>::calculate_indices(
574                        content,
575                        rule_matches.iter_mut().map(
576                            |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
577                                utf8_start: rule_match.utf8_start,
578                                utf8_end: rule_match.utf8_end,
579                                custom_start: &mut rule_match.custom_start,
580                                custom_end: &mut rule_match.custom_end,
581                            },
582                        ),
583                    );
584
585                    if self.scanner_features.multipass_v0_enabled {
586                        // Now that the `excluded_matches` set is fully populated, filter out any matches
587                        // that are the same as excluded matches (also known as "Multi-pass V0")
588                        rule_matches.retain(|rule_match| {
589                            if self.rules[rule_match.rule_index]
590                                .inner
591                                .should_exclude_multipass_v0()
592                            {
593                                let is_false_positive = excluded_matches
594                                    .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
595                                if is_false_positive && self.scanner_features.multipass_v0_enabled {
596                                    self.rules[rule_match.rule_index]
597                                        .on_excluded_match_multipass_v0();
598                                }
599                                !is_false_positive
600                            } else {
601                                true
602                            }
603                        });
604                    }
605
606                    self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
607
608                    self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
609
610                    let will_mutate = rule_matches.iter().any(|rule_match| {
611                        self.rules[rule_match.rule_index].match_action.is_mutating()
612                    });
613
614                    self.apply_match_actions(
615                        content,
616                        &path,
617                        &mut rule_matches,
618                        output_rule_matches,
619                        need_match_content,
620                    );
621
622                    will_mutate
623                });
624            }
625        });
626    }
627
628    async fn internal_scan<E: Event>(
629        &self,
630        event: &mut E,
631        options: ScanOptions,
632    ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
633        // If validation is requested, we need to collect match content even if the scanner
634        // wasn't originally configured to return matches
635        let need_match_content = self.scanner_features.return_matches || options.validate_matches;
636        // All matches, after some (but not all) false-positives have been removed.
637        let mut rule_matches = InternalRuleMatchSet::new();
638        let mut excluded_matches = AHashSet::new();
639        let mut async_jobs = vec![];
640
641        access_regex_caches(|regex_caches| {
642            self.scoped_ruleset.visit_string_rule_combinations(
643                event,
644                ScannerContentVisitor {
645                    scanner: self,
646                    regex_caches,
647                    rule_matches: &mut rule_matches,
648                    blocked_rules: &options.blocked_rules_idx,
649                    excluded_matches: &mut excluded_matches,
650                    per_event_data: SharedData::new(),
651                    wildcarded_indexes: &options.wildcarded_indices,
652                    async_jobs: &mut async_jobs,
653                    event_id: event.get_id().map(|s| s.to_string()),
654                },
655            )
656        })?;
657
658        // The async jobs were already spawned on the tokio runtime, so the
659        // results just need to be collected
660        let mut total_io_duration = Duration::ZERO;
661        for job in async_jobs {
662            let rule_info = job.fut.await.unwrap()?;
663            total_io_duration += rule_info.io_duration;
664            rule_matches.push_async_matches(
665                &job.path,
666                rule_info
667                    .rule_matches
668                    .into_iter()
669                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
670            );
671        }
672
673        let mut output_rule_matches = vec![];
674
675        self.process_rule_matches(
676            event,
677            rule_matches,
678            excluded_matches,
679            &mut output_rule_matches,
680            need_match_content,
681        );
682
683        if options.validate_matches {
684            self.validate_matches(&mut output_rule_matches);
685        }
686
687        Ok((output_rule_matches, total_io_duration))
688    }
689
690    pub fn suppress_matches<E: Encoding>(
691        &self,
692        rule_matches: &mut Vec<InternalRuleMatch<E>>,
693        content: &str,
694        regex_caches: &mut RegexCaches,
695    ) {
696        rule_matches.retain(|rule_match| {
697            if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
698                let match_should_be_suppressed = suppressions.should_match_be_suppressed(
699                    &content[rule_match.utf8_start..rule_match.utf8_end],
700                    regex_caches,
701                );
702
703                if match_should_be_suppressed {
704                    self.metrics.suppressed_match_count.increment(1);
705                }
706                !match_should_be_suppressed
707            } else {
708                true
709            }
710        });
711    }
712
713    pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
714        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
715        let mut match_validator_rule_match_per_type = AHashMap::new();
716
717        let mut validated_rule_matches = vec![];
718
719        for mut rule_match in rule_matches.drain(..) {
720            let rule = &self.rules[rule_match.rule_index];
721            if let Some(match_validation_type) = rule.internal_match_validation_type() {
722                match_validator_rule_match_per_type
723                    .entry(match_validation_type)
724                    .or_insert_with(Vec::new)
725                    .push(rule_match)
726            } else {
727                // There is no match validator for this rule, so mark it as not available.
728                rule_match.match_status.merge(MatchStatus::NotAvailable);
729                validated_rule_matches.push(rule_match);
730            }
731        }
732
733        RAYON_THREAD_POOL.install(|| {
734            use rayon::prelude::*;
735
736            match_validator_rule_match_per_type.par_iter_mut().for_each(
737                |(match_validation_type, matches_per_type)| {
738                    let match_validator = self.match_validators_per_type.get(match_validation_type);
739                    if let Some(match_validator) = match_validator {
740                        match_validator
741                            .as_ref()
742                            .validate(matches_per_type, &self.rules)
743                    }
744                },
745            );
746        });
747
748        // Refill the rule_matches with the validated matches
749        for (_, mut matches) in match_validator_rule_match_per_type {
750            validated_rule_matches.append(&mut matches);
751        }
752
753        // Sort rule_matches by start index
754        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
755        *rule_matches = validated_rule_matches;
756    }
757
758    /// Apply mutations from actions, and shift indices to match the mutated values.
759    /// This assumes the matches are all from the content given, and are sorted by start index.
760    fn apply_match_actions<E: Encoding>(
761        &self,
762        content: &mut String,
763        path: &Path<'static>,
764        rule_matches: &mut [InternalRuleMatch<E>],
765        output_rule_matches: &mut Vec<RuleMatch>,
766        need_match_content: bool,
767    ) {
768        let mut utf8_byte_delta: isize = 0;
769        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
770
771        for rule_match in rule_matches {
772            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
773                content,
774                path.clone(),
775                rule_match,
776                &mut utf8_byte_delta,
777                &mut custom_index_delta,
778                need_match_content,
779            ));
780        }
781    }
782
783    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
784    fn apply_match_actions_for_string<E: Encoding>(
785        &self,
786        content: &mut String,
787        path: Path<'static>,
788        rule_match: &InternalRuleMatch<E>,
789        // The current difference in length between the original and mutated string
790        utf8_byte_delta: &mut isize,
791
792        // The difference between the custom index on the original string and the mutated string
793        custom_index_delta: &mut <E>::IndexShift,
794        need_match_content: bool,
795    ) -> RuleMatch {
796        let rule = &self.rules[rule_match.rule_index];
797
798        let custom_start =
799            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
800                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
801
802        let mut matched_content_copy = None;
803
804        if need_match_content {
805            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
806            let mutated_utf8_match_start =
807                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
808            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
809
810            // Matches for mutating rules must have valid indices
811            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
812            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
813
814            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
815            matched_content_copy = Some(matched_content.to_string());
816        }
817
818        if rule.match_action.is_mutating() {
819            let mutated_utf8_match_start =
820                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
821            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
822
823            // Matches for mutating rules must have valid indices
824            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
825            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
826
827            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
828            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
829                let before_replacement = &matched_content[replacement.start..replacement.end];
830
831                // update indices to match the new mutated content
832                <E>::adjust_shift(
833                    custom_index_delta,
834                    before_replacement,
835                    &replacement.replacement,
836                );
837                *utf8_byte_delta +=
838                    replacement.replacement.len() as isize - before_replacement.len() as isize;
839
840                let replacement_start = mutated_utf8_match_start + replacement.start;
841                let replacement_end = mutated_utf8_match_start + replacement.end;
842                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
843            }
844        }
845
846        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
847        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
848            + shift_offset) as usize;
849
850        let rule = &self.rules[rule_match.rule_index];
851
852        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
853            MatchStatus::NotChecked
854        } else {
855            MatchStatus::NotAvailable
856        };
857
858        RuleMatch {
859            rule_index: rule_match.rule_index,
860            path,
861            replacement_type: rule.match_action.replacement_type(),
862            start_index: custom_start,
863            end_index_exclusive: custom_end,
864            shift_offset,
865            match_value: matched_content_copy,
866            match_status,
867        }
868    }
869
870    fn sort_and_remove_overlapping_rules<E: Encoding>(
871        &self,
872        rule_matches: &mut Vec<InternalRuleMatch<E>>,
873    ) {
874        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
875        // Be very careful if this function is modified.
876
877        rule_matches.sort_unstable_by(|a, b| {
878            // Mutating rules are a higher priority (earlier in the list)
879            let ord = self.rules[a.rule_index]
880                .match_action
881                .is_mutating()
882                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
883                .reverse();
884
885            // Earlier start offset
886            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
887
888            // Longer matches
889            let ord = ord.then(a.len().cmp(&b.len()).reverse());
890
891            // Matches with higher precedence come first
892            let ord = ord.then(
893                self.rules[a.rule_index]
894                    .precedence
895                    .cmp(&self.rules[b.rule_index].precedence)
896                    .reverse(),
897            );
898
899            // Matches from earlier rules
900            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
901
902            // swap the order of everything so matches can be efficiently popped off the back as they are processed
903            ord.reverse()
904        });
905
906        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
907
908        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
909            if self.rules[rule_match.rule_index].match_action.is_mutating() {
910                // Mutating rules are kept only if they don't overlap with a previous rule.
911                if let Some(last) = retained_rules.last()
912                    && last.utf8_end > rule_match.utf8_start
913                {
914                    continue;
915                }
916            } else {
917                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
918                // this needs to check all retained matches (instead of just the last one)
919                for retained_rule in &retained_rules {
920                    if retained_rule.utf8_start < rule_match.utf8_end
921                        && retained_rule.utf8_end > rule_match.utf8_start
922                    {
923                        continue 'rule_matches;
924                    }
925                }
926            };
927            retained_rules.push(rule_match);
928        }
929
930        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
931        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
932
933        *rule_matches = retained_rules;
934    }
935}
936
937impl Drop for Scanner {
938    fn drop(&mut self) {
939        let stats = &*GLOBAL_STATS;
940        stats.scanner_deletions.increment(1);
941        stats.decrement_total_scanners();
942    }
943}
944
945#[derive(Default)]
946pub struct ScannerBuilder<'a> {
947    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
948    labels: Labels,
949    scanner_features: ScannerFeatures,
950    async_scan_timeout: Duration,
951}
952
953impl ScannerBuilder<'_> {
954    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
955        ScannerBuilder {
956            rules,
957            labels: Labels::empty(),
958            scanner_features: ScannerFeatures::default(),
959            async_scan_timeout: Duration::from_secs(60 * 5),
960        }
961    }
962
963    pub fn labels(mut self, labels: Labels) -> Self {
964        self.labels = labels;
965        self
966    }
967
968    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
969        self.async_scan_timeout = duration;
970        self
971    }
972
973    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
974        self.scanner_features.add_implicit_index_wildcards = value;
975        self
976    }
977
978    pub fn with_return_matches(mut self, value: bool) -> Self {
979        self.scanner_features.return_matches = value;
980        self
981    }
982
983    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
984    /// Multipass V0 saves matches from excluded scopes, and marks any identical
985    /// matches in included scopes as a false positive.
986    pub fn with_multipass_v0(mut self, value: bool) -> Self {
987        self.scanner_features.multipass_v0_enabled = value;
988        self
989    }
990
991    pub fn build(self) -> Result<Scanner, CreateScannerError> {
992        let mut match_validators_per_type = AHashMap::new();
993
994        for rule in self.rules.iter() {
995            if let Some(match_validation_type) = &rule.get_third_party_active_checker()
996                && match_validation_type.can_create_match_validator()
997            {
998                let internal_type = match_validation_type.get_internal_match_validation_type();
999                let match_validator = match_validation_type.into_match_validator();
1000                if let Ok(match_validator) = match_validator {
1001                    if !match_validators_per_type.contains_key(&internal_type) {
1002                        match_validators_per_type.insert(internal_type, match_validator);
1003                    }
1004                } else {
1005                    return Err(CreateScannerError::InvalidMatchValidator(
1006                        MatchValidatorCreationError::InternalError,
1007                    ));
1008                }
1009            }
1010        }
1011
1012        let compiled_rules = self
1013            .rules
1014            .iter()
1015            .enumerate()
1016            .map(|(rule_index, config)| {
1017                let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1018                config.match_action.validate()?;
1019                let compiled_suppressions = match &config.suppressions {
1020                    Some(s) => s.compile()?,
1021                    None => None,
1022                };
1023                Ok(RootCompiledRule {
1024                    inner,
1025                    scope: config.scope.clone(),
1026                    match_action: config.match_action.clone(),
1027                    match_validation_type: config.get_third_party_active_checker().cloned(),
1028                    suppressions: compiled_suppressions,
1029                    precedence: config.precedence,
1030                })
1031            })
1032            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1033
1034        let mut per_scanner_data = SharedData::new();
1035
1036        compiled_rules.iter().for_each(|rule| {
1037            rule.init_per_scanner_data(&mut per_scanner_data);
1038        });
1039
1040        let scoped_ruleset = ScopedRuleSet::new(
1041            &compiled_rules
1042                .iter()
1043                .map(|rule| rule.scope.clone())
1044                .collect::<Vec<_>>(),
1045        )
1046        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1047
1048        {
1049            let stats = &*GLOBAL_STATS;
1050            stats.scanner_creations.increment(1);
1051            stats.increment_total_scanners();
1052        }
1053
1054        Ok(Scanner {
1055            rules: compiled_rules,
1056            scoped_ruleset,
1057            scanner_features: self.scanner_features,
1058            metrics: ScannerMetrics::new(&self.labels),
1059            match_validators_per_type,
1060            labels: self.labels,
1061            per_scanner_data,
1062            async_scan_timeout: self.async_scan_timeout,
1063        })
1064    }
1065}
1066
/// State carried through one scan while the scoped ruleset visits the strings of an event.
///
/// `visit_content` runs the applicable rules on each visited string, stores the matches
/// found synchronously in `rule_matches` and queues pending rule jobs in `async_jobs`.
struct ScannerContentVisitor<'a, E: Encoding> {
    // Scanner providing the compiled rules, the labels and the per-scanner shared data
    scanner: &'a Scanner,
    // Regex caches handed to every rule through the string match context
    regex_caches: &'a mut RegexCaches,
    // Receives, path by path, the matches that rules emitted synchronously
    rule_matches: &'a mut InternalRuleMatchSet<E>,
    // Rules that shall be skipped for this scan
    // This list shall be small (<10), so a linear search is acceptable
    blocked_rules: &'a Vec<usize>,
    // Handed to every rule through the string match context
    // NOTE(review): presumably the match contents collected from excluded scopes
    // (Multipass V0) - confirm against the rule implementations
    excluded_matches: &'a mut AHashSet<String>,
    // Shared data initialized by the rules (`init_per_event_data`) and handed to them
    // through the string match context
    per_event_data: SharedData,
    // Looked up with the visited path; the entry found (if any) is handed to the rules
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
    // Collects the jobs of rules that returned `RuleStatus::Pending`
    async_jobs: &'a mut Vec<PendingRuleJob>,
    // Optional event identifier, handed to the rules as `Option<&str>`
    event_id: Option<String>,
}
1080
1081impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1082    fn visit_content<'b>(
1083        &'b mut self,
1084        path: &Path<'a>,
1085        content: &str,
1086        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1087        exclusion_check: ExclusionCheck<'b>,
1088    ) -> Result<bool, ScannerError> {
1089        // matches for a single path
1090        let mut path_rules_matches = vec![];
1091
1092        // Create a map of per rule type data that can be shared between rules of the same type
1093        let mut per_string_data = SharedData::new();
1094        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1095
1096        rule_visitor.visit_rule_indices(|rule_index| {
1097            if self.blocked_rules.contains(&rule_index) {
1098                return Ok(());
1099            }
1100            let rule = &self.scanner.rules[rule_index];
1101            {
1102                if rule.inner.allow_scanner_to_exclude_namespace() {
1103                    // check if the path is excluded
1104                    if exclusion_check.is_excluded(rule_index) {
1105                        return Ok(());
1106                    }
1107                }
1108                // creating the emitter is basically free, it will get mostly optimized away
1109                let mut emitter = |rule_match: StringMatch| {
1110                    // This should never happen, but to ensure no empty match is ever generated
1111                    // (which may cause an infinite loop), this will panic instead.
1112                    assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1113                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1114                };
1115
1116                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1117
1118                // TODO: move this somewhere higher?
1119                rule.init_per_event_data(&mut self.per_event_data);
1120
1121                let mut ctx = StringMatchesCtx {
1122                    rule_index,
1123                    regex_caches: self.regex_caches,
1124                    exclusion_check: &exclusion_check,
1125                    excluded_matches: self.excluded_matches,
1126                    match_emitter: &mut emitter,
1127                    wildcard_indices: wildcard_indices_per_path,
1128                    per_string_data: &mut per_string_data,
1129                    per_scanner_data: &self.scanner.per_scanner_data,
1130                    per_event_data: &mut self.per_event_data,
1131                    event_id: self.event_id.as_deref(),
1132                };
1133
1134                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1135
1136                match async_status {
1137                    RuleStatus::Done => {
1138                        // nothing to do
1139                    }
1140                    RuleStatus::Pending(fut) => {
1141                        self.async_jobs.push(PendingRuleJob {
1142                            fut,
1143                            path: path.into_static(),
1144                        });
1145                    }
1146                }
1147            }
1148            Ok(())
1149        })?;
1150
1151        // If there are any matches, the string will need to be accessed to check for false positives from
1152        // excluded matches, any to potentially mutate the string.
1153        // If there are any async jobs, this is also true since it's not known yet whether there
1154        // will be a match
1155        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1156
1157        self.rule_matches
1158            .push_sync_matches(path, path_rules_matches);
1159
1160        Ok(needs_to_access_content)
1161    }
1162}
1163
/// Calculates the next starting position for a regex match when the previous match was a
/// false positive.
///
/// `regex_match` is the `(start, end)` byte range of the previous match. The result is the
/// byte offset of the first UTF-8 character following the one at `start`, or `None` when
/// no character follows it.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;

    // Skip the character the match started on; the next character boundary (if any) is
    // where scanning resumes.
    content[match_start..]
        .char_indices()
        .skip(1)
        .next()
        .map(|(offset, _)| match_start + offset)
}
1174
1175fn is_false_positive_match(
1176    regex_match_range: (usize, usize),
1177    rule: &RegexCompiledRule,
1178    content: &str,
1179    check_excluded_keywords: bool,
1180) -> bool {
1181    if check_excluded_keywords
1182        && let Some(excluded_keywords) = &rule.excluded_keywords
1183        && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1184    {
1185        return true;
1186    }
1187
1188    if let Some(validator) = rule.validator.as_ref()
1189        && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1190    {
1191        return true;
1192    }
1193    false
1194}