dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::AHashMap;
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// A single match reported by a rule for one scanned string.
///
/// NOTE(review): `start` / `end` are presumably UTF-8 byte offsets into the scanned
/// content with an exclusive `end` — confirm against `InternalRuleMatch::new`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends.
    pub end: usize,
    /// The keyword that was used to match this rule. Optional, only some rules set it.
    pub keyword: Option<String>,
}
60
61pub trait MatchEmitter<T = ()> {
62    fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
66// struct that implements MatchEmitter)
67impl<F, T> MatchEmitter<T> for F
68where
69    F: FnMut(StringMatch) -> T,
70{
71    fn emit(&mut self, string_match: StringMatch) -> T {
72        // This just calls the closure (itself)
73        (self)(string_match)
74    }
75}
76
77/// The precedence of a rule. Catchall is the lowest precedence, Specific is the highest precedence.
78/// The default precedence is Specific.
79/// For rules that:
80/// - Have the same mutation priority
81/// - Match at the same index
82/// - Match the same number of characters
83///
84/// Then the rule with the highest precedence will be used.
85#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Default)]
86pub enum Precedence {
87    Catchall,
88    Generic,
89    #[default]
90    Specific,
91}
92
93#[serde_as]
94#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
95pub struct RootRuleConfig<T> {
96    pub match_action: MatchAction,
97    #[serde(default)]
98    pub scope: Scope,
99    #[deprecated(note = "Use `third_party_active_checker` instead")]
100    match_validation_type: Option<MatchValidationType>,
101    third_party_active_checker: Option<MatchValidationType>,
102    suppressions: Option<Suppressions>,
103    #[serde(default)]
104    precedence: Precedence,
105    #[serde(default)]
106    pub is_supporting_rule: bool,
107    #[serde(flatten)]
108    pub inner: T,
109}
110
111impl<T> RootRuleConfig<T>
112where
113    T: RuleConfig + 'static,
114{
115    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
116        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
117    }
118
119    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
120        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
121    }
122}
123
124impl<T> RootRuleConfig<T> {
125    pub fn new(inner: T) -> Self {
126        #[allow(deprecated)]
127        Self {
128            match_action: MatchAction::None,
129            scope: Scope::all(),
130            match_validation_type: None,
131            third_party_active_checker: None,
132            suppressions: None,
133            precedence: Precedence::default(),
134            is_supporting_rule: false,
135            inner,
136        }
137    }
138
139    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
140        #[allow(deprecated)]
141        RootRuleConfig {
142            match_action: self.match_action,
143            scope: self.scope,
144            match_validation_type: self.match_validation_type,
145            third_party_active_checker: self.third_party_active_checker,
146            suppressions: self.suppressions,
147            precedence: self.precedence,
148            is_supporting_rule: self.is_supporting_rule,
149            inner: func(self.inner),
150        }
151    }
152
153    pub fn match_action(mut self, action: MatchAction) -> Self {
154        self.match_action = action;
155        self
156    }
157
158    pub fn precedence(mut self, precedence: Precedence) -> Self {
159        self.precedence = precedence;
160        self
161    }
162
163    pub fn scope(mut self, scope: Scope) -> Self {
164        self.scope = scope;
165        self
166    }
167
168    pub fn third_party_active_checker(
169        mut self,
170        match_validation_type: MatchValidationType,
171    ) -> Self {
172        self.third_party_active_checker = Some(match_validation_type);
173        self
174    }
175
176    pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
177        self.suppressions = Some(suppressions);
178        self
179    }
180
181    pub fn is_supporting_rule(mut self, value: bool) -> Self {
182        self.is_supporting_rule = value;
183        self
184    }
185
186    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
187        #[allow(deprecated)]
188        self.third_party_active_checker
189            .as_ref()
190            .or(self.match_validation_type.as_ref())
191    }
192}
193
194impl<T> Deref for RootRuleConfig<T> {
195    type Target = T;
196
197    fn deref(&self) -> &Self::Target {
198        &self.inner
199    }
200}
201pub struct RootCompiledRule {
202    pub inner: Box<dyn CompiledRule>,
203    pub scope: Scope,
204    pub match_action: MatchAction,
205    pub match_validation_type: Option<MatchValidationType>,
206    pub suppressions: Option<CompiledSuppressions>,
207    pub precedence: Precedence,
208    pub is_supporting_rule: bool,
209}
210
211impl RootCompiledRule {
212    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
213        self.match_validation_type
214            .as_ref()
215            .map(|x| x.get_internal_match_validation_type())
216    }
217}
218
219impl Deref for RootCompiledRule {
220    type Target = dyn CompiledRule;
221
222    fn deref(&self) -> &Self::Target {
223        self.inner.as_ref()
224    }
225}
226
227pub struct StringMatchesCtx<'a> {
228    rule_index: usize,
229    pub regex_caches: &'a mut RegexCaches,
230    pub exclusion_check: &'a ExclusionCheck<'a>,
231    pub excluded_matches: &'a mut AHashMap<String, String>,
232    pub match_emitter: &'a mut dyn MatchEmitter,
233    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
234    pub enable_debug_observability: bool,
235
236    // Shared Data
237    pub per_string_data: &'a mut SharedData,
238    pub per_scanner_data: &'a SharedData,
239    pub per_event_data: &'a mut SharedData,
240    pub event_id: Option<&'a str>,
241}
242
243impl StringMatchesCtx<'_> {
244    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
245    /// this function can be used to return an "async job" to find matches. The return value
246    /// of `process_async` should be returned from the `get_string_matches` function. The future
247    /// passed into this function will be spawned and executed immediately without blocking
248    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
249    ///
250    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
251    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
252    /// it should be accessed before `process_async` is called.
253    pub fn process_async(
254        &self,
255        func: impl for<'a> FnOnce(
256            &'a mut AsyncStringMatchesCtx,
257        )
258            -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
259        + Send
260        + 'static,
261    ) -> RuleResult {
262        let rule_index = self.rule_index;
263
264        // The future is spawned onto the tokio runtime immediately so it starts running
265        // in the background
266        let fut = TOKIO_RUNTIME.spawn(async move {
267            let start = Instant::now();
268            let mut ctx = AsyncStringMatchesCtx {
269                rule_matches: vec![],
270            };
271            (func)(&mut ctx).await?;
272            let io_duration = start.elapsed();
273
274            Ok(AsyncRuleInfo {
275                rule_index,
276                rule_matches: ctx.rule_matches,
277                io_duration,
278            })
279        });
280
281        Ok(RuleStatus::Pending(fut))
282    }
283}
284
285pub struct AsyncStringMatchesCtx {
286    rule_matches: Vec<StringMatch>,
287}
288
289impl AsyncStringMatchesCtx {
290    pub fn emit_match(&mut self, string_match: StringMatch) {
291        self.rule_matches.push(string_match);
292    }
293}
294
295#[must_use]
296pub enum RuleStatus {
297    Done,
298    Pending(PendingRuleResult),
299}
300
301// pub type PendingRuleResult = BoxFuture<'static, Result<AsyncRuleInfo, ScannerError>>;
302pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
303
304pub struct PendingRuleJob {
305    fut: PendingRuleResult,
306    path: Path<'static>,
307}
308
309pub struct AsyncRuleInfo {
310    rule_index: usize,
311    rule_matches: Vec<StringMatch>,
312    io_duration: Duration,
313}
314
315/// A rule result that cannot be async
316pub type RuleResult = Result<RuleStatus, ScannerError>;
317
318// This is the public trait that is used to define the behavior of a compiled rule.
319pub trait CompiledRule: Send + Sync {
320    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
321        // by default, no per-scanner data is initialized
322    }
323
324    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
325        // by default, no per-string data is initialized
326    }
327
328    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
329        // by default, no per-event data is initialized
330    }
331
332    fn get_string_matches(
333        &self,
334        content: &str,
335        path: &Path,
336        ctx: &mut StringMatchesCtx<'_>,
337    ) -> RuleResult;
338
339    // Whether a match from this rule should be excluded (marked as a false-positive)
340    // if the content of this match was found in a match from an excluded scope
341    fn should_exclude_multipass_v0(&self) -> bool {
342        // default is to NOT use Multi-pass V0
343        false
344    }
345
346    fn on_excluded_match_multipass_v0(
347        &self,
348        _path: &Path,
349        _excluded_path: &str,
350        _enable_debug_observability: bool,
351    ) {
352        // default is to do nothing
353    }
354
355    fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
356        None
357    }
358
359    fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
360        None
361    }
362
363    fn allow_scanner_to_exclude_namespace(&self) -> bool {
364        true
365    }
366}
367
368impl<T> RuleConfig for Box<T>
369where
370    T: RuleConfig + ?Sized,
371{
372    fn convert_to_compiled_rule(
373        &self,
374        rule_index: usize,
375        labels: Labels,
376    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
377        self.as_ref().convert_to_compiled_rule(rule_index, labels)
378    }
379}
380
/// Feature switches that change how a scanner behaves.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Whether implicit index wildcards are added.
    pub add_implicit_index_wildcards: bool,
    /// Whether matches equal to excluded matches are filtered out ("Multi-pass V0").
    pub multipass_v0_enabled: bool,
    /// Whether the matched content is collected alongside each match.
    pub return_matches: bool,
    /// Whether debug observability is enabled.
    pub enable_debug_observability: bool,
}
388
389impl Default for ScannerFeatures {
390    fn default() -> Self {
391        Self {
392            add_implicit_index_wildcards: false,
393            multipass_v0_enabled: true,
394            return_matches: false,
395            enable_debug_observability: false,
396        }
397    }
398}
399
400pub struct ScanOptions {
401    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
402    // this list shall be small (<10), so a linear search is acceptable otherwise performance will be impacted.
403    pub blocked_rules_idx: Vec<usize>,
404    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
405    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
406    // Whether to validate matches using third-party validators (e.g., checksum validation for credit cards).
407    // When enabled, the scanner automatically collects match content needed for validation.
408    pub validate_matches: bool,
409}
410
411impl Default for ScanOptions {
412    fn default() -> Self {
413        Self {
414            blocked_rules_idx: vec![],
415            wildcarded_indices: AHashMap::new(),
416            validate_matches: false,
417        }
418    }
419}
420
421pub struct ScanOptionBuilder {
422    blocked_rules_idx: Vec<usize>,
423    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
424    validate_matches: bool,
425}
426
427impl ScanOptionBuilder {
428    pub fn new() -> Self {
429        Self {
430            blocked_rules_idx: vec![],
431            wildcarded_indices: AHashMap::new(),
432            validate_matches: false,
433        }
434    }
435
436    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
437        self.blocked_rules_idx = blocked_rules_idx;
438        self
439    }
440
441    pub fn with_wildcarded_indices(
442        mut self,
443        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
444    ) -> Self {
445        self.wildcarded_indices = wildcarded_indices;
446        self
447    }
448
449    pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
450        self.validate_matches = validate_matches;
451        self
452    }
453
454    pub fn build(self) -> ScanOptions {
455        ScanOptions {
456            blocked_rules_idx: self.blocked_rules_idx,
457            wildcarded_indices: self.wildcarded_indices,
458            validate_matches: self.validate_matches,
459        }
460    }
461}
462
463pub struct Scanner {
464    rules: Vec<RootCompiledRule>,
465    scoped_ruleset: ScopedRuleSet,
466    scanner_features: ScannerFeatures,
467    metrics: ScannerMetrics,
468    labels: Labels,
469    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
470    per_scanner_data: SharedData,
471    async_scan_timeout: Duration,
472}
473
474impl Scanner {
475    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
476        ScannerBuilder::new(rules)
477    }
478
479    // This function scans the given event with the rules configured in the scanner.
480    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
481    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
482    // This version uses default scan options (no validation, no blocked rules, no wildcarded indices).
483    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
484        self.scan_with_options(event, ScanOptions::default())
485    }
486
487    // This function scans the given event with the rules configured in the scanner.
488    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
489    // The options parameter allows customizing the scan behavior (validation, blocked rules, etc.).
490    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
491    pub fn scan_with_options<E: Event>(
492        &self,
493        event: &mut E,
494        options: ScanOptions,
495    ) -> Result<Vec<RuleMatch>, ScannerError> {
496        let start = Instant::now();
497        let validate = options.validate_matches;
498        // Collect matches inside block_on, then run finalize_matches (which uses rayon) outside of
499        // it to avoid re-entrancy between the futures LocalPool executor and RAYON_THREAD_POOL.
500        let result = block_on(self.internal_scan_collect(event, options));
501        match result {
502            Ok((mut rule_matches, io_duration)) => {
503                self.finalize_matches(&mut rule_matches, validate);
504                self.record_metrics(&rule_matches, start, Some(io_duration));
505                Ok(rule_matches)
506            }
507            Err(e) => {
508                self.record_metrics(&[], start, None);
509                Err(e)
510            }
511        }
512    }
513
514    // This function scans the given event with the rules configured in the scanner.
515    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
516    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
517    pub async fn scan_async<E: Event>(
518        &self,
519        event: &mut E,
520    ) -> Result<Vec<RuleMatch>, ScannerError> {
521        self.scan_async_with_options(event, ScanOptions::default())
522            .await
523    }
524
525    pub async fn scan_async_with_options<E: Event>(
526        &self,
527        event: &mut E,
528        options: ScanOptions,
529    ) -> Result<Vec<RuleMatch>, ScannerError> {
530        let start = Instant::now();
531        let validate = options.validate_matches;
532        let fut = self.internal_scan_collect(event, options);
533
534        // The sleep from the timeout requires being in a tokio context
535        // The guard needs to be dropped before await since the guard is !Send
536        let timeout_result = {
537            let _tokio_guard = TOKIO_RUNTIME.enter();
538            timeout(self.async_scan_timeout, fut)
539        };
540
541        let result = timeout_result.await.unwrap_or(Err(ScannerError::Transient(
542            "Async scan timeout".to_string(),
543        )));
544
545        match result {
546            Ok((mut rule_matches, io_duration)) => {
547                self.finalize_matches(&mut rule_matches, validate);
548                self.record_metrics(&rule_matches, start, Some(io_duration));
549                Ok(rule_matches)
550            }
551            Err(e) => {
552                self.record_metrics(&[], start, None);
553                Err(e)
554            }
555        }
556    }
557
558    fn record_metrics(
559        &self,
560        output_rule_matches: &[RuleMatch],
561        start: Instant,
562        io_duration: Option<Duration>,
563    ) {
564        // Add number of scanned events
565        self.metrics.num_scanned_events.increment(1);
566        // Add number of matches
567        self.metrics
568            .match_count
569            .increment(output_rule_matches.len() as u64);
570
571        if let Some(io_duration) = io_duration {
572            let total_duration = start.elapsed();
573            let cpu_duration = total_duration.saturating_sub(io_duration);
574            self.metrics
575                .cpu_duration
576                .increment(cpu_duration.as_nanos() as u64);
577        }
578    }
579
580    fn process_rule_matches<E: Event>(
581        &self,
582        event: &mut E,
583        rule_matches: InternalRuleMatchSet<E::Encoding>,
584        excluded_matches: AHashMap<String, String>,
585        output_rule_matches: &mut Vec<RuleMatch>,
586        need_match_content: bool,
587    ) {
588        if rule_matches.is_empty() {
589            return;
590        }
591        access_regex_caches(|regex_caches| {
592            for (path, mut rule_matches) in rule_matches.into_iter() {
593                // All rule matches in each inner list are for a single path, so they can be processed independently.
594                event.visit_string_mut(&path, |content| {
595                    // calculate_indices requires that matches are sorted by start index
596                    rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
597
598                    <<E as Event>::Encoding>::calculate_indices(
599                        content,
600                        rule_matches.iter_mut().map(
601                            |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
602                                utf8_start: rule_match.utf8_start,
603                                utf8_end: rule_match.utf8_end,
604                                custom_start: &mut rule_match.custom_start,
605                                custom_end: &mut rule_match.custom_end,
606                            },
607                        ),
608                    );
609
610                    if self.scanner_features.multipass_v0_enabled {
611                        // Now that the `excluded_matches` set is fully populated, filter out any matches
612                        // that are the same as excluded matches (also known as "Multi-pass V0")
613                        rule_matches.retain(|rule_match| {
614                            if self.rules[rule_match.rule_index]
615                                .inner
616                                .should_exclude_multipass_v0()
617                            {
618                                let match_content =
619                                    &content[rule_match.utf8_start..rule_match.utf8_end];
620                                let excluded_path = excluded_matches.get(match_content);
621                                if let Some(excluded_path) = excluded_path {
622                                    self.rules[rule_match.rule_index]
623                                        .on_excluded_match_multipass_v0(
624                                            &path,
625                                            excluded_path,
626                                            self.scanner_features.enable_debug_observability,
627                                        );
628                                }
629                                excluded_path.is_none()
630                            } else {
631                                true
632                            }
633                        });
634                    }
635
636                    self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
637
638                    self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
639
640                    let will_mutate = rule_matches.iter().any(|rule_match| {
641                        self.rules[rule_match.rule_index].match_action.is_mutating()
642                    });
643
644                    self.apply_match_actions(
645                        content,
646                        &path,
647                        rule_matches,
648                        output_rule_matches,
649                        need_match_content,
650                    );
651
652                    will_mutate
653                });
654            }
655        });
656    }
657
658    async fn internal_scan_collect<E: Event>(
659        &self,
660        event: &mut E,
661        options: ScanOptions,
662    ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
663        // If validation is requested, we need to collect match content even if the scanner
664        // wasn't originally configured to return matches
665        let need_match_content = self.scanner_features.return_matches || options.validate_matches;
666        // All matches, after some (but not all) false-positives have been removed.
667        let mut rule_matches = InternalRuleMatchSet::new();
668        let mut excluded_matches = AHashMap::new();
669        let mut async_jobs = vec![];
670
671        access_regex_caches(|regex_caches| {
672            self.scoped_ruleset.visit_string_rule_combinations(
673                event,
674                ScannerContentVisitor {
675                    scanner: self,
676                    regex_caches,
677                    rule_matches: &mut rule_matches,
678                    blocked_rules: &options.blocked_rules_idx,
679                    excluded_matches: &mut excluded_matches,
680                    per_event_data: SharedData::new(),
681                    wildcarded_indexes: &options.wildcarded_indices,
682                    async_jobs: &mut async_jobs,
683                    event_id: event.get_id().map(|s| s.to_string()),
684                },
685            )
686        })?;
687
688        // The async jobs were already spawned on the tokio runtime, so the
689        // results just need to be collected
690        let mut total_io_duration = Duration::ZERO;
691        for job in async_jobs {
692            let rule_info = job.fut.await.unwrap()?;
693            total_io_duration += rule_info.io_duration;
694            rule_matches.push_async_matches(
695                &job.path,
696                rule_info
697                    .rule_matches
698                    .into_iter()
699                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
700            );
701        }
702
703        let mut output_rule_matches = vec![];
704
705        self.process_rule_matches(
706            event,
707            rule_matches,
708            excluded_matches,
709            &mut output_rule_matches,
710            need_match_content,
711        );
712
713        Ok((output_rule_matches, total_io_duration))
714    }
715
716    pub fn suppress_matches<E: Encoding>(
717        &self,
718        rule_matches: &mut Vec<InternalRuleMatch<E>>,
719        content: &str,
720        regex_caches: &mut RegexCaches,
721    ) {
722        rule_matches.retain(|rule_match| {
723            if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
724                let match_should_be_suppressed = suppressions.should_match_be_suppressed(
725                    &content[rule_match.utf8_start..rule_match.utf8_end],
726                    regex_caches,
727                );
728
729                if match_should_be_suppressed {
730                    self.metrics.suppressed_match_count.increment(1);
731                }
732                !match_should_be_suppressed
733            } else {
734                true
735            }
736        });
737    }
738
739    pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
740        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
741        let mut match_validator_rule_match_per_type = AHashMap::new();
742
743        let mut validated_rule_matches = vec![];
744
745        for mut rule_match in rule_matches.drain(..) {
746            let rule = &self.rules[rule_match.rule_index];
747            if let Some(match_validation_type) = rule.internal_match_validation_type() {
748                match_validator_rule_match_per_type
749                    .entry(match_validation_type)
750                    .or_insert_with(Vec::new)
751                    .push(rule_match)
752            } else {
753                // There is no match validator for this rule, so mark it as not available.
754                rule_match.match_status.merge(MatchStatus::NotAvailable);
755                validated_rule_matches.push(rule_match);
756            }
757        }
758
759        RAYON_THREAD_POOL.install(|| {
760            use rayon::prelude::*;
761
762            match_validator_rule_match_per_type.par_iter_mut().for_each(
763                |(match_validation_type, matches_per_type)| {
764                    let match_validator = self.match_validators_per_type.get(match_validation_type);
765                    if let Some(match_validator) = match_validator {
766                        match_validator
767                            .as_ref()
768                            .validate(matches_per_type, &self.rules)
769                    }
770                },
771            );
772        });
773
774        // Refill the rule_matches with the validated matches
775        for (_, mut matches) in match_validator_rule_match_per_type {
776            validated_rule_matches.append(&mut matches);
777        }
778
779        // Sort rule_matches by start index
780        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
781        *rule_matches = validated_rule_matches;
782    }
783
784    // Runs optional validation and drops supporting-rule matches from the output.
785    // Must be called OUTSIDE of any futures executor (e.g. block_on) because validate_matches
786    // uses RAYON_THREAD_POOL internally; running rayon inside block_on causes an EnterError panic
787    // when the calling thread re-enters the LocalPool executor context.
788    fn finalize_matches(&self, rule_matches: &mut Vec<RuleMatch>, validate: bool) {
789        if validate {
790            self.validate_matches(rule_matches);
791        }
792        // Supporting rules exist only to provide template variables to CustomHttpV2 validators of
793        // other rules. Their matches must not appear in the final output. They are retained until
794        // after validate_matches so that match pairing can reference their match values.
795        rule_matches.retain(|rule_match| !self.rules[rule_match.rule_index].is_supporting_rule);
796    }
797
798    /// Apply mutations from actions, and shift indices to match the mutated values.
799    /// This assumes the matches are all from the content given, and are sorted by start index.
800    fn apply_match_actions<E: Encoding>(
801        &self,
802        content: &mut String,
803        path: &Path<'static>,
804        rule_matches: Vec<InternalRuleMatch<E>>,
805        output_rule_matches: &mut Vec<RuleMatch>,
806        need_match_content: bool,
807    ) {
808        let mut utf8_byte_delta: isize = 0;
809        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
810
811        for rule_match in rule_matches {
812            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
813                content,
814                path.clone(),
815                rule_match,
816                &mut utf8_byte_delta,
817                &mut custom_index_delta,
818                need_match_content,
819            ));
820        }
821    }
822
823    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
824    fn apply_match_actions_for_string<E: Encoding>(
825        &self,
826        content: &mut String,
827        path: Path<'static>,
828        rule_match: InternalRuleMatch<E>,
829        // The current difference in length between the original and mutated string
830        utf8_byte_delta: &mut isize,
831
832        // The difference between the custom index on the original string and the mutated string
833        custom_index_delta: &mut <E>::IndexShift,
834        need_match_content: bool,
835    ) -> RuleMatch {
836        let rule = &self.rules[rule_match.rule_index];
837
838        let custom_start =
839            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
840                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
841
842        let mut matched_content_copy = None;
843
844        if need_match_content {
845            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
846            let mutated_utf8_match_start =
847                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
848            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
849
850            // Matches for mutating rules must have valid indices
851            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
852            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
853
854            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
855            matched_content_copy = Some(matched_content.to_string());
856        }
857
858        if rule.match_action.is_mutating() {
859            let mutated_utf8_match_start =
860                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
861            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
862
863            // Matches for mutating rules must have valid indices
864            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
865            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
866
867            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
868            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
869                let before_replacement = &matched_content[replacement.start..replacement.end];
870
871                // update indices to match the new mutated content
872                <E>::adjust_shift(
873                    custom_index_delta,
874                    before_replacement,
875                    &replacement.replacement,
876                );
877                *utf8_byte_delta +=
878                    replacement.replacement.len() as isize - before_replacement.len() as isize;
879
880                let replacement_start = mutated_utf8_match_start + replacement.start;
881                let replacement_end = mutated_utf8_match_start + replacement.end;
882                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
883            }
884        }
885
886        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
887        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
888            + shift_offset) as usize;
889
890        let rule = &self.rules[rule_match.rule_index];
891
892        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
893            MatchStatus::NotChecked
894        } else {
895            MatchStatus::NotAvailable
896        };
897
898        RuleMatch {
899            rule_index: rule_match.rule_index,
900            path,
901            replacement_type: rule.match_action.replacement_type(),
902            start_index: custom_start,
903            end_index_exclusive: custom_end,
904            shift_offset,
905            match_value: matched_content_copy,
906            match_status,
907            keyword: rule_match.keyword,
908        }
909    }
910
911    fn sort_and_remove_overlapping_rules<E: Encoding>(
912        &self,
913        rule_matches: &mut Vec<InternalRuleMatch<E>>,
914    ) {
915        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
916        // Be very careful if this function is modified.
917
918        rule_matches.sort_unstable_by(|a, b| {
919            // Mutating rules are a higher priority (earlier in the list)
920            let ord = self.rules[a.rule_index]
921                .match_action
922                .is_mutating()
923                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
924                .reverse();
925
926            // Earlier start offset
927            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
928
929            // Longer matches
930            let ord = ord.then(a.len().cmp(&b.len()).reverse());
931
932            // Matches with higher precedence come first
933            let ord = ord.then(
934                self.rules[a.rule_index]
935                    .precedence
936                    .cmp(&self.rules[b.rule_index].precedence)
937                    .reverse(),
938            );
939
940            // Matches from earlier rules
941            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
942
943            // swap the order of everything so matches can be efficiently popped off the back as they are processed
944            ord.reverse()
945        });
946
947        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
948
949        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
950            if self.rules[rule_match.rule_index].match_action.is_mutating() {
951                // Mutating rules are kept only if they don't overlap with a previous rule.
952                if let Some(last) = retained_rules.last()
953                    && last.utf8_end > rule_match.utf8_start
954                {
955                    continue;
956                }
957            } else {
958                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
959                // this needs to check all retained matches (instead of just the last one)
960                for retained_rule in &retained_rules {
961                    if retained_rule.utf8_start < rule_match.utf8_end
962                        && retained_rule.utf8_end > rule_match.utf8_start
963                    {
964                        continue 'rule_matches;
965                    }
966                }
967            };
968            retained_rules.push(rule_match);
969        }
970
971        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
972        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
973
974        *rule_matches = retained_rules;
975    }
976}
977
978impl Drop for Scanner {
979    fn drop(&mut self) {
980        let stats = &*GLOBAL_STATS;
981        stats.scanner_deletions.increment(1);
982        stats.decrement_total_scanners();
983    }
984}
985
986#[derive(Default)]
987pub struct ScannerBuilder<'a> {
988    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
989    labels: Labels,
990    scanner_features: ScannerFeatures,
991    async_scan_timeout: Duration,
992}
993
994impl ScannerBuilder<'_> {
995    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
996        ScannerBuilder {
997            rules,
998            labels: Labels::empty(),
999            scanner_features: ScannerFeatures::default(),
1000            async_scan_timeout: Duration::from_secs(60 * 5),
1001        }
1002    }
1003
1004    pub fn labels(mut self, labels: Labels) -> Self {
1005        self.labels = labels;
1006        self
1007    }
1008
1009    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
1010        self.async_scan_timeout = duration;
1011        self
1012    }
1013
1014    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
1015        self.scanner_features.add_implicit_index_wildcards = value;
1016        self
1017    }
1018
1019    pub fn with_return_matches(mut self, value: bool) -> Self {
1020        self.scanner_features.return_matches = value;
1021        self
1022    }
1023
1024    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
1025    /// Multipass V0 saves matches from excluded scopes, and marks any identical
1026    /// matches in included scopes as a false positive.
1027    pub fn with_multipass_v0(mut self, value: bool) -> Self {
1028        self.scanner_features.multipass_v0_enabled = value;
1029        self
1030    }
1031
1032    /// Enables/Disables debug observability features. This defaults to FALSE.
1033    /// When enabled, metrics will include additional tags (such as `sds_namespace`)
1034    /// to help debug the source of matches.
1035    pub fn with_debug_observability(mut self, value: bool) -> Self {
1036        self.scanner_features.enable_debug_observability = value;
1037        self
1038    }
1039
1040    pub fn build(self) -> Result<Scanner, CreateScannerError> {
1041        let mut match_validators_per_type = AHashMap::new();
1042
1043        for rule in self.rules.iter() {
1044            if let Some(match_validation_type) = &rule.get_third_party_active_checker()
1045                && match_validation_type.can_create_match_validator()
1046            {
1047                let internal_type = match_validation_type.get_internal_match_validation_type();
1048                let match_validator = match_validation_type.into_match_validator();
1049                if let Ok(match_validator) = match_validator {
1050                    if !match_validators_per_type.contains_key(&internal_type) {
1051                        match_validators_per_type.insert(internal_type, match_validator);
1052                    }
1053                } else {
1054                    return Err(CreateScannerError::InvalidMatchValidator(
1055                        MatchValidatorCreationError::InternalError,
1056                    ));
1057                }
1058            }
1059        }
1060
1061        let compiled_rules = self
1062            .rules
1063            .iter()
1064            .enumerate()
1065            .map(|(rule_index, config)| {
1066                if config.is_supporting_rule && config.match_action != MatchAction::None {
1067                    return Err(CreateScannerError::SupportingRuleHasMatchAction);
1068                }
1069                let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1070                config.match_action.validate()?;
1071                let compiled_suppressions = match &config.suppressions {
1072                    Some(s) => s.compile()?,
1073                    None => None,
1074                };
1075                Ok(RootCompiledRule {
1076                    inner,
1077                    scope: config.scope.clone(),
1078                    match_action: config.match_action.clone(),
1079                    match_validation_type: config.get_third_party_active_checker().cloned(),
1080                    suppressions: compiled_suppressions,
1081                    precedence: config.precedence,
1082                    is_supporting_rule: config.is_supporting_rule,
1083                })
1084            })
1085            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1086
1087        let mut per_scanner_data = SharedData::new();
1088
1089        compiled_rules.iter().for_each(|rule| {
1090            rule.init_per_scanner_data(&mut per_scanner_data);
1091        });
1092
1093        let scoped_ruleset = ScopedRuleSet::new(
1094            &compiled_rules
1095                .iter()
1096                .map(|rule| rule.scope.clone())
1097                .collect::<Vec<_>>(),
1098        )
1099        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1100
1101        {
1102            let stats = &*GLOBAL_STATS;
1103            stats.scanner_creations.increment(1);
1104            stats.increment_total_scanners();
1105        }
1106
1107        Ok(Scanner {
1108            rules: compiled_rules,
1109            scoped_ruleset,
1110            scanner_features: self.scanner_features,
1111            metrics: ScannerMetrics::new(&self.labels),
1112            match_validators_per_type,
1113            labels: self.labels,
1114            per_scanner_data,
1115            async_scan_timeout: self.async_scan_timeout,
1116        })
1117    }
1118}
1119
1120struct ScannerContentVisitor<'a, E: Encoding> {
1121    scanner: &'a Scanner,
1122    regex_caches: &'a mut RegexCaches,
1123    rule_matches: &'a mut InternalRuleMatchSet<E>,
1124    // Rules that shall be skipped for this scan
1125    // This list shall be small (<10), so a linear search is acceptable
1126    blocked_rules: &'a Vec<usize>,
1127    excluded_matches: &'a mut AHashMap<String, String>,
1128    per_event_data: SharedData,
1129    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1130    async_jobs: &'a mut Vec<PendingRuleJob>,
1131    event_id: Option<String>,
1132}
1133
1134impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1135    fn visit_content<'b>(
1136        &'b mut self,
1137        path: &Path<'a>,
1138        content: &str,
1139        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1140        exclusion_check: ExclusionCheck<'b>,
1141    ) -> Result<bool, ScannerError> {
1142        // matches for a single path
1143        let mut path_rules_matches = vec![];
1144
1145        // Create a map of per rule type data that can be shared between rules of the same type
1146        let mut per_string_data = SharedData::new();
1147        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1148
1149        rule_visitor.visit_rule_indices(|rule_index| {
1150            if self.blocked_rules.contains(&rule_index) {
1151                return Ok(());
1152            }
1153            let rule = &self.scanner.rules[rule_index];
1154            {
1155                if rule.inner.allow_scanner_to_exclude_namespace() {
1156                    // check if the path is excluded
1157                    if exclusion_check.is_excluded(rule_index) {
1158                        return Ok(());
1159                    }
1160                }
1161                // creating the emitter is basically free, it will get mostly optimized away
1162                let mut emitter = |rule_match: StringMatch| {
1163                    // This should never happen, but to ensure no empty match is ever generated
1164                    // (which may cause an infinite loop), this will panic instead.
1165                    assert_ne!(
1166                        rule_match.start, rule_match.end,
1167                        "empty match detected on rule with index {rule_index}"
1168                    );
1169                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1170                };
1171
1172                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1173
1174                // TODO: move this somewhere higher?
1175                rule.init_per_event_data(&mut self.per_event_data);
1176
1177                let mut ctx = StringMatchesCtx {
1178                    rule_index,
1179                    regex_caches: self.regex_caches,
1180                    exclusion_check: &exclusion_check,
1181                    excluded_matches: self.excluded_matches,
1182                    match_emitter: &mut emitter,
1183                    wildcard_indices: wildcard_indices_per_path,
1184                    enable_debug_observability: self
1185                        .scanner
1186                        .scanner_features
1187                        .enable_debug_observability,
1188                    per_string_data: &mut per_string_data,
1189                    per_scanner_data: &self.scanner.per_scanner_data,
1190                    per_event_data: &mut self.per_event_data,
1191                    event_id: self.event_id.as_deref(),
1192                };
1193
1194                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1195
1196                match async_status {
1197                    RuleStatus::Done => {
1198                        // nothing to do
1199                    }
1200                    RuleStatus::Pending(fut) => {
1201                        self.async_jobs.push(PendingRuleJob {
1202                            fut,
1203                            path: path.into_static(),
1204                        });
1205                    }
1206                }
1207            }
1208            Ok(())
1209        })?;
1210
1211        // If there are any matches, the string will need to be accessed to check for false positives from
1212        // excluded matches, any to potentially mutate the string.
1213        // If there are any async jobs, this is also true since it's not known yet whether there
1214        // will be a match
1215        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1216
1217        self.rule_matches
1218            .push_sync_matches(path, path_rules_matches);
1219
1220        Ok(needs_to_access_content)
1221    }
1222}
1223
// Calculates the next starting position for a regex match if the previous match is a false positive.
//
// The position is the byte offset of the next valid UTF8 char after the start of the regex
// match. `None` is returned when there are no more chars left in the string to scan.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _match_end) = regex_match;

    content[match_start..]
        .char_indices()
        // index 0 is the char at the match start, index 1 is the one right after it
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1234
1235fn is_false_positive_match(
1236    regex_match_range: (usize, usize),
1237    rule: &RegexCompiledRule,
1238    content: &str,
1239    check_excluded_keywords: bool,
1240) -> bool {
1241    if check_excluded_keywords
1242        && let Some(excluded_keywords) = &rule.excluded_keywords
1243        && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1244    {
1245        return true;
1246    }
1247
1248    if let Some(validator) = rule.validator.as_ref()
1249        && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1250    {
1251        return true;
1252    }
1253    false
1254}