dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::AHashMap;
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// A single match reported by a rule for one string.
///
/// `start` and `end` delimit the matched span inside the scanned content
/// (presumably UTF-8 byte offsets with an exclusive end — TODO confirm against
/// `InternalRuleMatch::new`).
#[derive(Clone)]
pub struct StringMatch {
    /// Where the match begins.
    pub start: usize,
    /// Where the match ends.
    pub end: usize,
    /// The keyword that was used to match this rule. Optional, only some rules may set this value.
    pub keyword: Option<String>,
}
60
61pub trait MatchEmitter<T = ()> {
62    fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
66// struct that implements MatchEmitter)
67impl<F, T> MatchEmitter<T> for F
68where
69    F: FnMut(StringMatch) -> T,
70{
71    fn emit(&mut self, string_match: StringMatch) -> T {
72        // This just calls the closure (itself)
73        (self)(string_match)
74    }
75}
76
77/// The precedence of a rule. Catchall is the lowest precedence, Specific is the highest precedence.
78/// The default precedence is Specific.
79/// For rules that:
80/// - Have the same mutation priority
81/// - Match at the same index
82/// - Match the same number of characters
83///
84/// Then the rule with the highest precedence will be used.
85#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Default)]
86pub enum Precedence {
87    Catchall,
88    Generic,
89    #[default]
90    Specific,
91}
92
93#[serde_as]
94#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
95pub struct RootRuleConfig<T> {
96    pub match_action: MatchAction,
97    #[serde(default)]
98    pub scope: Scope,
99    #[deprecated(note = "Use `third_party_active_checker` instead")]
100    match_validation_type: Option<MatchValidationType>,
101    third_party_active_checker: Option<MatchValidationType>,
102    suppressions: Option<Suppressions>,
103    #[serde(default)]
104    precedence: Precedence,
105    #[serde(flatten)]
106    pub inner: T,
107}
108
109impl<T> RootRuleConfig<T>
110where
111    T: RuleConfig + 'static,
112{
113    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
114        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
115    }
116
117    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
118        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
119    }
120}
121
122impl<T> RootRuleConfig<T> {
123    pub fn new(inner: T) -> Self {
124        #[allow(deprecated)]
125        Self {
126            match_action: MatchAction::None,
127            scope: Scope::all(),
128            match_validation_type: None,
129            third_party_active_checker: None,
130            suppressions: None,
131            precedence: Precedence::default(),
132            inner,
133        }
134    }
135
136    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
137        #[allow(deprecated)]
138        RootRuleConfig {
139            match_action: self.match_action,
140            scope: self.scope,
141            match_validation_type: self.match_validation_type,
142            third_party_active_checker: self.third_party_active_checker,
143            suppressions: self.suppressions,
144            precedence: self.precedence,
145            inner: func(self.inner),
146        }
147    }
148
149    pub fn match_action(mut self, action: MatchAction) -> Self {
150        self.match_action = action;
151        self
152    }
153
154    pub fn precedence(mut self, precedence: Precedence) -> Self {
155        self.precedence = precedence;
156        self
157    }
158
159    pub fn scope(mut self, scope: Scope) -> Self {
160        self.scope = scope;
161        self
162    }
163
164    pub fn third_party_active_checker(
165        mut self,
166        match_validation_type: MatchValidationType,
167    ) -> Self {
168        self.third_party_active_checker = Some(match_validation_type);
169        self
170    }
171
172    pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
173        self.suppressions = Some(suppressions);
174        self
175    }
176
177    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
178        #[allow(deprecated)]
179        self.third_party_active_checker
180            .as_ref()
181            .or(self.match_validation_type.as_ref())
182    }
183}
184
185impl<T> Deref for RootRuleConfig<T> {
186    type Target = T;
187
188    fn deref(&self) -> &Self::Target {
189        &self.inner
190    }
191}
192pub struct RootCompiledRule {
193    pub inner: Box<dyn CompiledRule>,
194    pub scope: Scope,
195    pub match_action: MatchAction,
196    pub match_validation_type: Option<MatchValidationType>,
197    pub suppressions: Option<CompiledSuppressions>,
198    pub precedence: Precedence,
199}
200
201impl RootCompiledRule {
202    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
203        self.match_validation_type
204            .as_ref()
205            .map(|x| x.get_internal_match_validation_type())
206    }
207}
208
209impl Deref for RootCompiledRule {
210    type Target = dyn CompiledRule;
211
212    fn deref(&self) -> &Self::Target {
213        self.inner.as_ref()
214    }
215}
216
217pub struct StringMatchesCtx<'a> {
218    rule_index: usize,
219    pub regex_caches: &'a mut RegexCaches,
220    pub exclusion_check: &'a ExclusionCheck<'a>,
221    pub excluded_matches: &'a mut AHashMap<String, String>,
222    pub match_emitter: &'a mut dyn MatchEmitter,
223    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
224    pub enable_debug_observability: bool,
225
226    // Shared Data
227    pub per_string_data: &'a mut SharedData,
228    pub per_scanner_data: &'a SharedData,
229    pub per_event_data: &'a mut SharedData,
230    pub event_id: Option<&'a str>,
231}
232
233impl StringMatchesCtx<'_> {
234    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
235    /// this function can be used to return an "async job" to find matches. The return value
236    /// of `process_async` should be returned from the `get_string_matches` function. The future
237    /// passed into this function will be spawned and executed immediately without blocking
238    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
239    ///
240    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
241    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
242    /// it should be accessed before `process_async` is called.
243    pub fn process_async(
244        &self,
245        func: impl for<'a> FnOnce(
246            &'a mut AsyncStringMatchesCtx,
247        )
248            -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
249        + Send
250        + 'static,
251    ) -> RuleResult {
252        let rule_index = self.rule_index;
253
254        // The future is spawned onto the tokio runtime immediately so it starts running
255        // in the background
256        let fut = TOKIO_RUNTIME.spawn(async move {
257            let start = Instant::now();
258            let mut ctx = AsyncStringMatchesCtx {
259                rule_matches: vec![],
260            };
261            (func)(&mut ctx).await?;
262            let io_duration = start.elapsed();
263
264            Ok(AsyncRuleInfo {
265                rule_index,
266                rule_matches: ctx.rule_matches,
267                io_duration,
268            })
269        });
270
271        Ok(RuleStatus::Pending(fut))
272    }
273}
274
275pub struct AsyncStringMatchesCtx {
276    rule_matches: Vec<StringMatch>,
277}
278
279impl AsyncStringMatchesCtx {
280    pub fn emit_match(&mut self, string_match: StringMatch) {
281        self.rule_matches.push(string_match);
282    }
283}
284
285#[must_use]
286pub enum RuleStatus {
287    Done,
288    Pending(PendingRuleResult),
289}
290
291// pub type PendingRuleResult = BoxFuture<'static, Result<AsyncRuleInfo, ScannerError>>;
292pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
293
294pub struct PendingRuleJob {
295    fut: PendingRuleResult,
296    path: Path<'static>,
297}
298
299pub struct AsyncRuleInfo {
300    rule_index: usize,
301    rule_matches: Vec<StringMatch>,
302    io_duration: Duration,
303}
304
305/// A rule result that cannot be async
306pub type RuleResult = Result<RuleStatus, ScannerError>;
307
308// This is the public trait that is used to define the behavior of a compiled rule.
309pub trait CompiledRule: Send + Sync {
310    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
311        // by default, no per-scanner data is initialized
312    }
313
314    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
315        // by default, no per-string data is initialized
316    }
317
318    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
319        // by default, no per-event data is initialized
320    }
321
322    fn get_string_matches(
323        &self,
324        content: &str,
325        path: &Path,
326        ctx: &mut StringMatchesCtx<'_>,
327    ) -> RuleResult;
328
329    // Whether a match from this rule should be excluded (marked as a false-positive)
330    // if the content of this match was found in a match from an excluded scope
331    fn should_exclude_multipass_v0(&self) -> bool {
332        // default is to NOT use Multi-pass V0
333        false
334    }
335
336    fn on_excluded_match_multipass_v0(
337        &self,
338        _path: &Path,
339        _excluded_path: &str,
340        _enable_debug_observability: bool,
341    ) {
342        // default is to do nothing
343    }
344
345    fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
346        None
347    }
348
349    fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
350        None
351    }
352
353    fn allow_scanner_to_exclude_namespace(&self) -> bool {
354        true
355    }
356}
357
358impl<T> RuleConfig for Box<T>
359where
360    T: RuleConfig + ?Sized,
361{
362    fn convert_to_compiled_rule(
363        &self,
364        rule_index: usize,
365        labels: Labels,
366    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
367        self.as_ref().convert_to_compiled_rule(rule_index, labels)
368    }
369}
370
/// Feature switches of a scanner.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Whether implicit index wildcards are added.
    pub add_implicit_index_wildcards: bool,
    /// Whether matches equal to a match from an excluded scope are filtered out
    /// ("Multi-pass V0").
    pub multipass_v0_enabled: bool,
    /// Whether the matched content is returned with each match.
    pub return_matches: bool,
    /// Whether debug observability is enabled.
    pub enable_debug_observability: bool,
}

impl Default for ScannerFeatures {
    /// Every feature is off by default except Multi-pass V0.
    fn default() -> Self {
        Self {
            multipass_v0_enabled: true,
            add_implicit_index_wildcards: false,
            return_matches: false,
            enable_debug_observability: false,
        }
    }
}
389
390pub struct ScanOptions {
391    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
392    // this list shall be small (<10), so a linear search is acceptable otherwise performance will be impacted.
393    pub blocked_rules_idx: Vec<usize>,
394    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
395    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
396    // Whether to validate matches using third-party validators (e.g., checksum validation for credit cards).
397    // When enabled, the scanner automatically collects match content needed for validation.
398    pub validate_matches: bool,
399}
400
401impl Default for ScanOptions {
402    fn default() -> Self {
403        Self {
404            blocked_rules_idx: vec![],
405            wildcarded_indices: AHashMap::new(),
406            validate_matches: false,
407        }
408    }
409}
410
411pub struct ScanOptionBuilder {
412    blocked_rules_idx: Vec<usize>,
413    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
414    validate_matches: bool,
415}
416
417impl ScanOptionBuilder {
418    pub fn new() -> Self {
419        Self {
420            blocked_rules_idx: vec![],
421            wildcarded_indices: AHashMap::new(),
422            validate_matches: false,
423        }
424    }
425
426    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
427        self.blocked_rules_idx = blocked_rules_idx;
428        self
429    }
430
431    pub fn with_wildcarded_indices(
432        mut self,
433        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
434    ) -> Self {
435        self.wildcarded_indices = wildcarded_indices;
436        self
437    }
438
439    pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
440        self.validate_matches = validate_matches;
441        self
442    }
443
444    pub fn build(self) -> ScanOptions {
445        ScanOptions {
446            blocked_rules_idx: self.blocked_rules_idx,
447            wildcarded_indices: self.wildcarded_indices,
448            validate_matches: self.validate_matches,
449        }
450    }
451}
452
453pub struct Scanner {
454    rules: Vec<RootCompiledRule>,
455    scoped_ruleset: ScopedRuleSet,
456    scanner_features: ScannerFeatures,
457    metrics: ScannerMetrics,
458    labels: Labels,
459    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
460    per_scanner_data: SharedData,
461    async_scan_timeout: Duration,
462}
463
464impl Scanner {
465    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
466        ScannerBuilder::new(rules)
467    }
468
469    // This function scans the given event with the rules configured in the scanner.
470    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
471    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
472    // This version uses default scan options (no validation, no blocked rules, no wildcarded indices).
473    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
474        self.scan_with_options(event, ScanOptions::default())
475    }
476
477    // This function scans the given event with the rules configured in the scanner.
478    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
479    // The options parameter allows customizing the scan behavior (validation, blocked rules, etc.).
480    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
481    pub fn scan_with_options<E: Event>(
482        &self,
483        event: &mut E,
484        options: ScanOptions,
485    ) -> Result<Vec<RuleMatch>, ScannerError> {
486        block_on(self.internal_scan_with_metrics(event, options))
487    }
488
489    // This function scans the given event with the rules configured in the scanner.
490    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
491    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
492    pub async fn scan_async<E: Event>(
493        &self,
494        event: &mut E,
495    ) -> Result<Vec<RuleMatch>, ScannerError> {
496        self.scan_async_with_options(event, ScanOptions::default())
497            .await
498    }
499
500    pub async fn scan_async_with_options<E: Event>(
501        &self,
502        event: &mut E,
503        options: ScanOptions,
504    ) -> Result<Vec<RuleMatch>, ScannerError> {
505        let fut = self.internal_scan_with_metrics(event, options);
506
507        // The sleep from the timeout requires being in a tokio context
508        // The guard needs to be dropped before await since the guard is !Send
509        let timeout = {
510            let _tokio_guard = TOKIO_RUNTIME.enter();
511            timeout(self.async_scan_timeout, fut)
512        };
513
514        timeout.await.unwrap_or(Err(ScannerError::Transient(
515            "Async scan timeout".to_string(),
516        )))
517    }
518
519    fn record_metrics(
520        &self,
521        output_rule_matches: &[RuleMatch],
522        start: Instant,
523        io_duration: Option<Duration>,
524    ) {
525        // Add number of scanned events
526        self.metrics.num_scanned_events.increment(1);
527        // Add number of matches
528        self.metrics
529            .match_count
530            .increment(output_rule_matches.len() as u64);
531
532        if let Some(io_duration) = io_duration {
533            let total_duration = start.elapsed();
534            let cpu_duration = total_duration.saturating_sub(io_duration);
535            self.metrics
536                .cpu_duration
537                .increment(cpu_duration.as_nanos() as u64);
538        }
539    }
540
541    async fn internal_scan_with_metrics<E: Event>(
542        &self,
543        event: &mut E,
544        options: ScanOptions,
545    ) -> Result<Vec<RuleMatch>, ScannerError> {
546        let start = Instant::now();
547        let result = self.internal_scan(event, options).await;
548        match result {
549            Ok((rule_matches, io_duration)) => {
550                self.record_metrics(&rule_matches, start, Some(io_duration));
551                Ok(rule_matches)
552            }
553            Err(e) => {
554                self.record_metrics(&[], start, None);
555                Err(e)
556            }
557        }
558    }
559
560    fn process_rule_matches<E: Event>(
561        &self,
562        event: &mut E,
563        rule_matches: InternalRuleMatchSet<E::Encoding>,
564        excluded_matches: AHashMap<String, String>,
565        output_rule_matches: &mut Vec<RuleMatch>,
566        need_match_content: bool,
567    ) {
568        if rule_matches.is_empty() {
569            return;
570        }
571        access_regex_caches(|regex_caches| {
572            for (path, mut rule_matches) in rule_matches.into_iter() {
573                // All rule matches in each inner list are for a single path, so they can be processed independently.
574                event.visit_string_mut(&path, |content| {
575                    // calculate_indices requires that matches are sorted by start index
576                    rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
577
578                    <<E as Event>::Encoding>::calculate_indices(
579                        content,
580                        rule_matches.iter_mut().map(
581                            |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
582                                utf8_start: rule_match.utf8_start,
583                                utf8_end: rule_match.utf8_end,
584                                custom_start: &mut rule_match.custom_start,
585                                custom_end: &mut rule_match.custom_end,
586                            },
587                        ),
588                    );
589
590                    if self.scanner_features.multipass_v0_enabled {
591                        // Now that the `excluded_matches` set is fully populated, filter out any matches
592                        // that are the same as excluded matches (also known as "Multi-pass V0")
593                        rule_matches.retain(|rule_match| {
594                            if self.rules[rule_match.rule_index]
595                                .inner
596                                .should_exclude_multipass_v0()
597                            {
598                                let match_content =
599                                    &content[rule_match.utf8_start..rule_match.utf8_end];
600                                let excluded_path = excluded_matches.get(match_content);
601                                if let Some(excluded_path) = excluded_path {
602                                    self.rules[rule_match.rule_index]
603                                        .on_excluded_match_multipass_v0(
604                                            &path,
605                                            excluded_path,
606                                            self.scanner_features.enable_debug_observability,
607                                        );
608                                }
609                                excluded_path.is_none()
610                            } else {
611                                true
612                            }
613                        });
614                    }
615
616                    self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
617
618                    self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
619
620                    let will_mutate = rule_matches.iter().any(|rule_match| {
621                        self.rules[rule_match.rule_index].match_action.is_mutating()
622                    });
623
624                    self.apply_match_actions(
625                        content,
626                        &path,
627                        rule_matches,
628                        output_rule_matches,
629                        need_match_content,
630                    );
631
632                    will_mutate
633                });
634            }
635        });
636    }
637
638    async fn internal_scan<E: Event>(
639        &self,
640        event: &mut E,
641        options: ScanOptions,
642    ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
643        // If validation is requested, we need to collect match content even if the scanner
644        // wasn't originally configured to return matches
645        let need_match_content = self.scanner_features.return_matches || options.validate_matches;
646        // All matches, after some (but not all) false-positives have been removed.
647        let mut rule_matches = InternalRuleMatchSet::new();
648        let mut excluded_matches = AHashMap::new();
649        let mut async_jobs = vec![];
650
651        access_regex_caches(|regex_caches| {
652            self.scoped_ruleset.visit_string_rule_combinations(
653                event,
654                ScannerContentVisitor {
655                    scanner: self,
656                    regex_caches,
657                    rule_matches: &mut rule_matches,
658                    blocked_rules: &options.blocked_rules_idx,
659                    excluded_matches: &mut excluded_matches,
660                    per_event_data: SharedData::new(),
661                    wildcarded_indexes: &options.wildcarded_indices,
662                    async_jobs: &mut async_jobs,
663                    event_id: event.get_id().map(|s| s.to_string()),
664                },
665            )
666        })?;
667
668        // The async jobs were already spawned on the tokio runtime, so the
669        // results just need to be collected
670        let mut total_io_duration = Duration::ZERO;
671        for job in async_jobs {
672            let rule_info = job.fut.await.unwrap()?;
673            total_io_duration += rule_info.io_duration;
674            rule_matches.push_async_matches(
675                &job.path,
676                rule_info
677                    .rule_matches
678                    .into_iter()
679                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
680            );
681        }
682
683        let mut output_rule_matches = vec![];
684
685        self.process_rule_matches(
686            event,
687            rule_matches,
688            excluded_matches,
689            &mut output_rule_matches,
690            need_match_content,
691        );
692
693        if options.validate_matches {
694            self.validate_matches(&mut output_rule_matches);
695        }
696
697        Ok((output_rule_matches, total_io_duration))
698    }
699
700    pub fn suppress_matches<E: Encoding>(
701        &self,
702        rule_matches: &mut Vec<InternalRuleMatch<E>>,
703        content: &str,
704        regex_caches: &mut RegexCaches,
705    ) {
706        rule_matches.retain(|rule_match| {
707            if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
708                let match_should_be_suppressed = suppressions.should_match_be_suppressed(
709                    &content[rule_match.utf8_start..rule_match.utf8_end],
710                    regex_caches,
711                );
712
713                if match_should_be_suppressed {
714                    self.metrics.suppressed_match_count.increment(1);
715                }
716                !match_should_be_suppressed
717            } else {
718                true
719            }
720        });
721    }
722
723    pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
724        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
725        let mut match_validator_rule_match_per_type = AHashMap::new();
726
727        let mut validated_rule_matches = vec![];
728
729        for mut rule_match in rule_matches.drain(..) {
730            let rule = &self.rules[rule_match.rule_index];
731            if let Some(match_validation_type) = rule.internal_match_validation_type() {
732                match_validator_rule_match_per_type
733                    .entry(match_validation_type)
734                    .or_insert_with(Vec::new)
735                    .push(rule_match)
736            } else {
737                // There is no match validator for this rule, so mark it as not available.
738                rule_match.match_status.merge(MatchStatus::NotAvailable);
739                validated_rule_matches.push(rule_match);
740            }
741        }
742
743        RAYON_THREAD_POOL.install(|| {
744            use rayon::prelude::*;
745
746            match_validator_rule_match_per_type.par_iter_mut().for_each(
747                |(match_validation_type, matches_per_type)| {
748                    let match_validator = self.match_validators_per_type.get(match_validation_type);
749                    if let Some(match_validator) = match_validator {
750                        match_validator
751                            .as_ref()
752                            .validate(matches_per_type, &self.rules)
753                    }
754                },
755            );
756        });
757
758        // Refill the rule_matches with the validated matches
759        for (_, mut matches) in match_validator_rule_match_per_type {
760            validated_rule_matches.append(&mut matches);
761        }
762
763        // Sort rule_matches by start index
764        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
765        *rule_matches = validated_rule_matches;
766    }
767
768    /// Apply mutations from actions, and shift indices to match the mutated values.
769    /// This assumes the matches are all from the content given, and are sorted by start index.
770    fn apply_match_actions<E: Encoding>(
771        &self,
772        content: &mut String,
773        path: &Path<'static>,
774        rule_matches: Vec<InternalRuleMatch<E>>,
775        output_rule_matches: &mut Vec<RuleMatch>,
776        need_match_content: bool,
777    ) {
778        let mut utf8_byte_delta: isize = 0;
779        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
780
781        for rule_match in rule_matches {
782            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
783                content,
784                path.clone(),
785                rule_match,
786                &mut utf8_byte_delta,
787                &mut custom_index_delta,
788                need_match_content,
789            ));
790        }
791    }
792
793    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
794    fn apply_match_actions_for_string<E: Encoding>(
795        &self,
796        content: &mut String,
797        path: Path<'static>,
798        rule_match: InternalRuleMatch<E>,
799        // The current difference in length between the original and mutated string
800        utf8_byte_delta: &mut isize,
801
802        // The difference between the custom index on the original string and the mutated string
803        custom_index_delta: &mut <E>::IndexShift,
804        need_match_content: bool,
805    ) -> RuleMatch {
806        let rule = &self.rules[rule_match.rule_index];
807
808        let custom_start =
809            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
810                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
811
812        let mut matched_content_copy = None;
813
814        if need_match_content {
815            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
816            let mutated_utf8_match_start =
817                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
818            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
819
820            // Matches for mutating rules must have valid indices
821            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
822            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
823
824            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
825            matched_content_copy = Some(matched_content.to_string());
826        }
827
828        if rule.match_action.is_mutating() {
829            let mutated_utf8_match_start =
830                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
831            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
832
833            // Matches for mutating rules must have valid indices
834            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
835            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
836
837            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
838            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
839                let before_replacement = &matched_content[replacement.start..replacement.end];
840
841                // update indices to match the new mutated content
842                <E>::adjust_shift(
843                    custom_index_delta,
844                    before_replacement,
845                    &replacement.replacement,
846                );
847                *utf8_byte_delta +=
848                    replacement.replacement.len() as isize - before_replacement.len() as isize;
849
850                let replacement_start = mutated_utf8_match_start + replacement.start;
851                let replacement_end = mutated_utf8_match_start + replacement.end;
852                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
853            }
854        }
855
856        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
857        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
858            + shift_offset) as usize;
859
860        let rule = &self.rules[rule_match.rule_index];
861
862        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
863            MatchStatus::NotChecked
864        } else {
865            MatchStatus::NotAvailable
866        };
867
868        RuleMatch {
869            rule_index: rule_match.rule_index,
870            path,
871            replacement_type: rule.match_action.replacement_type(),
872            start_index: custom_start,
873            end_index_exclusive: custom_end,
874            shift_offset,
875            match_value: matched_content_copy,
876            match_status,
877            keyword: rule_match.keyword,
878        }
879    }
880
881    fn sort_and_remove_overlapping_rules<E: Encoding>(
882        &self,
883        rule_matches: &mut Vec<InternalRuleMatch<E>>,
884    ) {
885        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
886        // Be very careful if this function is modified.
887
888        rule_matches.sort_unstable_by(|a, b| {
889            // Mutating rules are a higher priority (earlier in the list)
890            let ord = self.rules[a.rule_index]
891                .match_action
892                .is_mutating()
893                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
894                .reverse();
895
896            // Earlier start offset
897            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
898
899            // Longer matches
900            let ord = ord.then(a.len().cmp(&b.len()).reverse());
901
902            // Matches with higher precedence come first
903            let ord = ord.then(
904                self.rules[a.rule_index]
905                    .precedence
906                    .cmp(&self.rules[b.rule_index].precedence)
907                    .reverse(),
908            );
909
910            // Matches from earlier rules
911            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
912
913            // swap the order of everything so matches can be efficiently popped off the back as they are processed
914            ord.reverse()
915        });
916
917        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
918
919        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
920            if self.rules[rule_match.rule_index].match_action.is_mutating() {
921                // Mutating rules are kept only if they don't overlap with a previous rule.
922                if let Some(last) = retained_rules.last()
923                    && last.utf8_end > rule_match.utf8_start
924                {
925                    continue;
926                }
927            } else {
928                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
929                // this needs to check all retained matches (instead of just the last one)
930                for retained_rule in &retained_rules {
931                    if retained_rule.utf8_start < rule_match.utf8_end
932                        && retained_rule.utf8_end > rule_match.utf8_start
933                    {
934                        continue 'rule_matches;
935                    }
936                }
937            };
938            retained_rules.push(rule_match);
939        }
940
941        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
942        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
943
944        *rule_matches = retained_rules;
945    }
946}
947
948impl Drop for Scanner {
949    fn drop(&mut self) {
950        let stats = &*GLOBAL_STATS;
951        stats.scanner_deletions.increment(1);
952        stats.decrement_total_scanners();
953    }
954}
955
956#[derive(Default)]
957pub struct ScannerBuilder<'a> {
958    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
959    labels: Labels,
960    scanner_features: ScannerFeatures,
961    async_scan_timeout: Duration,
962}
963
964impl ScannerBuilder<'_> {
965    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
966        ScannerBuilder {
967            rules,
968            labels: Labels::empty(),
969            scanner_features: ScannerFeatures::default(),
970            async_scan_timeout: Duration::from_secs(60 * 5),
971        }
972    }
973
974    pub fn labels(mut self, labels: Labels) -> Self {
975        self.labels = labels;
976        self
977    }
978
979    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
980        self.async_scan_timeout = duration;
981        self
982    }
983
984    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
985        self.scanner_features.add_implicit_index_wildcards = value;
986        self
987    }
988
989    pub fn with_return_matches(mut self, value: bool) -> Self {
990        self.scanner_features.return_matches = value;
991        self
992    }
993
994    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
995    /// Multipass V0 saves matches from excluded scopes, and marks any identical
996    /// matches in included scopes as a false positive.
997    pub fn with_multipass_v0(mut self, value: bool) -> Self {
998        self.scanner_features.multipass_v0_enabled = value;
999        self
1000    }
1001
1002    /// Enables/Disables debug observability features. This defaults to FALSE.
1003    /// When enabled, metrics will include additional tags (such as `sds_namespace`)
1004    /// to help debug the source of matches.
1005    pub fn with_debug_observability(mut self, value: bool) -> Self {
1006        self.scanner_features.enable_debug_observability = value;
1007        self
1008    }
1009
1010    pub fn build(self) -> Result<Scanner, CreateScannerError> {
1011        let mut match_validators_per_type = AHashMap::new();
1012
1013        for rule in self.rules.iter() {
1014            if let Some(match_validation_type) = &rule.get_third_party_active_checker()
1015                && match_validation_type.can_create_match_validator()
1016            {
1017                let internal_type = match_validation_type.get_internal_match_validation_type();
1018                let match_validator = match_validation_type.into_match_validator();
1019                if let Ok(match_validator) = match_validator {
1020                    if !match_validators_per_type.contains_key(&internal_type) {
1021                        match_validators_per_type.insert(internal_type, match_validator);
1022                    }
1023                } else {
1024                    return Err(CreateScannerError::InvalidMatchValidator(
1025                        MatchValidatorCreationError::InternalError,
1026                    ));
1027                }
1028            }
1029        }
1030
1031        let compiled_rules = self
1032            .rules
1033            .iter()
1034            .enumerate()
1035            .map(|(rule_index, config)| {
1036                let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1037                config.match_action.validate()?;
1038                let compiled_suppressions = match &config.suppressions {
1039                    Some(s) => s.compile()?,
1040                    None => None,
1041                };
1042                Ok(RootCompiledRule {
1043                    inner,
1044                    scope: config.scope.clone(),
1045                    match_action: config.match_action.clone(),
1046                    match_validation_type: config.get_third_party_active_checker().cloned(),
1047                    suppressions: compiled_suppressions,
1048                    precedence: config.precedence,
1049                })
1050            })
1051            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1052
1053        let mut per_scanner_data = SharedData::new();
1054
1055        compiled_rules.iter().for_each(|rule| {
1056            rule.init_per_scanner_data(&mut per_scanner_data);
1057        });
1058
1059        let scoped_ruleset = ScopedRuleSet::new(
1060            &compiled_rules
1061                .iter()
1062                .map(|rule| rule.scope.clone())
1063                .collect::<Vec<_>>(),
1064        )
1065        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1066
1067        {
1068            let stats = &*GLOBAL_STATS;
1069            stats.scanner_creations.increment(1);
1070            stats.increment_total_scanners();
1071        }
1072
1073        Ok(Scanner {
1074            rules: compiled_rules,
1075            scoped_ruleset,
1076            scanner_features: self.scanner_features,
1077            metrics: ScannerMetrics::new(&self.labels),
1078            match_validators_per_type,
1079            labels: self.labels,
1080            per_scanner_data,
1081            async_scan_timeout: self.async_scan_timeout,
1082        })
1083    }
1084}
1085
1086struct ScannerContentVisitor<'a, E: Encoding> {
1087    scanner: &'a Scanner,
1088    regex_caches: &'a mut RegexCaches,
1089    rule_matches: &'a mut InternalRuleMatchSet<E>,
1090    // Rules that shall be skipped for this scan
1091    // This list shall be small (<10), so a linear search is acceptable
1092    blocked_rules: &'a Vec<usize>,
1093    excluded_matches: &'a mut AHashMap<String, String>,
1094    per_event_data: SharedData,
1095    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1096    async_jobs: &'a mut Vec<PendingRuleJob>,
1097    event_id: Option<String>,
1098}
1099
1100impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1101    fn visit_content<'b>(
1102        &'b mut self,
1103        path: &Path<'a>,
1104        content: &str,
1105        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1106        exclusion_check: ExclusionCheck<'b>,
1107    ) -> Result<bool, ScannerError> {
1108        // matches for a single path
1109        let mut path_rules_matches = vec![];
1110
1111        // Create a map of per rule type data that can be shared between rules of the same type
1112        let mut per_string_data = SharedData::new();
1113        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1114
1115        rule_visitor.visit_rule_indices(|rule_index| {
1116            if self.blocked_rules.contains(&rule_index) {
1117                return Ok(());
1118            }
1119            let rule = &self.scanner.rules[rule_index];
1120            {
1121                if rule.inner.allow_scanner_to_exclude_namespace() {
1122                    // check if the path is excluded
1123                    if exclusion_check.is_excluded(rule_index) {
1124                        return Ok(());
1125                    }
1126                }
1127                // creating the emitter is basically free, it will get mostly optimized away
1128                let mut emitter = |rule_match: StringMatch| {
1129                    // This should never happen, but to ensure no empty match is ever generated
1130                    // (which may cause an infinite loop), this will panic instead.
1131                    assert_ne!(
1132                        rule_match.start, rule_match.end,
1133                        "empty match detected on rule with index {rule_index}"
1134                    );
1135                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1136                };
1137
1138                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1139
1140                // TODO: move this somewhere higher?
1141                rule.init_per_event_data(&mut self.per_event_data);
1142
1143                let mut ctx = StringMatchesCtx {
1144                    rule_index,
1145                    regex_caches: self.regex_caches,
1146                    exclusion_check: &exclusion_check,
1147                    excluded_matches: self.excluded_matches,
1148                    match_emitter: &mut emitter,
1149                    wildcard_indices: wildcard_indices_per_path,
1150                    enable_debug_observability: self
1151                        .scanner
1152                        .scanner_features
1153                        .enable_debug_observability,
1154                    per_string_data: &mut per_string_data,
1155                    per_scanner_data: &self.scanner.per_scanner_data,
1156                    per_event_data: &mut self.per_event_data,
1157                    event_id: self.event_id.as_deref(),
1158                };
1159
1160                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1161
1162                match async_status {
1163                    RuleStatus::Done => {
1164                        // nothing to do
1165                    }
1166                    RuleStatus::Pending(fut) => {
1167                        self.async_jobs.push(PendingRuleJob {
1168                            fut,
1169                            path: path.into_static(),
1170                        });
1171                    }
1172                }
1173            }
1174            Ok(())
1175        })?;
1176
1177        // If there are any matches, the string will need to be accessed to check for false positives from
1178        // excluded matches, any to potentially mutate the string.
1179        // If there are any async jobs, this is also true since it's not known yet whether there
1180        // will be a match
1181        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1182
1183        self.rule_matches
1184            .push_sync_matches(path, path_rules_matches);
1185
1186        Ok(needs_to_access_content)
1187    }
1188}
1189
// Calculates the next starting position for a regex match if the previous match is a false positive.
//
// The result is the byte offset of the character that follows the first character of the match,
// or `None` when the match starts on the last character of `content` (nothing is left to scan).
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;

    // `nth(1)` is the second char of the remaining text; its offset is relative to `match_start`.
    content[match_start..]
        .char_indices()
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1200
1201fn is_false_positive_match(
1202    regex_match_range: (usize, usize),
1203    rule: &RegexCompiledRule,
1204    content: &str,
1205    check_excluded_keywords: bool,
1206) -> bool {
1207    if check_excluded_keywords
1208        && let Some(excluded_keywords) = &rule.excluded_keywords
1209        && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1210    {
1211        return true;
1212    }
1213
1214    if let Some(validator) = rule.validator.as_ref()
1215        && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1216    {
1217        return true;
1218    }
1219    false
1220}