// dd_sds/scanner/mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{
28    CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
29};
30use ahash::{AHashMap, AHashSet};
31use futures::executor::block_on;
32use regex_automata::Match;
33use serde::{Deserialize, Serialize};
34use serde_with::serde_as;
35use std::ops::Deref;
36use std::pin::Pin;
37use std::sync::Arc;
38use std::time::{Duration, Instant};
39use tokio::task::JoinHandle;
40use tokio::time::timeout;
41
42pub mod config;
43pub mod error;
44pub mod metrics;
45pub mod regex_rule;
46pub mod scope;
47pub mod shared_data;
48pub mod shared_pool;
49pub mod suppression;
50
51mod internal_rule_match_set;
52#[cfg(test)]
53mod test;
54
/// Position of a single match inside a scanned string.
///
/// NOTE(review): the offsets are presumably UTF-8 byte indices forming a half-open
/// `start..end` range — confirm against the rule implementations that emit them.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset where the match begins.
    pub start: usize,
    /// Offset where the match ends.
    pub end: usize,
}
60
61pub trait MatchEmitter<T = ()> {
62    fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
66// struct that implements MatchEmitter)
67impl<F, T> MatchEmitter<T> for F
68where
69    F: FnMut(StringMatch) -> T,
70{
71    fn emit(&mut self, string_match: StringMatch) -> T {
72        // This just calls the closure (itself)
73        (self)(string_match)
74    }
75}
76
77#[serde_as]
78#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
79pub struct RootRuleConfig<T> {
80    pub match_action: MatchAction,
81    #[serde(default)]
82    pub scope: Scope,
83    #[deprecated(note = "Use `third_party_active_checker` instead")]
84    match_validation_type: Option<MatchValidationType>,
85    third_party_active_checker: Option<MatchValidationType>,
86    suppressions: Option<Suppressions>,
87    #[serde(flatten)]
88    pub inner: T,
89}
90
91impl<T> RootRuleConfig<T>
92where
93    T: RuleConfig + 'static,
94{
95    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
96        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
97    }
98
99    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
100        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
101    }
102}
103
104impl<T> RootRuleConfig<T> {
105    pub fn new(inner: T) -> Self {
106        #[allow(deprecated)]
107        Self {
108            match_action: MatchAction::None,
109            scope: Scope::all(),
110            match_validation_type: None,
111            third_party_active_checker: None,
112            suppressions: None,
113            inner,
114        }
115    }
116
117    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
118        #[allow(deprecated)]
119        RootRuleConfig {
120            match_action: self.match_action,
121            scope: self.scope,
122            match_validation_type: self.match_validation_type,
123            third_party_active_checker: self.third_party_active_checker,
124            suppressions: self.suppressions,
125            inner: func(self.inner),
126        }
127    }
128
129    pub fn match_action(mut self, action: MatchAction) -> Self {
130        self.match_action = action;
131        self
132    }
133
134    pub fn scope(mut self, scope: Scope) -> Self {
135        self.scope = scope;
136        self
137    }
138
139    pub fn third_party_active_checker(
140        mut self,
141        match_validation_type: MatchValidationType,
142    ) -> Self {
143        self.third_party_active_checker = Some(match_validation_type);
144        self
145    }
146
147    pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
148        self.suppressions = Some(suppressions);
149        self
150    }
151
152    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
153        #[allow(deprecated)]
154        self.third_party_active_checker
155            .as_ref()
156            .or(self.match_validation_type.as_ref())
157    }
158}
159
160impl<T> Deref for RootRuleConfig<T> {
161    type Target = T;
162
163    fn deref(&self) -> &Self::Target {
164        &self.inner
165    }
166}
167pub struct RootCompiledRule {
168    pub inner: Box<dyn CompiledRule>,
169    pub scope: Scope,
170    pub match_action: MatchAction,
171    pub match_validation_type: Option<MatchValidationType>,
172    pub suppressions: Option<CompiledSuppressions>,
173}
174
175impl RootCompiledRule {
176    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
177        self.match_validation_type
178            .as_ref()
179            .map(|x| x.get_internal_match_validation_type())
180    }
181}
182
183impl Deref for RootCompiledRule {
184    type Target = dyn CompiledRule;
185
186    fn deref(&self) -> &Self::Target {
187        self.inner.as_ref()
188    }
189}
190
191pub struct StringMatchesCtx<'a> {
192    rule_index: usize,
193    pub regex_caches: &'a mut RegexCaches,
194    pub exclusion_check: &'a ExclusionCheck<'a>,
195    pub excluded_matches: &'a mut AHashSet<String>,
196    pub match_emitter: &'a mut dyn MatchEmitter,
197    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
198
199    // Shared Data
200    pub per_string_data: &'a mut SharedData,
201    pub per_scanner_data: &'a SharedData,
202    pub per_event_data: &'a mut SharedData,
203}
204
205impl StringMatchesCtx<'_> {
206    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
207    /// this function can be used to return an "async job" to find matches. The return value
208    /// of `process_async` should be returned from the `get_string_matches` function. The future
209    /// passed into this function will be spawned and executed immediately without blocking
210    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
211    ///
212    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
213    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
214    /// it should be accessed before `process_async` is called.
215    pub fn process_async(
216        &self,
217        func: impl for<'a> FnOnce(
218            &'a mut AsyncStringMatchesCtx,
219        )
220            -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
221        + Send
222        + 'static,
223    ) -> RuleResult {
224        let rule_index = self.rule_index;
225
226        // The future is spawned onto the tokio runtime immediately so it starts running
227        // in the background
228        let fut = TOKIO_RUNTIME.spawn(async move {
229            let mut ctx = AsyncStringMatchesCtx {
230                rule_matches: vec![],
231            };
232            (func)(&mut ctx).await?;
233
234            Ok(AsyncRuleInfo {
235                rule_index,
236                rule_matches: ctx.rule_matches,
237            })
238        });
239
240        Ok(RuleStatus::Pending(fut))
241    }
242}
243
244pub struct AsyncStringMatchesCtx {
245    rule_matches: Vec<StringMatch>,
246}
247
248impl AsyncStringMatchesCtx {
249    pub fn emit_match(&mut self, string_match: StringMatch) {
250        self.rule_matches.push(string_match);
251    }
252}
253
254#[must_use]
255pub enum RuleStatus {
256    Done,
257    Pending(PendingRuleResult),
258}
259
260// pub type PendingRuleResult = BoxFuture<'static, Result<AsyncRuleInfo, ScannerError>>;
261pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
262
263pub struct PendingRuleJob {
264    fut: PendingRuleResult,
265    path: Path<'static>,
266}
267
268pub struct AsyncRuleInfo {
269    rule_index: usize,
270    rule_matches: Vec<StringMatch>,
271}
272
273/// A rule result that cannot be async
274pub type RuleResult = Result<RuleStatus, ScannerError>;
275
276// This is the public trait that is used to define the behavior of a compiled rule.
277pub trait CompiledRule: Send + Sync {
278    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
279        // by default, no per-scanner data is initialized
280    }
281
282    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
283        // by default, no per-string data is initialized
284    }
285
286    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
287        // by default, no per-event data is initialized
288    }
289
290    fn get_string_matches(
291        &self,
292        content: &str,
293        path: &Path,
294        ctx: &mut StringMatchesCtx<'_>,
295    ) -> RuleResult;
296
297    // Whether a match from this rule should be excluded (marked as a false-positive)
298    // if the content of this match was found in a match from an excluded scope
299    fn should_exclude_multipass_v0(&self) -> bool {
300        // default is to NOT use Multi-pass V0
301        false
302    }
303
304    fn on_excluded_match_multipass_v0(&self) {
305        // default is to do nothing
306    }
307}
308
309impl<T> RuleConfig for Box<T>
310where
311    T: RuleConfig + ?Sized,
312{
313    fn convert_to_compiled_rule(
314        &self,
315        rule_index: usize,
316        labels: Labels,
317    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
318        self.as_ref().convert_to_compiled_rule(rule_index, labels)
319    }
320}
321
/// Feature switches controlling optional scanner behavior.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Whether implicit index wildcards are added.
    pub add_implicit_index_wildcards: bool,
    /// Whether "Multi-pass V0" filtering of excluded matches is applied.
    pub multipass_v0_enabled: bool,
    /// Whether the matched content is collected for returned matches.
    pub return_matches: bool,
    /// Temporary flag: skip (disable) regex rules that match an empty string instead of
    /// failing the creation of the entire scanner.
    pub skip_rules_with_regex_matching_empty_string: bool,
}
331
332impl Default for ScannerFeatures {
333    fn default() -> Self {
334        Self {
335            add_implicit_index_wildcards: false,
336            multipass_v0_enabled: true,
337            return_matches: false,
338            skip_rules_with_regex_matching_empty_string: false,
339        }
340    }
341}
342
343pub struct ScanOptions {
344    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
345    // this list shall be small (<10), so a linear search is acceptable otherwise performance will be impacted.
346    pub blocked_rules_idx: Vec<usize>,
347    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
348    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
349    // Whether to validate matches using third-party validators (e.g., checksum validation for credit cards).
350    // When enabled, the scanner automatically collects match content needed for validation.
351    pub validate_matches: bool,
352}
353
354impl Default for ScanOptions {
355    fn default() -> Self {
356        Self {
357            blocked_rules_idx: vec![],
358            wildcarded_indices: AHashMap::new(),
359            validate_matches: false,
360        }
361    }
362}
363
364pub struct ScanOptionBuilder {
365    blocked_rules_idx: Vec<usize>,
366    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
367    validate_matches: bool,
368}
369
370impl ScanOptionBuilder {
371    pub fn new() -> Self {
372        Self {
373            blocked_rules_idx: vec![],
374            wildcarded_indices: AHashMap::new(),
375            validate_matches: false,
376        }
377    }
378
379    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
380        self.blocked_rules_idx = blocked_rules_idx;
381        self
382    }
383
384    pub fn with_wildcarded_indices(
385        mut self,
386        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
387    ) -> Self {
388        self.wildcarded_indices = wildcarded_indices;
389        self
390    }
391
392    pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
393        self.validate_matches = validate_matches;
394        self
395    }
396
397    pub fn build(self) -> ScanOptions {
398        ScanOptions {
399            blocked_rules_idx: self.blocked_rules_idx,
400            wildcarded_indices: self.wildcarded_indices,
401            validate_matches: self.validate_matches,
402        }
403    }
404}
405
406pub struct Scanner {
407    rules: Vec<RootCompiledRule>,
408    scoped_ruleset: ScopedRuleSet,
409    scanner_features: ScannerFeatures,
410    metrics: ScannerMetrics,
411    labels: Labels,
412    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
413    per_scanner_data: SharedData,
414    async_scan_timeout: Duration,
415}
416
417impl Scanner {
418    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
419        ScannerBuilder::new(rules)
420    }
421
422    // This function scans the given event with the rules configured in the scanner.
423    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
424    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
425    // This version uses default scan options (no validation, no blocked rules, no wildcarded indices).
426    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
427        self.scan_with_options(event, ScanOptions::default())
428    }
429
430    // This function scans the given event with the rules configured in the scanner.
431    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
432    // The options parameter allows customizing the scan behavior (validation, blocked rules, etc.).
433    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
434    pub fn scan_with_options<E: Event>(
435        &self,
436        event: &mut E,
437        options: ScanOptions,
438    ) -> Result<Vec<RuleMatch>, ScannerError> {
439        block_on(self.internal_scan_with_metrics(event, options))
440    }
441
442    // This function scans the given event with the rules configured in the scanner.
443    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
444    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
445    pub async fn scan_async<E: Event>(
446        &self,
447        event: &mut E,
448    ) -> Result<Vec<RuleMatch>, ScannerError> {
449        self.scan_async_with_options(event, ScanOptions::default())
450            .await
451    }
452
453    pub async fn scan_async_with_options<E: Event>(
454        &self,
455        event: &mut E,
456        options: ScanOptions,
457    ) -> Result<Vec<RuleMatch>, ScannerError> {
458        let fut = self.internal_scan_with_metrics(event, options);
459
460        // The sleep from the timeout requires being in a tokio context
461        // The guard needs to be dropped before await since the guard is !Send
462        let timeout = {
463            let _tokio_guard = TOKIO_RUNTIME.enter();
464            timeout(self.async_scan_timeout, fut)
465        };
466
467        timeout.await.unwrap_or(Err(ScannerError::Transient(
468            "Async scan timeout".to_string(),
469        )))
470    }
471
472    fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
473        // Record detection time
474        self.metrics
475            .duration_ns
476            .increment(start.elapsed().as_nanos() as u64);
477        // Add number of scanned events
478        self.metrics.num_scanned_events.increment(1);
479        // Add number of matches
480        self.metrics
481            .match_count
482            .increment(output_rule_matches.len() as u64);
483    }
484
485    async fn internal_scan_with_metrics<E: Event>(
486        &self,
487        event: &mut E,
488        options: ScanOptions,
489    ) -> Result<Vec<RuleMatch>, ScannerError> {
490        let start = Instant::now();
491        let result = self.internal_scan(event, options).await;
492        match &result {
493            Ok(rule_matches) => {
494                self.record_metrics(rule_matches, start);
495            }
496            Err(_) => {
497                self.record_metrics(&[], start);
498            }
499        }
500        result
501    }
502
503    async fn internal_scan<E: Event>(
504        &self,
505        event: &mut E,
506        options: ScanOptions,
507    ) -> Result<Vec<RuleMatch>, ScannerError> {
508        // If validation is requested, we need to collect match content even if the scanner
509        // wasn't originally configured to return matches
510        let need_match_content = self.scanner_features.return_matches || options.validate_matches;
511        // All matches, after some (but not all) false-positives have been removed.
512        let mut rule_matches = InternalRuleMatchSet::new();
513        let mut excluded_matches = AHashSet::new();
514        let mut async_jobs = vec![];
515
516        access_regex_caches(|regex_caches| {
517            self.scoped_ruleset.visit_string_rule_combinations(
518                event,
519                ScannerContentVisitor {
520                    scanner: self,
521                    regex_caches,
522                    rule_matches: &mut rule_matches,
523                    blocked_rules: &options.blocked_rules_idx,
524                    excluded_matches: &mut excluded_matches,
525                    per_event_data: SharedData::new(),
526                    wildcarded_indexes: &options.wildcarded_indices,
527                    async_jobs: &mut async_jobs,
528                },
529            )
530        })?;
531
532        // The async jobs were already spawned on the tokio runtime, so the
533        // results just need to be collected
534        for job in async_jobs {
535            let rule_info = job.fut.await.unwrap()?;
536            rule_matches.push_async_matches(
537                &job.path,
538                rule_info
539                    .rule_matches
540                    .into_iter()
541                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
542            );
543        }
544
545        let mut output_rule_matches = vec![];
546
547        for (path, mut rule_matches) in rule_matches.into_iter() {
548            // All rule matches in each inner list are for a single path, so they can be processed independently.
549            event.visit_string_mut(&path, |content| {
550                // calculate_indices requires that matches are sorted by start index
551                rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
552
553                <<E as Event>::Encoding>::calculate_indices(
554                    content,
555                    rule_matches.iter_mut().map(
556                        |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
557                            utf8_start: rule_match.utf8_start,
558                            utf8_end: rule_match.utf8_end,
559                            custom_start: &mut rule_match.custom_start,
560                            custom_end: &mut rule_match.custom_end,
561                        },
562                    ),
563                );
564
565                if self.scanner_features.multipass_v0_enabled {
566                    // Now that the `excluded_matches` set is fully populated, filter out any matches
567                    // that are the same as excluded matches (also known as "Multi-pass V0")
568                    rule_matches.retain(|rule_match| {
569                        if self.rules[rule_match.rule_index]
570                            .inner
571                            .should_exclude_multipass_v0()
572                        {
573                            let is_false_positive = excluded_matches
574                                .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
575                            if is_false_positive && self.scanner_features.multipass_v0_enabled {
576                                self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
577                            }
578                            !is_false_positive
579                        } else {
580                            true
581                        }
582                    });
583                }
584
585                self.suppress_matches::<E::Encoding>(&mut rule_matches, content);
586
587                self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
588
589                let will_mutate = rule_matches
590                    .iter()
591                    .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
592
593                self.apply_match_actions(
594                    content,
595                    &path,
596                    &mut rule_matches,
597                    &mut output_rule_matches,
598                    need_match_content,
599                );
600
601                will_mutate
602            });
603        }
604
605        if options.validate_matches {
606            self.validate_matches(&mut output_rule_matches);
607        }
608
609        Ok(output_rule_matches)
610    }
611
612    pub fn suppress_matches<E: Encoding>(
613        &self,
614        rule_matches: &mut Vec<InternalRuleMatch<E>>,
615        content: &str,
616    ) {
617        rule_matches.retain(|rule_match| {
618            if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
619                !suppressions.should_match_be_suppressed(content)
620            } else {
621                true
622            }
623        });
624    }
625
626    pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
627        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
628        let mut match_validator_rule_match_per_type = AHashMap::new();
629
630        let mut validated_rule_matches = vec![];
631
632        for mut rule_match in rule_matches.drain(..) {
633            let rule = &self.rules[rule_match.rule_index];
634            if let Some(match_validation_type) = rule.internal_match_validation_type() {
635                match_validator_rule_match_per_type
636                    .entry(match_validation_type)
637                    .or_insert_with(Vec::new)
638                    .push(rule_match)
639            } else {
640                // There is no match validator for this rule, so mark it as not available.
641                rule_match.match_status.merge(MatchStatus::NotAvailable);
642                validated_rule_matches.push(rule_match);
643            }
644        }
645
646        RAYON_THREAD_POOL.install(|| {
647            use rayon::prelude::*;
648
649            match_validator_rule_match_per_type.par_iter_mut().for_each(
650                |(match_validation_type, matches_per_type)| {
651                    let match_validator = self.match_validators_per_type.get(match_validation_type);
652                    if let Some(match_validator) = match_validator {
653                        match_validator
654                            .as_ref()
655                            .validate(matches_per_type, &self.rules)
656                    }
657                },
658            );
659        });
660
661        // Refill the rule_matches with the validated matches
662        for (_, mut matches) in match_validator_rule_match_per_type {
663            validated_rule_matches.append(&mut matches);
664        }
665
666        // Sort rule_matches by start index
667        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
668        *rule_matches = validated_rule_matches;
669    }
670
671    /// Apply mutations from actions, and shift indices to match the mutated values.
672    /// This assumes the matches are all from the content given, and are sorted by start index.
673    fn apply_match_actions<E: Encoding>(
674        &self,
675        content: &mut String,
676        path: &Path<'static>,
677        rule_matches: &mut [InternalRuleMatch<E>],
678        output_rule_matches: &mut Vec<RuleMatch>,
679        need_match_content: bool,
680    ) {
681        let mut utf8_byte_delta: isize = 0;
682        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
683
684        for rule_match in rule_matches {
685            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
686                content,
687                path.clone(),
688                rule_match,
689                &mut utf8_byte_delta,
690                &mut custom_index_delta,
691                need_match_content,
692            ));
693        }
694    }
695
696    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
697    fn apply_match_actions_for_string<E: Encoding>(
698        &self,
699        content: &mut String,
700        path: Path<'static>,
701        rule_match: &InternalRuleMatch<E>,
702        // The current difference in length between the original and mutated string
703        utf8_byte_delta: &mut isize,
704
705        // The difference between the custom index on the original string and the mutated string
706        custom_index_delta: &mut <E>::IndexShift,
707        need_match_content: bool,
708    ) -> RuleMatch {
709        let rule = &self.rules[rule_match.rule_index];
710
711        let custom_start =
712            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
713                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
714
715        let mut matched_content_copy = None;
716
717        if need_match_content {
718            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
719            let mutated_utf8_match_start =
720                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
721            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
722
723            // Matches for mutating rules must have valid indices
724            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
725            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
726
727            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
728            matched_content_copy = Some(matched_content.to_string());
729        }
730
731        if rule.match_action.is_mutating() {
732            let mutated_utf8_match_start =
733                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
734            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
735
736            // Matches for mutating rules must have valid indices
737            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
738            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
739
740            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
741            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
742                let before_replacement = &matched_content[replacement.start..replacement.end];
743
744                // update indices to match the new mutated content
745                <E>::adjust_shift(
746                    custom_index_delta,
747                    before_replacement,
748                    &replacement.replacement,
749                );
750                *utf8_byte_delta +=
751                    replacement.replacement.len() as isize - before_replacement.len() as isize;
752
753                let replacement_start = mutated_utf8_match_start + replacement.start;
754                let replacement_end = mutated_utf8_match_start + replacement.end;
755                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
756            }
757        }
758
759        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
760        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
761            + shift_offset) as usize;
762
763        let rule = &self.rules[rule_match.rule_index];
764
765        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
766            MatchStatus::NotChecked
767        } else {
768            MatchStatus::NotAvailable
769        };
770
771        RuleMatch {
772            rule_index: rule_match.rule_index,
773            path,
774            replacement_type: rule.match_action.replacement_type(),
775            start_index: custom_start,
776            end_index_exclusive: custom_end,
777            shift_offset,
778            match_value: matched_content_copy,
779            match_status,
780        }
781    }
782
783    fn sort_and_remove_overlapping_rules<E: Encoding>(
784        &self,
785        rule_matches: &mut Vec<InternalRuleMatch<E>>,
786    ) {
787        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
788        // Be very careful if this function is modified.
789
790        rule_matches.sort_unstable_by(|a, b| {
791            // Mutating rules are a higher priority (earlier in the list)
792            let ord = self.rules[a.rule_index]
793                .match_action
794                .is_mutating()
795                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
796                .reverse();
797
798            // Earlier start offset
799            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
800
801            // Longer matches
802            let ord = ord.then(a.len().cmp(&b.len()).reverse());
803
804            // Matches from earlier rules
805            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
806
807            // swap the order of everything so matches can be efficiently popped off the back as they are processed
808            ord.reverse()
809        });
810
811        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
812
813        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
814            if self.rules[rule_match.rule_index].match_action.is_mutating() {
815                // Mutating rules are kept only if they don't overlap with a previous rule.
816                if let Some(last) = retained_rules.last()
817                    && last.utf8_end > rule_match.utf8_start
818                {
819                    continue;
820                }
821            } else {
822                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
823                // this needs to check all retained matches (instead of just the last one)
824                for retained_rule in &retained_rules {
825                    if retained_rule.utf8_start < rule_match.utf8_end
826                        && retained_rule.utf8_end > rule_match.utf8_start
827                    {
828                        continue 'rule_matches;
829                    }
830                }
831            };
832            retained_rules.push(rule_match);
833        }
834
835        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
836        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
837
838        *rule_matches = retained_rules;
839    }
840}
841
842impl Drop for Scanner {
843    fn drop(&mut self) {
844        let stats = &*GLOBAL_STATS;
845        stats.scanner_deletions.increment(1);
846        stats.decrement_total_scanners();
847    }
848}
849
850#[derive(Default)]
851pub struct ScannerBuilder<'a> {
852    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
853    labels: Labels,
854    scanner_features: ScannerFeatures,
855    async_scan_timeout: Duration,
856}
857
858impl ScannerBuilder<'_> {
859    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
860        ScannerBuilder {
861            rules,
862            labels: Labels::empty(),
863            scanner_features: ScannerFeatures::default(),
864            async_scan_timeout: Duration::from_secs(60 * 5),
865        }
866    }
867
868    pub fn labels(mut self, labels: Labels) -> Self {
869        self.labels = labels;
870        self
871    }
872
873    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
874        self.async_scan_timeout = duration;
875        self
876    }
877
878    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
879        self.scanner_features.add_implicit_index_wildcards = value;
880        self
881    }
882
883    pub fn with_return_matches(mut self, value: bool) -> Self {
884        self.scanner_features.return_matches = value;
885        self
886    }
887
888    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
889    /// Multipass V0 saves matches from excluded scopes, and marks any identical
890    /// matches in included scopes as a false positive.
891    pub fn with_multipass_v0(mut self, value: bool) -> Self {
892        self.scanner_features.multipass_v0_enabled = value;
893        self
894    }
895
896    pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
897        self.scanner_features
898            .skip_rules_with_regex_matching_empty_string = value;
899        self
900    }
901
902    pub fn build(self) -> Result<Scanner, CreateScannerError> {
903        let mut match_validators_per_type = AHashMap::new();
904
905        for rule in self.rules.iter() {
906            if let Some(match_validation_type) = &rule.get_third_party_active_checker()
907                && match_validation_type.can_create_match_validator()
908            {
909                let internal_type = match_validation_type.get_internal_match_validation_type();
910                let match_validator = match_validation_type.into_match_validator();
911                if let Ok(match_validator) = match_validator {
912                    if !match_validators_per_type.contains_key(&internal_type) {
913                        match_validators_per_type.insert(internal_type, match_validator);
914                    }
915                } else {
916                    return Err(CreateScannerError::InvalidMatchValidator(
917                        MatchValidatorCreationError::InternalError,
918                    ));
919                }
920            }
921        }
922
923        let compiled_rules = self
924            .rules
925            .iter()
926            .enumerate()
927            .filter_map(|(rule_index, config)| {
928                let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
929                    Ok(inner) => Ok(inner),
930                    Err(err) => {
931                        if self
932                            .scanner_features
933                            .skip_rules_with_regex_matching_empty_string
934                            && err
935                            == CreateScannerError::InvalidRegex(
936                            RegexValidationError::MatchesEmptyString,
937                        )
938                        {
939                            // this is a temporary feature to skip rules that should be considered invalid.
940                            #[allow(clippy::print_stdout)]
941                            {
942                                println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
943                            }
944                            return None;
945                        } else {
946                            Err(err)
947                        }
948                    }
949                };
950                Some((config, inner))
951            })
952            .map(|(config, inner)| {
953                config.match_action.validate()?;
954                Ok(RootCompiledRule {
955                    inner: inner?,
956                    scope: config.scope.clone(),
957                    match_action: config.match_action.clone(),
958                    match_validation_type: config.get_third_party_active_checker().cloned(),
959                    suppressions: config.suppressions.clone().map(|config| config.into()),
960                })
961            })
962            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
963
964        let mut per_scanner_data = SharedData::new();
965
966        compiled_rules.iter().for_each(|rule| {
967            rule.init_per_scanner_data(&mut per_scanner_data);
968        });
969
970        let scoped_ruleset = ScopedRuleSet::new(
971            &compiled_rules
972                .iter()
973                .map(|rule| rule.scope.clone())
974                .collect::<Vec<_>>(),
975        )
976        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
977
978        {
979            let stats = &*GLOBAL_STATS;
980            stats.scanner_creations.increment(1);
981            stats.increment_total_scanners();
982        }
983
984        Ok(Scanner {
985            rules: compiled_rules,
986            scoped_ruleset,
987            scanner_features: self.scanner_features,
988            metrics: ScannerMetrics::new(&self.labels),
989            match_validators_per_type,
990            labels: self.labels,
991            per_scanner_data,
992            async_scan_timeout: self.async_scan_timeout,
993        })
994    }
995}
996
997struct ScannerContentVisitor<'a, E: Encoding> {
998    scanner: &'a Scanner,
999    regex_caches: &'a mut RegexCaches,
1000    rule_matches: &'a mut InternalRuleMatchSet<E>,
1001    // Rules that shall be skipped for this scan
1002    // This list shall be small (<10), so a linear search is acceptable
1003    blocked_rules: &'a Vec<usize>,
1004    excluded_matches: &'a mut AHashSet<String>,
1005    per_event_data: SharedData,
1006    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1007    async_jobs: &'a mut Vec<PendingRuleJob>,
1008}
1009
1010impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1011    fn visit_content<'b>(
1012        &'b mut self,
1013        path: &Path<'a>,
1014        content: &str,
1015        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1016        exclusion_check: ExclusionCheck<'b>,
1017    ) -> Result<bool, ScannerError> {
1018        // matches for a single path
1019        let mut path_rules_matches = vec![];
1020
1021        // Create a map of per rule type data that can be shared between rules of the same type
1022        let mut per_string_data = SharedData::new();
1023        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1024
1025        rule_visitor.visit_rule_indices(|rule_index| {
1026            if self.blocked_rules.contains(&rule_index) {
1027                return Ok(());
1028            }
1029            let rule = &self.scanner.rules[rule_index];
1030            {
1031                // creating the emitter is basically free, it will get mostly optimized away
1032                let mut emitter = |rule_match: StringMatch| {
1033                    // This should never happen, but to ensure no empty match is ever generated
1034                    // (which may cause an infinite loop), this will panic instead.
1035                    assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1036                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1037                };
1038
1039                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1040
1041                // TODO: move this somewhere higher?
1042                rule.init_per_event_data(&mut self.per_event_data);
1043
1044                let mut ctx = StringMatchesCtx {
1045                    rule_index,
1046                    regex_caches: self.regex_caches,
1047                    exclusion_check: &exclusion_check,
1048                    excluded_matches: self.excluded_matches,
1049                    match_emitter: &mut emitter,
1050                    wildcard_indices: wildcard_indices_per_path,
1051                    per_string_data: &mut per_string_data,
1052                    per_scanner_data: &self.scanner.per_scanner_data,
1053                    per_event_data: &mut self.per_event_data,
1054                };
1055
1056                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1057
1058                match async_status {
1059                    RuleStatus::Done => {
1060                        // nothing to do
1061                    }
1062                    RuleStatus::Pending(fut) => {
1063                        self.async_jobs.push(PendingRuleJob {
1064                            fut,
1065                            path: path.into_static(),
1066                        });
1067                    }
1068                }
1069            }
1070            Ok(())
1071        })?;
1072
1073        // If there are any matches, the string will need to be accessed to check for false positives from
1074        // excluded matches, any to potentially mutate the string.
1075        // If there are any async jobs, this is also true since it's not known yet whether there
1076        // will be a match
1077        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1078
1079        self.rule_matches
1080            .push_sync_matches(path, path_rules_matches);
1081
1082        Ok(needs_to_access_content)
1083    }
1084}
1085
1086// Calculates the next starting position for a regex match if a the previous match is a false positive
1087fn get_next_regex_start(content: &str, regex_match: &Match) -> Option<usize> {
1088    // The next valid UTF8 char after the start of the regex match is used
1089    if let Some((i, _)) = content[regex_match.start()..].char_indices().nth(1) {
1090        Some(regex_match.start() + i)
1091    } else {
1092        // There are no more chars left in the string to scan
1093        None
1094    }
1095}
1096
1097fn is_false_positive_match(
1098    regex_match: &Match,
1099    rule: &RegexCompiledRule,
1100    content: &str,
1101    check_excluded_keywords: bool,
1102) -> bool {
1103    if check_excluded_keywords
1104        && let Some(excluded_keywords) = &rule.excluded_keywords
1105        && excluded_keywords.is_false_positive_match(content, regex_match.start())
1106    {
1107        return true;
1108    }
1109
1110    if let Some(validator) = rule.validator.as_ref()
1111        && !validator.is_valid_match(&content[regex_match.range()])
1112    {
1113        return true;
1114    }
1115    false
1116}