dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{
28    CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
29};
30use ahash::{AHashMap, AHashSet};
31use futures::executor::block_on;
32use serde::{Deserialize, Serialize};
33use serde_with::serde_as;
34use std::ops::Deref;
35use std::pin::Pin;
36use std::sync::Arc;
37use std::time::{Duration, Instant};
38use tokio::task::JoinHandle;
39use tokio::time::timeout;
40
41pub mod config;
42pub mod error;
43pub mod metrics;
44pub mod regex_rule;
45pub mod scope;
46pub mod shared_data;
47pub mod shared_pool;
48pub mod suppression;
49
50mod internal_rule_match_set;
51#[cfg(test)]
52mod test;
53
/// A match found in a string, expressed as a pair of indices into that string.
///
/// The scanner later wraps each `StringMatch` into an internal rule match for the rule
/// that produced it.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct StringMatch {
    /// Index where the match starts.
    pub start: usize,
    /// Index where the match ends.
    pub end: usize,
}
59
/// Receives the matches found while scanning a string.
///
/// `T` is whatever the emitter returns for each reported match; it defaults to `()`.
pub trait MatchEmitter<T = ()> {
    /// Reports a single match.
    fn emit(&mut self, string_match: StringMatch) -> T;
}
63
64// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
65// struct that implements MatchEmitter)
66impl<F, T> MatchEmitter<T> for F
67where
68    F: FnMut(StringMatch) -> T,
69{
70    fn emit(&mut self, string_match: StringMatch) -> T {
71        // This just calls the closure (itself)
72        (self)(string_match)
73    }
74}
75
/// Settings shared by every rule, wrapped around the rule-specific configuration `inner`.
///
/// `inner` is flattened by serde, so its fields are (de)serialized at the same level as the
/// fields declared here.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Action associated with the matches of this rule.
    pub match_action: MatchAction,
    /// Scope of the rule. Falls back to `Scope::default()` when missing from the serialized form.
    #[serde(default)]
    pub scope: Scope,
    // Legacy name of `third_party_active_checker`; only consulted when that field is `None`.
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    // Optional match validation configuration; takes precedence over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    // Optional suppressions configured for this rule.
    suppressions: Option<Suppressions>,
    /// The rule-specific configuration.
    #[serde(flatten)]
    pub inner: T,
}
89
90impl<T> RootRuleConfig<T>
91where
92    T: RuleConfig + 'static,
93{
94    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
95        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
96    }
97
98    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
99        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
100    }
101}
102
103impl<T> RootRuleConfig<T> {
104    pub fn new(inner: T) -> Self {
105        #[allow(deprecated)]
106        Self {
107            match_action: MatchAction::None,
108            scope: Scope::all(),
109            match_validation_type: None,
110            third_party_active_checker: None,
111            suppressions: None,
112            inner,
113        }
114    }
115
116    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
117        #[allow(deprecated)]
118        RootRuleConfig {
119            match_action: self.match_action,
120            scope: self.scope,
121            match_validation_type: self.match_validation_type,
122            third_party_active_checker: self.third_party_active_checker,
123            suppressions: self.suppressions,
124            inner: func(self.inner),
125        }
126    }
127
128    pub fn match_action(mut self, action: MatchAction) -> Self {
129        self.match_action = action;
130        self
131    }
132
133    pub fn scope(mut self, scope: Scope) -> Self {
134        self.scope = scope;
135        self
136    }
137
138    pub fn third_party_active_checker(
139        mut self,
140        match_validation_type: MatchValidationType,
141    ) -> Self {
142        self.third_party_active_checker = Some(match_validation_type);
143        self
144    }
145
146    pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
147        self.suppressions = Some(suppressions);
148        self
149    }
150
151    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
152        #[allow(deprecated)]
153        self.third_party_active_checker
154            .as_ref()
155            .or(self.match_validation_type.as_ref())
156    }
157}
158
159impl<T> Deref for RootRuleConfig<T> {
160    type Target = T;
161
162    fn deref(&self) -> &Self::Target {
163        &self.inner
164    }
165}
/// A compiled rule together with the rule-independent settings the scanner needs for it.
pub struct RootCompiledRule {
    /// The rule-specific compiled rule.
    pub inner: Box<dyn CompiledRule>,
    /// Scope of the rule.
    pub scope: Scope,
    /// Action associated with the matches of this rule.
    pub match_action: MatchAction,
    /// Match validation configured for this rule, if any.
    pub match_validation_type: Option<MatchValidationType>,
    /// Compiled suppressions for this rule, if any.
    pub suppressions: Option<CompiledSuppressions>,
}
173
174impl RootCompiledRule {
175    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
176        self.match_validation_type
177            .as_ref()
178            .map(|x| x.get_internal_match_validation_type())
179    }
180}
181
182impl Deref for RootCompiledRule {
183    type Target = dyn CompiledRule;
184
185    fn deref(&self) -> &Self::Target {
186        self.inner.as_ref()
187    }
188}
189
/// Context handed to `CompiledRule::get_string_matches` while a rule is run against a string.
pub struct StringMatchesCtx<'a> {
    // Index of the rule being run; attached to the results of async jobs (see `process_async`).
    rule_index: usize,
    /// Regex caches available to the rule.
    pub regex_caches: &'a mut RegexCaches,
    /// Exclusion information for the string being scanned.
    // NOTE(review): exact semantics are defined by `ExclusionCheck` in `scoped_ruleset` — confirm there.
    pub exclusion_check: &'a ExclusionCheck<'a>,
    /// Set of excluded match contents; the scanner later uses it to filter matches of rules
    /// that opt into Multi-pass V0.
    pub excluded_matches: &'a mut AHashSet<String>,
    /// Receives the matches found by the rule.
    pub match_emitter: &'a mut dyn MatchEmitter,
    /// Optional list of `(start, end)` index pairs that should be treated as wildcards.
    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,

    // Shared Data
    /// Shared data for the current string (presumably set up via `init_per_string_data` — verify).
    pub per_string_data: &'a mut SharedData,
    /// Shared data owned by the scanner (presumably set up via `init_per_scanner_data` — verify).
    pub per_scanner_data: &'a SharedData,
    /// Shared data for the current event (presumably set up via `init_per_event_data` — verify).
    pub per_event_data: &'a mut SharedData,
    /// Identifier of the event being scanned, when the event provides one.
    pub event_id: Option<&'a str>,
}
204
205impl StringMatchesCtx<'_> {
206    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
207    /// this function can be used to return an "async job" to find matches. The return value
208    /// of `process_async` should be returned from the `get_string_matches` function. The future
209    /// passed into this function will be spawned and executed immediately without blocking
210    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
211    ///
212    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
213    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
214    /// it should be accessed before `process_async` is called.
215    pub fn process_async(
216        &self,
217        func: impl for<'a> FnOnce(
218            &'a mut AsyncStringMatchesCtx,
219        )
220            -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
221        + Send
222        + 'static,
223    ) -> RuleResult {
224        let rule_index = self.rule_index;
225
226        // The future is spawned onto the tokio runtime immediately so it starts running
227        // in the background
228        let fut = TOKIO_RUNTIME.spawn(async move {
229            let mut ctx = AsyncStringMatchesCtx {
230                rule_matches: vec![],
231            };
232            (func)(&mut ctx).await?;
233
234            Ok(AsyncRuleInfo {
235                rule_index,
236                rule_matches: ctx.rule_matches,
237            })
238        });
239
240        Ok(RuleStatus::Pending(fut))
241    }
242}
243
244pub struct AsyncStringMatchesCtx {
245    rule_matches: Vec<StringMatch>,
246}
247
248impl AsyncStringMatchesCtx {
249    pub fn emit_match(&mut self, string_match: StringMatch) {
250        self.rule_matches.push(string_match);
251    }
252}
253
/// Status returned by a rule after it was asked for the matches of a string.
#[must_use]
pub enum RuleStatus {
    /// The rule has finished.
    Done,
    /// The rule started an async job; the scanner awaits the handle to collect its matches.
    Pending(PendingRuleResult),
}
259
/// Handle to an async rule job that was spawned on the tokio runtime.
pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
262
/// An async rule job together with the path of the string it was started for.
pub struct PendingRuleJob {
    // Handle of the spawned job
    fut: PendingRuleResult,
    // Path the resulting matches are attached to
    path: Path<'static>,
}
267
/// Result of a completed async rule job.
pub struct AsyncRuleInfo {
    // Index of the rule that started the job
    rule_index: usize,
    // Matches emitted by the job
    rule_matches: Vec<StringMatch>,
}
272
/// The result of running a rule against a string: `RuleStatus::Done`, a
/// `RuleStatus::Pending` async job that still has to be awaited, or a `ScannerError`.
pub type RuleResult = Result<RuleStatus, ScannerError>;
275
/// The public trait that defines the behavior of a compiled rule.
///
/// Only `get_string_matches` is required; every other method has a default implementation.
pub trait CompiledRule: Send + Sync {
    /// Hook to initialize per-scanner shared data. Does nothing by default.
    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
        // by default, no per-scanner data is initialized
    }

    /// Hook to initialize per-string shared data. Does nothing by default.
    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
        // by default, no per-string data is initialized
    }

    /// Hook to initialize per-event shared data. Does nothing by default.
    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
        // by default, no per-event data is initialized
    }

    /// Finds the matches of this rule in `content`, the string located at `path`.
    ///
    /// `ctx` carries the caches, shared data and the match emitter. Work that must run
    /// asynchronously can be handed off with `StringMatchesCtx::process_async`, whose return
    /// value should then be returned from this function.
    fn get_string_matches(
        &self,
        content: &str,
        path: &Path,
        ctx: &mut StringMatchesCtx<'_>,
    ) -> RuleResult;

    /// Whether a match from this rule should be excluded (marked as a false-positive)
    /// if the content of this match was found in a match from an excluded scope.
    fn should_exclude_multipass_v0(&self) -> bool {
        // default is to NOT use Multi-pass V0
        false
    }

    /// Called by the scanner each time one of this rule's matches is dropped by Multi-pass V0.
    fn on_excluded_match_multipass_v0(&self) {
        // default is to do nothing
    }
}
308
309impl<T> RuleConfig for Box<T>
310where
311    T: RuleConfig + ?Sized,
312{
313    fn convert_to_compiled_rule(
314        &self,
315        rule_index: usize,
316        labels: Labels,
317    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
318        self.as_ref().convert_to_compiled_rule(rule_index, labels)
319    }
320}
321
/// Feature flags that tune the behavior of a `Scanner`.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    // NOTE(review): presumably adds implicit index wildcards to rule scopes — confirm in the builder.
    pub add_implicit_index_wildcards: bool,
    // When enabled, matches whose content equals an excluded match are dropped for rules
    // that opt in through `should_exclude_multipass_v0` ("Multi-pass V0").
    pub multipass_v0_enabled: bool,
    // When enabled, the matched content is copied into the returned rule matches.
    pub return_matches: bool,
    // This is a temporary flag to disable failed rules (instead of failing the entire scanner)
    // for regex rules that match an empty string
    pub skip_rules_with_regex_matching_empty_string: bool,
}
331
// Implemented by hand (not derived) because Multi-pass V0 is enabled by default.
impl Default for ScannerFeatures {
    fn default() -> Self {
        Self {
            add_implicit_index_wildcards: false,
            multipass_v0_enabled: true,
            return_matches: false,
            skip_rules_with_regex_matching_empty_string: false,
        }
    }
}
342
/// Per-scan options accepted by `Scanner::scan_with_options` and
/// `Scanner::scan_async_with_options`.
pub struct ScanOptions {
    /// Indices of the rules that should be skipped for this scan.
    /// The list is expected to be small (<10) so that a linear search stays cheap; a larger
    /// list will impact performance.
    pub blocked_rules_idx: Vec<usize>,
    /// For each path, a list of `(start, end)` index tuples that should be treated as
    /// wildcards (for the message key only).
    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
    /// Whether to validate matches using third-party validators (e.g., checksum validation for credit cards).
    /// When enabled, the scanner automatically collects the match content needed for validation.
    pub validate_matches: bool,
}
353
354impl Default for ScanOptions {
355    fn default() -> Self {
356        Self {
357            blocked_rules_idx: vec![],
358            wildcarded_indices: AHashMap::new(),
359            validate_matches: false,
360        }
361    }
362}
363
/// Builder for `ScanOptions`. Its fields mirror the fields of `ScanOptions`.
pub struct ScanOptionBuilder {
    // Indices of the rules to skip
    blocked_rules_idx: Vec<usize>,
    // Per-path `(start, end)` index tuples treated as wildcards
    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
    // Whether matches should be validated
    validate_matches: bool,
}
369
370impl ScanOptionBuilder {
371    pub fn new() -> Self {
372        Self {
373            blocked_rules_idx: vec![],
374            wildcarded_indices: AHashMap::new(),
375            validate_matches: false,
376        }
377    }
378
379    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
380        self.blocked_rules_idx = blocked_rules_idx;
381        self
382    }
383
384    pub fn with_wildcarded_indices(
385        mut self,
386        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
387    ) -> Self {
388        self.wildcarded_indices = wildcarded_indices;
389        self
390    }
391
392    pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
393        self.validate_matches = validate_matches;
394        self
395    }
396
397    pub fn build(self) -> ScanOptions {
398        ScanOptions {
399            blocked_rules_idx: self.blocked_rules_idx,
400            wildcarded_indices: self.wildcarded_indices,
401            validate_matches: self.validate_matches,
402        }
403    }
404}
405
/// Scans events with a fixed set of compiled rules.
pub struct Scanner {
    // Compiled rules; rule matches refer to them by index (`rule_index`)
    rules: Vec<RootCompiledRule>,
    // Drives the visit of every (string, rule) combination of an event
    scoped_ruleset: ScopedRuleSet,
    // Feature flags of this scanner
    scanner_features: ScannerFeatures,
    // Metrics updated on every scan (duration, scanned events, matches, suppressed matches)
    metrics: ScannerMetrics,
    // NOTE(review): presumably the labels attached to this scanner's metrics — confirm
    labels: Labels,
    // Validator to use for each internal match validation type
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    // Shared data with the lifetime of the scanner
    per_scanner_data: SharedData,
    // Maximum duration of an async scan before it fails with a transient error
    async_scan_timeout: Duration,
}
416
417impl Scanner {
    /// Returns a `ScannerBuilder` for a scanner that uses the given rules.
    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
        ScannerBuilder::new(rules)
    }
421
422    // This function scans the given event with the rules configured in the scanner.
423    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
424    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
425    // This version uses default scan options (no validation, no blocked rules, no wildcarded indices).
426    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
427        self.scan_with_options(event, ScanOptions::default())
428    }
429
430    // This function scans the given event with the rules configured in the scanner.
431    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
432    // The options parameter allows customizing the scan behavior (validation, blocked rules, etc.).
433    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
434    pub fn scan_with_options<E: Event>(
435        &self,
436        event: &mut E,
437        options: ScanOptions,
438    ) -> Result<Vec<RuleMatch>, ScannerError> {
439        block_on(self.internal_scan_with_metrics(event, options))
440    }
441
442    // This function scans the given event with the rules configured in the scanner.
443    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
444    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
445    pub async fn scan_async<E: Event>(
446        &self,
447        event: &mut E,
448    ) -> Result<Vec<RuleMatch>, ScannerError> {
449        self.scan_async_with_options(event, ScanOptions::default())
450            .await
451    }
452
453    pub async fn scan_async_with_options<E: Event>(
454        &self,
455        event: &mut E,
456        options: ScanOptions,
457    ) -> Result<Vec<RuleMatch>, ScannerError> {
458        let fut = self.internal_scan_with_metrics(event, options);
459
460        // The sleep from the timeout requires being in a tokio context
461        // The guard needs to be dropped before await since the guard is !Send
462        let timeout = {
463            let _tokio_guard = TOKIO_RUNTIME.enter();
464            timeout(self.async_scan_timeout, fut)
465        };
466
467        timeout.await.unwrap_or(Err(ScannerError::Transient(
468            "Async scan timeout".to_string(),
469        )))
470    }
471
472    fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
473        // Record detection time
474        self.metrics
475            .duration_ns
476            .increment(start.elapsed().as_nanos() as u64);
477        // Add number of scanned events
478        self.metrics.num_scanned_events.increment(1);
479        // Add number of matches
480        self.metrics
481            .match_count
482            .increment(output_rule_matches.len() as u64);
483    }
484
485    async fn internal_scan_with_metrics<E: Event>(
486        &self,
487        event: &mut E,
488        options: ScanOptions,
489    ) -> Result<Vec<RuleMatch>, ScannerError> {
490        let start = Instant::now();
491        let result = self.internal_scan(event, options).await;
492        match &result {
493            Ok(rule_matches) => {
494                self.record_metrics(rule_matches, start);
495            }
496            Err(_) => {
497                self.record_metrics(&[], start);
498            }
499        }
500        result
501    }
502
503    fn process_rule_matches<E: Event>(
504        &self,
505        event: &mut E,
506        rule_matches: InternalRuleMatchSet<E::Encoding>,
507        excluded_matches: AHashSet<String>,
508        output_rule_matches: &mut Vec<RuleMatch>,
509        need_match_content: bool,
510    ) {
511        if rule_matches.is_empty() {
512            return;
513        }
514        access_regex_caches(|regex_caches| {
515            for (path, mut rule_matches) in rule_matches.into_iter() {
516                // All rule matches in each inner list are for a single path, so they can be processed independently.
517                event.visit_string_mut(&path, |content| {
518                    // calculate_indices requires that matches are sorted by start index
519                    rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
520
521                    <<E as Event>::Encoding>::calculate_indices(
522                        content,
523                        rule_matches.iter_mut().map(
524                            |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
525                                utf8_start: rule_match.utf8_start,
526                                utf8_end: rule_match.utf8_end,
527                                custom_start: &mut rule_match.custom_start,
528                                custom_end: &mut rule_match.custom_end,
529                            },
530                        ),
531                    );
532
533                    if self.scanner_features.multipass_v0_enabled {
534                        // Now that the `excluded_matches` set is fully populated, filter out any matches
535                        // that are the same as excluded matches (also known as "Multi-pass V0")
536                        rule_matches.retain(|rule_match| {
537                            if self.rules[rule_match.rule_index]
538                                .inner
539                                .should_exclude_multipass_v0()
540                            {
541                                let is_false_positive = excluded_matches
542                                    .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
543                                if is_false_positive && self.scanner_features.multipass_v0_enabled {
544                                    self.rules[rule_match.rule_index]
545                                        .on_excluded_match_multipass_v0();
546                                }
547                                !is_false_positive
548                            } else {
549                                true
550                            }
551                        });
552                    }
553
554                    self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
555
556                    self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
557
558                    let will_mutate = rule_matches.iter().any(|rule_match| {
559                        self.rules[rule_match.rule_index].match_action.is_mutating()
560                    });
561
562                    self.apply_match_actions(
563                        content,
564                        &path,
565                        &mut rule_matches,
566                        output_rule_matches,
567                        need_match_content,
568                    );
569
570                    will_mutate
571                });
572            }
573        });
574    }
575
576    async fn internal_scan<E: Event>(
577        &self,
578        event: &mut E,
579        options: ScanOptions,
580    ) -> Result<Vec<RuleMatch>, ScannerError> {
581        // If validation is requested, we need to collect match content even if the scanner
582        // wasn't originally configured to return matches
583        let need_match_content = self.scanner_features.return_matches || options.validate_matches;
584        // All matches, after some (but not all) false-positives have been removed.
585        let mut rule_matches = InternalRuleMatchSet::new();
586        let mut excluded_matches = AHashSet::new();
587        let mut async_jobs = vec![];
588
589        access_regex_caches(|regex_caches| {
590            self.scoped_ruleset.visit_string_rule_combinations(
591                event,
592                ScannerContentVisitor {
593                    scanner: self,
594                    regex_caches,
595                    rule_matches: &mut rule_matches,
596                    blocked_rules: &options.blocked_rules_idx,
597                    excluded_matches: &mut excluded_matches,
598                    per_event_data: SharedData::new(),
599                    wildcarded_indexes: &options.wildcarded_indices,
600                    async_jobs: &mut async_jobs,
601                    event_id: event.get_id().map(|s| s.to_string()),
602                },
603            )
604        })?;
605
606        // The async jobs were already spawned on the tokio runtime, so the
607        // results just need to be collected
608        for job in async_jobs {
609            let rule_info = job.fut.await.unwrap()?;
610            rule_matches.push_async_matches(
611                &job.path,
612                rule_info
613                    .rule_matches
614                    .into_iter()
615                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
616            );
617        }
618
619        let mut output_rule_matches = vec![];
620
621        self.process_rule_matches(
622            event,
623            rule_matches,
624            excluded_matches,
625            &mut output_rule_matches,
626            need_match_content,
627        );
628
629        if options.validate_matches {
630            self.validate_matches(&mut output_rule_matches);
631        }
632
633        Ok(output_rule_matches)
634    }
635
636    pub fn suppress_matches<E: Encoding>(
637        &self,
638        rule_matches: &mut Vec<InternalRuleMatch<E>>,
639        content: &str,
640        regex_caches: &mut RegexCaches,
641    ) {
642        rule_matches.retain(|rule_match| {
643            if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
644                let match_should_be_suppressed = suppressions.should_match_be_suppressed(
645                    &content[rule_match.utf8_start..rule_match.utf8_end],
646                    regex_caches,
647                );
648
649                if match_should_be_suppressed {
650                    self.metrics.suppressed_match_count.increment(1);
651                }
652                !match_should_be_suppressed
653            } else {
654                true
655            }
656        });
657    }
658
659    pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
660        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
661        let mut match_validator_rule_match_per_type = AHashMap::new();
662
663        let mut validated_rule_matches = vec![];
664
665        for mut rule_match in rule_matches.drain(..) {
666            let rule = &self.rules[rule_match.rule_index];
667            if let Some(match_validation_type) = rule.internal_match_validation_type() {
668                match_validator_rule_match_per_type
669                    .entry(match_validation_type)
670                    .or_insert_with(Vec::new)
671                    .push(rule_match)
672            } else {
673                // There is no match validator for this rule, so mark it as not available.
674                rule_match.match_status.merge(MatchStatus::NotAvailable);
675                validated_rule_matches.push(rule_match);
676            }
677        }
678
679        RAYON_THREAD_POOL.install(|| {
680            use rayon::prelude::*;
681
682            match_validator_rule_match_per_type.par_iter_mut().for_each(
683                |(match_validation_type, matches_per_type)| {
684                    let match_validator = self.match_validators_per_type.get(match_validation_type);
685                    if let Some(match_validator) = match_validator {
686                        match_validator
687                            .as_ref()
688                            .validate(matches_per_type, &self.rules)
689                    }
690                },
691            );
692        });
693
694        // Refill the rule_matches with the validated matches
695        for (_, mut matches) in match_validator_rule_match_per_type {
696            validated_rule_matches.append(&mut matches);
697        }
698
699        // Sort rule_matches by start index
700        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
701        *rule_matches = validated_rule_matches;
702    }
703
704    /// Apply mutations from actions, and shift indices to match the mutated values.
705    /// This assumes the matches are all from the content given, and are sorted by start index.
706    fn apply_match_actions<E: Encoding>(
707        &self,
708        content: &mut String,
709        path: &Path<'static>,
710        rule_matches: &mut [InternalRuleMatch<E>],
711        output_rule_matches: &mut Vec<RuleMatch>,
712        need_match_content: bool,
713    ) {
714        let mut utf8_byte_delta: isize = 0;
715        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
716
717        for rule_match in rule_matches {
718            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
719                content,
720                path.clone(),
721                rule_match,
722                &mut utf8_byte_delta,
723                &mut custom_index_delta,
724                need_match_content,
725            ));
726        }
727    }
728
729    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
730    fn apply_match_actions_for_string<E: Encoding>(
731        &self,
732        content: &mut String,
733        path: Path<'static>,
734        rule_match: &InternalRuleMatch<E>,
735        // The current difference in length between the original and mutated string
736        utf8_byte_delta: &mut isize,
737
738        // The difference between the custom index on the original string and the mutated string
739        custom_index_delta: &mut <E>::IndexShift,
740        need_match_content: bool,
741    ) -> RuleMatch {
742        let rule = &self.rules[rule_match.rule_index];
743
744        let custom_start =
745            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
746                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
747
748        let mut matched_content_copy = None;
749
750        if need_match_content {
751            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
752            let mutated_utf8_match_start =
753                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
754            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
755
756            // Matches for mutating rules must have valid indices
757            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
758            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
759
760            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
761            matched_content_copy = Some(matched_content.to_string());
762        }
763
764        if rule.match_action.is_mutating() {
765            let mutated_utf8_match_start =
766                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
767            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
768
769            // Matches for mutating rules must have valid indices
770            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
771            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
772
773            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
774            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
775                let before_replacement = &matched_content[replacement.start..replacement.end];
776
777                // update indices to match the new mutated content
778                <E>::adjust_shift(
779                    custom_index_delta,
780                    before_replacement,
781                    &replacement.replacement,
782                );
783                *utf8_byte_delta +=
784                    replacement.replacement.len() as isize - before_replacement.len() as isize;
785
786                let replacement_start = mutated_utf8_match_start + replacement.start;
787                let replacement_end = mutated_utf8_match_start + replacement.end;
788                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
789            }
790        }
791
792        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
793        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
794            + shift_offset) as usize;
795
796        let rule = &self.rules[rule_match.rule_index];
797
798        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
799            MatchStatus::NotChecked
800        } else {
801            MatchStatus::NotAvailable
802        };
803
804        RuleMatch {
805            rule_index: rule_match.rule_index,
806            path,
807            replacement_type: rule.match_action.replacement_type(),
808            start_index: custom_start,
809            end_index_exclusive: custom_end,
810            shift_offset,
811            match_value: matched_content_copy,
812            match_status,
813        }
814    }
815
816    fn sort_and_remove_overlapping_rules<E: Encoding>(
817        &self,
818        rule_matches: &mut Vec<InternalRuleMatch<E>>,
819    ) {
820        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
821        // Be very careful if this function is modified.
822
823        rule_matches.sort_unstable_by(|a, b| {
824            // Mutating rules are a higher priority (earlier in the list)
825            let ord = self.rules[a.rule_index]
826                .match_action
827                .is_mutating()
828                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
829                .reverse();
830
831            // Earlier start offset
832            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
833
834            // Longer matches
835            let ord = ord.then(a.len().cmp(&b.len()).reverse());
836
837            // Matches from earlier rules
838            let ord = ord.then(a.rule_index.cmp(&b.rule_index));
839
840            // swap the order of everything so matches can be efficiently popped off the back as they are processed
841            ord.reverse()
842        });
843
844        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
845
846        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
847            if self.rules[rule_match.rule_index].match_action.is_mutating() {
848                // Mutating rules are kept only if they don't overlap with a previous rule.
849                if let Some(last) = retained_rules.last()
850                    && last.utf8_end > rule_match.utf8_start
851                {
852                    continue;
853                }
854            } else {
855                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
856                // this needs to check all retained matches (instead of just the last one)
857                for retained_rule in &retained_rules {
858                    if retained_rule.utf8_start < rule_match.utf8_end
859                        && retained_rule.utf8_end > rule_match.utf8_start
860                    {
861                        continue 'rule_matches;
862                    }
863                }
864            };
865            retained_rules.push(rule_match);
866        }
867
868        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
869        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
870
871        *rule_matches = retained_rules;
872    }
873}
874
875impl Drop for Scanner {
876    fn drop(&mut self) {
877        let stats = &*GLOBAL_STATS;
878        stats.scanner_deletions.increment(1);
879        stats.decrement_total_scanners();
880    }
881}
882
883#[derive(Default)]
884pub struct ScannerBuilder<'a> {
885    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
886    labels: Labels,
887    scanner_features: ScannerFeatures,
888    async_scan_timeout: Duration,
889}
890
891impl ScannerBuilder<'_> {
892    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
893        ScannerBuilder {
894            rules,
895            labels: Labels::empty(),
896            scanner_features: ScannerFeatures::default(),
897            async_scan_timeout: Duration::from_secs(60 * 5),
898        }
899    }
900
901    pub fn labels(mut self, labels: Labels) -> Self {
902        self.labels = labels;
903        self
904    }
905
906    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
907        self.async_scan_timeout = duration;
908        self
909    }
910
911    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
912        self.scanner_features.add_implicit_index_wildcards = value;
913        self
914    }
915
916    pub fn with_return_matches(mut self, value: bool) -> Self {
917        self.scanner_features.return_matches = value;
918        self
919    }
920
921    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
922    /// Multipass V0 saves matches from excluded scopes, and marks any identical
923    /// matches in included scopes as a false positive.
924    pub fn with_multipass_v0(mut self, value: bool) -> Self {
925        self.scanner_features.multipass_v0_enabled = value;
926        self
927    }
928
929    pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
930        self.scanner_features
931            .skip_rules_with_regex_matching_empty_string = value;
932        self
933    }
934
935    pub fn build(self) -> Result<Scanner, CreateScannerError> {
936        let mut match_validators_per_type = AHashMap::new();
937
938        for rule in self.rules.iter() {
939            if let Some(match_validation_type) = &rule.get_third_party_active_checker()
940                && match_validation_type.can_create_match_validator()
941            {
942                let internal_type = match_validation_type.get_internal_match_validation_type();
943                let match_validator = match_validation_type.into_match_validator();
944                if let Ok(match_validator) = match_validator {
945                    if !match_validators_per_type.contains_key(&internal_type) {
946                        match_validators_per_type.insert(internal_type, match_validator);
947                    }
948                } else {
949                    return Err(CreateScannerError::InvalidMatchValidator(
950                        MatchValidatorCreationError::InternalError,
951                    ));
952                }
953            }
954        }
955
956        let compiled_rules = self
957            .rules
958            .iter()
959            .enumerate()
960            .filter_map(|(rule_index, config)| {
961                let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
962                    Ok(inner) => Ok(inner),
963                    Err(err) => {
964                        if self
965                            .scanner_features
966                            .skip_rules_with_regex_matching_empty_string
967                            && err
968                            == CreateScannerError::InvalidRegex(
969                            RegexValidationError::MatchesEmptyString,
970                        )
971                        {
972                            // this is a temporary feature to skip rules that should be considered invalid.
973                            #[allow(clippy::print_stdout)]
974                            {
975                                println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
976                            }
977                            return None;
978                        } else {
979                            Err(err)
980                        }
981                    }
982                };
983                Some((config, inner))
984            })
985            .map(|(config, inner)| {
986                config.match_action.validate()?;
987                let compiled_suppressions = match &config.suppressions {
988                    Some(s) => s.compile()?,
989                    None => None,
990                };
991                Ok(RootCompiledRule {
992                    inner: inner?,
993                    scope: config.scope.clone(),
994                    match_action: config.match_action.clone(),
995                    match_validation_type: config.get_third_party_active_checker().cloned(),
996                    suppressions: compiled_suppressions,
997                })
998            })
999            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1000
1001        let mut per_scanner_data = SharedData::new();
1002
1003        compiled_rules.iter().for_each(|rule| {
1004            rule.init_per_scanner_data(&mut per_scanner_data);
1005        });
1006
1007        let scoped_ruleset = ScopedRuleSet::new(
1008            &compiled_rules
1009                .iter()
1010                .map(|rule| rule.scope.clone())
1011                .collect::<Vec<_>>(),
1012        )
1013        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1014
1015        {
1016            let stats = &*GLOBAL_STATS;
1017            stats.scanner_creations.increment(1);
1018            stats.increment_total_scanners();
1019        }
1020
1021        Ok(Scanner {
1022            rules: compiled_rules,
1023            scoped_ruleset,
1024            scanner_features: self.scanner_features,
1025            metrics: ScannerMetrics::new(&self.labels),
1026            match_validators_per_type,
1027            labels: self.labels,
1028            per_scanner_data,
1029            async_scan_timeout: self.async_scan_timeout,
1030        })
1031    }
1032}
1033
/// Content visitor that runs the scanner's rules against each visited string and collects
/// the resulting matches and pending async jobs.
struct ScannerContentVisitor<'a, E: Encoding> {
    // Scanner whose rules, labels and per-scanner data are used while visiting
    scanner: &'a Scanner,
    // Regex caches handed to every rule through `StringMatchesCtx`
    regex_caches: &'a mut RegexCaches,
    // Receives the synchronous matches found for each visited path
    rule_matches: &'a mut InternalRuleMatchSet<E>,
    // Rules that shall be skipped for this scan
    // This list shall be small (<10), so a linear search is acceptable
    blocked_rules: &'a Vec<usize>,
    // Forwarded to every rule through `StringMatchesCtx`
    // (presumably the matches saved from excluded scopes — TODO confirm)
    excluded_matches: &'a mut AHashSet<String>,
    // Shared data initialized by each rule's `init_per_event_data` and passed to the rules
    per_event_data: SharedData,
    // Looked up by path; the entry for the visited path is passed to the rules as
    // `wildcard_indices`
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
    // Collects the jobs of rules that returned `RuleStatus::Pending`
    async_jobs: &'a mut Vec<PendingRuleJob>,
    // Optional event id, passed to the rules as `event_id`
    event_id: Option<String>,
}
1047
1048impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1049    fn visit_content<'b>(
1050        &'b mut self,
1051        path: &Path<'a>,
1052        content: &str,
1053        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1054        exclusion_check: ExclusionCheck<'b>,
1055    ) -> Result<bool, ScannerError> {
1056        // matches for a single path
1057        let mut path_rules_matches = vec![];
1058
1059        // Create a map of per rule type data that can be shared between rules of the same type
1060        let mut per_string_data = SharedData::new();
1061        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1062
1063        rule_visitor.visit_rule_indices(|rule_index| {
1064            if self.blocked_rules.contains(&rule_index) {
1065                return Ok(());
1066            }
1067            let rule = &self.scanner.rules[rule_index];
1068            {
1069                // creating the emitter is basically free, it will get mostly optimized away
1070                let mut emitter = |rule_match: StringMatch| {
1071                    // This should never happen, but to ensure no empty match is ever generated
1072                    // (which may cause an infinite loop), this will panic instead.
1073                    assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1074                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1075                };
1076
1077                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1078
1079                // TODO: move this somewhere higher?
1080                rule.init_per_event_data(&mut self.per_event_data);
1081
1082                let mut ctx = StringMatchesCtx {
1083                    rule_index,
1084                    regex_caches: self.regex_caches,
1085                    exclusion_check: &exclusion_check,
1086                    excluded_matches: self.excluded_matches,
1087                    match_emitter: &mut emitter,
1088                    wildcard_indices: wildcard_indices_per_path,
1089                    per_string_data: &mut per_string_data,
1090                    per_scanner_data: &self.scanner.per_scanner_data,
1091                    per_event_data: &mut self.per_event_data,
1092                    event_id: self.event_id.as_deref(),
1093                };
1094
1095                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1096
1097                match async_status {
1098                    RuleStatus::Done => {
1099                        // nothing to do
1100                    }
1101                    RuleStatus::Pending(fut) => {
1102                        self.async_jobs.push(PendingRuleJob {
1103                            fut,
1104                            path: path.into_static(),
1105                        });
1106                    }
1107                }
1108            }
1109            Ok(())
1110        })?;
1111
1112        // If there are any matches, the string will need to be accessed to check for false positives from
1113        // excluded matches, any to potentially mutate the string.
1114        // If there are any async jobs, this is also true since it's not known yet whether there
1115        // will be a match
1116        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1117
1118        self.rule_matches
1119            .push_sync_matches(path, path_rules_matches);
1120
1121        Ok(needs_to_access_content)
1122    }
1123}
1124
/// Calculates the next starting position for a regex search when the previous match turned
/// out to be a false positive.
///
/// The result is the byte offset of the first UTF-8 char that follows the char at the start
/// of `regex_match`, or `None` when no further char is left in `content` to scan.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;
    content[match_start..]
        .char_indices()
        // Index 0 is the char the match started on; index 1 is the next valid char.
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1135
1136fn is_false_positive_match(
1137    regex_match_range: (usize, usize),
1138    rule: &RegexCompiledRule,
1139    content: &str,
1140    check_excluded_keywords: bool,
1141) -> bool {
1142    if check_excluded_keywords
1143        && let Some(excluded_keywords) = &rule.excluded_keywords
1144        && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1145    {
1146        return true;
1147    }
1148
1149    if let Some(validator) = rule.validator.as_ref()
1150        && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1151    {
1152        return true;
1153    }
1154    false
1155}