dd_sds/scanner/
mod.rs

1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6    config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7    match_validator::MatchValidator,
8};
9
10use error::{MatchValidationError, MatchValidatorCreationError};
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{access_regex_caches, RegexCaches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
23pub use crate::secondary_validation::Validator;
24use crate::stats::GLOBAL_STATS;
25use crate::tokio::TOKIO_RUNTIME;
26use crate::{
27    CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
28};
29use ahash::{AHashMap, AHashSet};
30use futures::executor::block_on;
31use regex_automata::Match;
32use serde::{Deserialize, Serialize};
33use serde_with::serde_as;
34use std::ops::Deref;
35use std::pin::Pin;
36use std::sync::Arc;
37use std::time::{Duration, Instant};
38use tokio::task::JoinHandle;
39use tokio::time::timeout;
40
41pub mod config;
42pub mod error;
43pub mod metrics;
44pub mod regex_rule;
45pub mod scope;
46pub mod shared_data;
47pub mod shared_pool;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// The location of a single match inside a scanned string, as a `start`/`end` pair.
///
/// NOTE(review): the offsets look like UTF-8 byte offsets with an exclusive `end`
/// (they are later used to build `InternalRuleMatch` values) — confirm against the
/// rule implementations that emit them.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends.
    pub end: usize,
}
58
/// A sink that receives the matches found in a string.
///
/// `T` is the value produced for every emitted match; it defaults to `()`.
pub trait MatchEmitter<T = ()> {
    /// Reports one match to the emitter and returns the emitter's result for it.
    fn emit(&mut self, string_match: StringMatch) -> T;
}
62
63// This implements MatchEmitter for mutable closures (so you can use a closure instead of a custom
64// struct that implements MatchEmitter)
65impl<F, T> MatchEmitter<T> for F
66where
67    F: FnMut(StringMatch) -> T,
68{
69    fn emit(&mut self, string_match: StringMatch) -> T {
70        // This just calls the closure (itself)
71        (self)(string_match)
72    }
73}
74
/// Settings that wrap a rule-specific configuration `T`.
///
/// The rule-specific configuration is stored in `inner` and is flattened into the same
/// serialized object as the other fields.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Action to apply to matches of this rule.
    pub match_action: MatchAction,
    /// Scope the rule applies to. Falls back to `Scope::default()` when absent from the input.
    #[serde(default)]
    pub scope: Scope,
    /// Legacy field, only consulted when `third_party_active_checker` is `None`.
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    /// Match validation configuration; takes precedence over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    /// The rule-specific configuration.
    #[serde(flatten)]
    pub inner: T,
}
87
88impl<T> RootRuleConfig<T>
89where
90    T: RuleConfig + 'static,
91{
92    pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
93        RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
94    }
95
96    pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
97        self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
98    }
99}
100
101impl<T> RootRuleConfig<T> {
102    pub fn new(inner: T) -> Self {
103        #[allow(deprecated)]
104        Self {
105            match_action: MatchAction::None,
106            scope: Scope::all(),
107            match_validation_type: None,
108            third_party_active_checker: None,
109            inner,
110        }
111    }
112
113    pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
114        #[allow(deprecated)]
115        RootRuleConfig {
116            match_action: self.match_action,
117            scope: self.scope,
118            match_validation_type: self.match_validation_type,
119            third_party_active_checker: self.third_party_active_checker,
120            inner: func(self.inner),
121        }
122    }
123
124    pub fn match_action(mut self, action: MatchAction) -> Self {
125        self.match_action = action;
126        self
127    }
128
129    pub fn scope(mut self, scope: Scope) -> Self {
130        self.scope = scope;
131        self
132    }
133
134    pub fn third_party_active_checker(
135        mut self,
136        match_validation_type: MatchValidationType,
137    ) -> Self {
138        self.third_party_active_checker = Some(match_validation_type);
139        self
140    }
141
142    fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
143        #[allow(deprecated)]
144        self.third_party_active_checker
145            .as_ref()
146            .or(self.match_validation_type.as_ref())
147    }
148}
149
150impl<T> Deref for RootRuleConfig<T> {
151    type Target = T;
152
153    fn deref(&self) -> &Self::Target {
154        &self.inner
155    }
156}
/// A compiled rule together with the rule-independent settings it was configured with.
pub struct RootCompiledRule {
    /// The rule-specific compiled rule.
    pub inner: Box<dyn CompiledRule>,
    /// Scope the rule applies to.
    pub scope: Scope,
    /// Action applied to matches of this rule.
    pub match_action: MatchAction,
    /// Match validation configuration, if any.
    pub match_validation_type: Option<MatchValidationType>,
}
163
164impl RootCompiledRule {
165    pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
166        self.match_validation_type
167            .as_ref()
168            .map(|x| x.get_internal_match_validation_type())
169    }
170}
171
172impl Deref for RootCompiledRule {
173    type Target = dyn CompiledRule;
174
175    fn deref(&self) -> &Self::Target {
176        self.inner.as_ref()
177    }
178}
179
/// The context handed to `CompiledRule::get_string_matches` for one rule / string pair.
pub struct StringMatchesCtx<'a> {
    // Index of the rule being evaluated; copied into async jobs by `process_async`.
    rule_index: usize,
    pub regex_caches: &'a mut RegexCaches,
    pub exclusion_check: &'a ExclusionCheck<'a>,
    pub excluded_matches: &'a mut AHashSet<String>,
    // Receives the matches found by the rule.
    pub match_emitter: &'a mut dyn MatchEmitter,
    // NOTE(review): presumably the `(start, end)` wildcard ranges from
    // `ScanOptions::wildcarded_indices` for the current path — confirm at the construction site.
    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,

    // Shared Data
    pub per_string_data: &'a mut SharedData,
    pub per_scanner_data: &'a SharedData,
    pub per_event_data: &'a mut SharedData,
}
193
194impl StringMatchesCtx<'_> {
195    /// If a `get_string_matches` implementation needs to do any async processing (e.g. I/O),
196    /// this function can be used to return an "async job" to find matches. The return value
197    /// of `process_async` should be returned from the `get_string_matches` function. The future
198    /// passed into this function will be spawned and executed immediately without blocking
199    /// other `get_string_matches` calls. This means all the async jobs will run concurrently.
200    ///
201    /// The `ctx` available to async jobs is more restrictive than the normal `ctx` available in
202    /// `get_string_matches`. The only thing you can do is return matches. If other data is needed,
203    /// it should be accessed before `process_async` is called.
204    pub fn process_async(
205        &self,
206        func: impl for<'a> FnOnce(
207                &'a mut AsyncStringMatchesCtx,
208            )
209                -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
210            + Send
211            + 'static,
212    ) -> RuleResult {
213        let rule_index = self.rule_index;
214
215        // The future is spawned onto the tokio runtime immediately so it starts running
216        // in the background
217        let fut = TOKIO_RUNTIME.spawn(async move {
218            let mut ctx = AsyncStringMatchesCtx {
219                rule_matches: vec![],
220            };
221            (func)(&mut ctx).await?;
222
223            Ok(AsyncRuleInfo {
224                rule_index,
225                rule_matches: ctx.rule_matches,
226            })
227        });
228
229        Ok(RuleStatus::Pending(fut))
230    }
231}
232
/// The restricted context available to async jobs started with
/// `StringMatchesCtx::process_async`; it can only collect matches.
pub struct AsyncStringMatchesCtx {
    // Matches emitted so far by the async job.
    rule_matches: Vec<StringMatch>,
}
236
237impl AsyncStringMatchesCtx {
238    pub fn emit_match(&mut self, string_match: StringMatch) {
239        self.rule_matches.push(string_match);
240    }
241}
242
/// The outcome of a `get_string_matches` call.
#[must_use]
pub enum RuleStatus {
    /// No async job was started by the rule.
    Done,
    /// The rule started an async job (see `StringMatchesCtx::process_async`); its result
    /// still has to be collected from the contained handle.
    Pending(PendingRuleResult),
}
248
/// Handle to an async rule job that was spawned on the tokio runtime.
pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
251
/// An async rule job waiting to be collected, along with the path its matches belong to.
pub struct PendingRuleJob {
    // Handle of the spawned job.
    fut: PendingRuleResult,
    // Path the job's matches are recorded under once the job completes.
    path: Path<'static>,
}
256
/// The result of a completed async rule job.
pub struct AsyncRuleInfo {
    // Index of the rule that started the job.
    rule_index: usize,
    // Matches emitted by the job.
    rule_matches: Vec<StringMatch>,
}
261
/// The value returned by `CompiledRule::get_string_matches`: either the status of the rule
/// (done, or pending on an async job) or a scanner error.
pub type RuleResult = Result<RuleStatus, ScannerError>;
264
/// This is the public trait that is used to define the behavior of a compiled rule.
pub trait CompiledRule: Send + Sync {
    /// Hook to initialize data stored in the per-scanner `SharedData`.
    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
        // by default, no per-scanner data is initialized
    }

    /// Hook to initialize data stored in the per-string `SharedData`.
    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
        // by default, no per-string data is initialized
    }

    /// Hook to initialize data stored in the per-event `SharedData`.
    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
        // by default, no per-event data is initialized
    }

    /// Finds the matches of this rule in `content` (the string located at `path`).
    ///
    /// Async implementations return the value produced by `StringMatchesCtx::process_async`.
    fn get_string_matches(
        &self,
        content: &str,
        path: &Path,
        ctx: &mut StringMatchesCtx<'_>,
    ) -> RuleResult;

    /// Whether a match from this rule should be excluded (marked as a false-positive)
    /// if the content of this match was found in a match from an excluded scope
    fn should_exclude_multipass_v0(&self) -> bool {
        // default is to NOT use Multi-pass V0
        false
    }

    /// Called when a match of this rule is dropped as a Multi-pass V0 false-positive.
    fn on_excluded_match_multipass_v0(&self) {
        // default is to do nothing
    }
}
297
298impl<T> RuleConfig for Box<T>
299where
300    T: RuleConfig + ?Sized,
301{
302    fn convert_to_compiled_rule(
303        &self,
304        rule_index: usize,
305        labels: Labels,
306    ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
307        self.as_ref().convert_to_compiled_rule(rule_index, labels)
308    }
309}
310
/// Feature switches of a scanner, set through the scanner builder.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    pub add_implicit_index_wildcards: bool,
    pub multipass_v0_enabled: bool,
    pub return_matches: bool,
    // This is a temporary flag to disable failed rules (instead of fail the entire scanner)
    // for regex rules that match an empty string
    pub skip_rules_with_regex_matching_empty_string: bool,
}

impl Default for ScannerFeatures {
    /// Every feature starts disabled, except Multi-pass V0 which is enabled.
    fn default() -> Self {
        let (enabled, disabled) = (true, false);
        ScannerFeatures {
            multipass_v0_enabled: enabled,
            add_implicit_index_wildcards: disabled,
            return_matches: disabled,
            skip_rules_with_regex_matching_empty_string: disabled,
        }
    }
}
331
/// Per-scan options, passed to `Scanner::scan_with_options` / `Scanner::scan_async_with_options`.
pub struct ScanOptions {
    // The blocked_rules_idx parameter is a list of rule indices that should be skipped for this scan.
    // This list should be small (<10): it is searched linearly, so a larger list will impact performance.
    pub blocked_rules_idx: Vec<usize>,
    // The wildcarded_indices parameter is a map containing a list of tuples of (start, end) indices that should be treated as wildcards (for the message key only) per path.
    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
339
340impl Default for ScanOptions {
341    fn default() -> Self {
342        Self {
343            blocked_rules_idx: vec![],
344            wildcarded_indices: AHashMap::new(),
345        }
346    }
347}
348
/// Builder for `ScanOptions`; its fields mirror the ones of `ScanOptions`.
pub struct ScanOptionBuilder {
    // Rule indices to skip for the scan.
    blocked_rules_idx: Vec<usize>,
    // Per-path `(start, end)` index ranges to treat as wildcards.
    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
}
353
354impl ScanOptionBuilder {
355    pub fn new() -> Self {
356        Self {
357            blocked_rules_idx: vec![],
358            wildcarded_indices: AHashMap::new(),
359        }
360    }
361
362    pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
363        self.blocked_rules_idx = blocked_rules_idx;
364        self
365    }
366
367    pub fn with_wildcarded_indices(
368        mut self,
369        wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
370    ) -> Self {
371        self.wildcarded_indices = wildcarded_indices;
372        self
373    }
374
375    pub fn build(self) -> ScanOptions {
376        ScanOptions {
377            blocked_rules_idx: self.blocked_rules_idx,
378            wildcarded_indices: self.wildcarded_indices,
379        }
380    }
381}
382
/// Scans events with a fixed set of compiled rules. Created through `Scanner::builder`.
pub struct Scanner {
    // Compiled rules; rule matches refer to them by index.
    rules: Vec<RootCompiledRule>,
    // Drives the visit of every string / rule combination of an event.
    scoped_ruleset: ScopedRuleSet,
    scanner_features: ScannerFeatures,
    // Scan duration, number of scanned events and number of matches.
    metrics: ScannerMetrics,
    labels: Labels,
    // At most one validator per internal match validation type, used by `validate_matches`.
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    per_scanner_data: SharedData,
    // Maximum duration of an async scan before it fails with a transient error.
    async_scan_timeout: Duration,
}
393
394impl Scanner {
    /// Returns a `ScannerBuilder` for the given rules, using the builder's default settings.
    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
        ScannerBuilder::new(rules)
    }
398
399    // This function scans the given event with the rules configured in the scanner.
400    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
401    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
402    pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
403        self.scan_with_options(event, ScanOptions::default())
404    }
405
406    // This function scans the given event with the rules configured in the scanner.
407    // The event parameter is a mutable reference to the event that should be scanned (implemented the Event trait).
408    // The return value is a list of RuleMatch objects, which contain information about the matches that were found.
409    pub async fn scan_async<E: Event>(
410        &self,
411        event: &mut E,
412    ) -> Result<Vec<RuleMatch>, ScannerError> {
413        self.scan_async_with_options(event, ScanOptions::default())
414            .await
415    }
416
417    pub fn scan_with_options<E: Event>(
418        &self,
419        event: &mut E,
420        options: ScanOptions,
421    ) -> Result<Vec<RuleMatch>, ScannerError> {
422        block_on(self.internal_scan_with_metrics(event, options))
423    }
424
425    pub async fn scan_async_with_options<E: Event>(
426        &self,
427        event: &mut E,
428        options: ScanOptions,
429    ) -> Result<Vec<RuleMatch>, ScannerError> {
430        let fut = self.internal_scan_with_metrics(event, options);
431
432        // The sleep from the timeout requires being in a tokio context
433        // The guard needs to be dropped before await since the guard is !Send
434        let timeout = {
435            let _tokio_guard = TOKIO_RUNTIME.enter();
436            timeout(self.async_scan_timeout, fut)
437        };
438
439        timeout.await.unwrap_or(Err(ScannerError::Transient(
440            "Async scan timeout".to_string(),
441        )))
442    }
443
444    fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
445        // Record detection time
446        self.metrics
447            .duration_ns
448            .increment(start.elapsed().as_nanos() as u64);
449        // Add number of scanned events
450        self.metrics.num_scanned_events.increment(1);
451        // Add number of matches
452        self.metrics
453            .match_count
454            .increment(output_rule_matches.len() as u64);
455    }
456
457    async fn internal_scan_with_metrics<E: Event>(
458        &self,
459        event: &mut E,
460        options: ScanOptions,
461    ) -> Result<Vec<RuleMatch>, ScannerError> {
462        let start = Instant::now();
463        let result = self.internal_scan(event, options).await;
464        match &result {
465            Ok(rule_matches) => {
466                self.record_metrics(rule_matches, start);
467            }
468            Err(_) => {
469                self.record_metrics(&[], start);
470            }
471        }
472        result
473    }
474
475    async fn internal_scan<E: Event>(
476        &self,
477        event: &mut E,
478        options: ScanOptions,
479    ) -> Result<Vec<RuleMatch>, ScannerError> {
480        // All matches, after some (but not all) false-positives have been removed.
481        let mut rule_matches = InternalRuleMatchSet::new();
482        let mut excluded_matches = AHashSet::new();
483        let mut async_jobs = vec![];
484
485        access_regex_caches(|regex_caches| {
486            self.scoped_ruleset.visit_string_rule_combinations(
487                event,
488                ScannerContentVisitor {
489                    scanner: self,
490                    regex_caches,
491                    rule_matches: &mut rule_matches,
492                    blocked_rules: &options.blocked_rules_idx,
493                    excluded_matches: &mut excluded_matches,
494                    per_event_data: SharedData::new(),
495                    wildcarded_indexes: &options.wildcarded_indices,
496                    async_jobs: &mut async_jobs,
497                },
498            )
499        })?;
500
501        // The async jobs were already spawned on the tokio runtime, so the
502        // results just need to be collected
503        for job in async_jobs {
504            let rule_info = job.fut.await.unwrap()?;
505            rule_matches.push_async_matches(
506                &job.path,
507                rule_info
508                    .rule_matches
509                    .into_iter()
510                    .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
511            );
512        }
513
514        let mut output_rule_matches = vec![];
515
516        for (path, mut rule_matches) in rule_matches.into_iter() {
517            // All rule matches in each inner list are for a single path, so they can be processed independently.
518            event.visit_string_mut(&path, |content| {
519                // calculate_indices requires that matches are sorted by start index
520                rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
521
522                <<E as Event>::Encoding>::calculate_indices(
523                    content,
524                    rule_matches.iter_mut().map(
525                        |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
526                            utf8_start: rule_match.utf8_start,
527                            utf8_end: rule_match.utf8_end,
528                            custom_start: &mut rule_match.custom_start,
529                            custom_end: &mut rule_match.custom_end,
530                        },
531                    ),
532                );
533
534                if self.scanner_features.multipass_v0_enabled {
535                    // Now that the `excluded_matches` set is fully populated, filter out any matches
536                    // that are the same as excluded matches (also known as "Multi-pass V0")
537                    rule_matches.retain(|rule_match| {
538                        if self.rules[rule_match.rule_index]
539                            .inner
540                            .should_exclude_multipass_v0()
541                        {
542                            let is_false_positive = excluded_matches
543                                .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
544                            if is_false_positive && self.scanner_features.multipass_v0_enabled {
545                                self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
546                            }
547                            !is_false_positive
548                        } else {
549                            true
550                        }
551                    });
552                }
553
554                self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
555
556                let will_mutate = rule_matches
557                    .iter()
558                    .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
559
560                self.apply_match_actions(
561                    content,
562                    &path,
563                    &mut rule_matches,
564                    &mut output_rule_matches,
565                );
566
567                will_mutate
568            });
569        }
570
571        Ok(output_rule_matches)
572    }
573
574    pub fn validate_matches(
575        &self,
576        rule_matches: &mut Vec<RuleMatch>,
577    ) -> Result<(), MatchValidationError> {
578        if !self.scanner_features.return_matches {
579            return Err(MatchValidationError::NoMatchValidationType);
580        }
581        // Create MatchValidatorRuleMatch per match_validator_type to pass it to each match_validator
582        let mut match_validator_rule_match_per_type = AHashMap::new();
583
584        let mut validated_rule_matches = vec![];
585
586        for mut rule_match in rule_matches.drain(..) {
587            let rule = &self.rules[rule_match.rule_index];
588            if let Some(match_validation_type) = rule.internal_match_validation_type() {
589                match_validator_rule_match_per_type
590                    .entry(match_validation_type)
591                    .or_insert_with(Vec::new)
592                    .push(rule_match)
593            } else {
594                // There is no match validator for this rule, so mark it as not available.
595                rule_match.match_status.merge(MatchStatus::NotAvailable);
596                validated_rule_matches.push(rule_match);
597            }
598        }
599
600        RAYON_THREAD_POOL.install(|| {
601            use rayon::prelude::*;
602
603            match_validator_rule_match_per_type.par_iter_mut().for_each(
604                |(match_validation_type, matches_per_type)| {
605                    let match_validator = self.match_validators_per_type.get(match_validation_type);
606                    if let Some(match_validator) = match_validator {
607                        match_validator
608                            .as_ref()
609                            .validate(matches_per_type, &self.rules)
610                    }
611                },
612            );
613        });
614
615        // Refill the rule_matches with the validated matches
616        for (_, mut matches) in match_validator_rule_match_per_type {
617            validated_rule_matches.append(&mut matches);
618        }
619
620        // Sort rule_matches by start index
621        validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
622        *rule_matches = validated_rule_matches;
623        Ok(())
624    }
625
626    /// Apply mutations from actions, and shift indices to match the mutated values.
627    /// This assumes the matches are all from the content given, and are sorted by start index.
628    fn apply_match_actions<E: Encoding>(
629        &self,
630        content: &mut String,
631        path: &Path<'static>,
632        rule_matches: &mut [InternalRuleMatch<E>],
633        output_rule_matches: &mut Vec<RuleMatch>,
634    ) {
635        let mut utf8_byte_delta: isize = 0;
636        let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
637
638        for rule_match in rule_matches {
639            output_rule_matches.push(self.apply_match_actions_for_string::<E>(
640                content,
641                path.clone(),
642                rule_match,
643                &mut utf8_byte_delta,
644                &mut custom_index_delta,
645            ));
646        }
647    }
648
649    /// This will be called once for each match of a single string. The rules must be passed in in order of the start index. Mutating rules must not overlap.
650    fn apply_match_actions_for_string<E: Encoding>(
651        &self,
652        content: &mut String,
653        path: Path<'static>,
654        rule_match: &InternalRuleMatch<E>,
655        // The current difference in length between the original and mutated string
656        utf8_byte_delta: &mut isize,
657
658        // The difference between the custom index on the original string and the mutated string
659        custom_index_delta: &mut <E>::IndexShift,
660    ) -> RuleMatch {
661        let rule = &self.rules[rule_match.rule_index];
662
663        let custom_start =
664            (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
665                + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
666
667        let mut matched_content_copy = None;
668
669        if self.scanner_features.return_matches {
670            // This copies part of the is_mutating block but is seperate since can't mix compilation condition and code condition
671            let mutated_utf8_match_start =
672                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
673            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
674
675            // Matches for mutating rules must have valid indices
676            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
677            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
678
679            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
680            matched_content_copy = Some(matched_content.to_string());
681        }
682
683        if rule.match_action.is_mutating() {
684            let mutated_utf8_match_start =
685                (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
686            let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
687
688            // Matches for mutating rules must have valid indices
689            debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
690            debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
691
692            let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
693            if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
694                let before_replacement = &matched_content[replacement.start..replacement.end];
695
696                // update indices to match the new mutated content
697                <E>::adjust_shift(
698                    custom_index_delta,
699                    before_replacement,
700                    &replacement.replacement,
701                );
702                *utf8_byte_delta +=
703                    replacement.replacement.len() as isize - before_replacement.len() as isize;
704
705                let replacement_start = mutated_utf8_match_start + replacement.start;
706                let replacement_end = mutated_utf8_match_start + replacement.end;
707                content.replace_range(replacement_start..replacement_end, &replacement.replacement);
708            }
709        }
710
711        let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
712        let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
713            + shift_offset) as usize;
714
715        let rule = &self.rules[rule_match.rule_index];
716
717        let match_status: MatchStatus = if rule.match_validation_type.is_some() {
718            MatchStatus::NotChecked
719        } else {
720            MatchStatus::NotAvailable
721        };
722
723        RuleMatch {
724            rule_index: rule_match.rule_index,
725            path,
726            replacement_type: rule.match_action.replacement_type(),
727            start_index: custom_start,
728            end_index_exclusive: custom_end,
729            shift_offset,
730            match_value: matched_content_copy,
731            match_status,
732        }
733    }
734
    /// Removes overlapping matches from `rule_matches` and leaves the retained matches sorted
    /// by start index.
    ///
    /// Matches are considered by priority: mutating rules first, then earlier start offset,
    /// then longer match, then lower rule index. A match is dropped when it overlaps a match
    /// that was already retained.
    fn sort_and_remove_overlapping_rules<E: Encoding>(
        &self,
        rule_matches: &mut Vec<InternalRuleMatch<E>>,
    ) {
        // Some of the scanner code relies on the behavior here, such as the sort order and removal of overlapping mutating rules.
        // Be very careful if this function is modified.

        rule_matches.sort_unstable_by(|a, b| {
            // Mutating rules are a higher priority (earlier in the list)
            let ord = self.rules[a.rule_index]
                .match_action
                .is_mutating()
                .cmp(&self.rules[b.rule_index].match_action.is_mutating())
                .reverse();

            // Earlier start offset
            let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));

            // Longer matches
            let ord = ord.then(a.len().cmp(&b.len()).reverse());

            // Matches from earlier rules
            let ord = ord.then(a.rule_index.cmp(&b.rule_index));

            // swap the order of everything so matches can be efficiently popped off the back as they are processed
            ord.reverse()
        });

        let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];

        // Matches are popped from the back, i.e. from the highest priority to the lowest
        'rule_matches: while let Some(rule_match) = rule_matches.pop() {
            if self.rules[rule_match.rule_index].match_action.is_mutating() {
                // Mutating rules are kept only if they don't overlap with a previous rule.
                // Mutating matches are processed first and in start order, so comparing with
                // the last retained match is enough.
                if let Some(last) = retained_rules.last() {
                    if last.utf8_end > rule_match.utf8_start {
                        continue;
                    }
                }
            } else {
                // Only retain if it doesn't overlap with any other rule. Since mutating matches are sorted before non-mutated matches
                // this needs to check all retained matches (instead of just the last one)
                for retained_rule in &retained_rules {
                    if retained_rule.utf8_start < rule_match.utf8_end
                        && retained_rule.utf8_end > rule_match.utf8_start
                    {
                        continue 'rule_matches;
                    }
                }
            };
            retained_rules.push(rule_match);
        }

        // ensure rules are sorted by start index (other parts of the library required this to function correctly)
        retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);

        *rule_matches = retained_rules;
    }
792}
793
794impl Drop for Scanner {
795    fn drop(&mut self) {
796        let stats = &*GLOBAL_STATS;
797        stats.scanner_deletions.increment(1);
798        stats.decrement_total_scanners();
799    }
800}
801
802#[derive(Default)]
803pub struct ScannerBuilder<'a> {
804    rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
805    labels: Labels,
806    scanner_features: ScannerFeatures,
807    async_scan_timeout: Duration,
808}
809
810impl ScannerBuilder<'_> {
811    pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
812        ScannerBuilder {
813            rules,
814            labels: Labels::empty(),
815            scanner_features: ScannerFeatures::default(),
816            async_scan_timeout: Duration::from_secs(60),
817        }
818    }
819
820    pub fn labels(mut self, labels: Labels) -> Self {
821        self.labels = labels;
822        self
823    }
824
825    pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
826        self.async_scan_timeout = duration;
827        self
828    }
829
830    pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
831        self.scanner_features.add_implicit_index_wildcards = value;
832        self
833    }
834
835    pub fn with_return_matches(mut self, value: bool) -> Self {
836        self.scanner_features.return_matches = value;
837        self
838    }
839
840    /// Enables/Disables the Multipass V0 feature. This defaults to TRUE.
841    /// Multipass V0 saves matches from excluded scopes, and marks any identical
842    /// matches in included scopes as a false positive.
843    pub fn with_multipass_v0(mut self, value: bool) -> Self {
844        self.scanner_features.multipass_v0_enabled = value;
845        self
846    }
847
848    pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
849        self.scanner_features
850            .skip_rules_with_regex_matching_empty_string = value;
851        self
852    }
853
854    pub fn build(self) -> Result<Scanner, CreateScannerError> {
855        let mut match_validators_per_type = AHashMap::new();
856
857        for rule in self.rules.iter() {
858            if let Some(match_validation_type) = &rule.get_third_party_active_checker() {
859                if match_validation_type.can_create_match_validator() {
860                    let internal_type = match_validation_type.get_internal_match_validation_type();
861                    let match_validator = match_validation_type.into_match_validator();
862                    if let Ok(match_validator) = match_validator {
863                        if !match_validators_per_type.contains_key(&internal_type) {
864                            match_validators_per_type.insert(internal_type, match_validator);
865                        }
866                    } else {
867                        return Err(CreateScannerError::InvalidMatchValidator(
868                            MatchValidatorCreationError::InternalError,
869                        ));
870                    }
871                }
872            }
873        }
874
875        let compiled_rules = self
876            .rules
877            .iter()
878            .enumerate()
879            .filter_map(|(rule_index, config)| {
880                let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
881                    Ok(inner) => Ok(inner),
882                    Err(err) => {
883                        if self
884                            .scanner_features
885                            .skip_rules_with_regex_matching_empty_string
886                            && err
887                            == CreateScannerError::InvalidRegex(
888                            RegexValidationError::MatchesEmptyString,
889                        )
890                        {
891                            // this is a temporary feature to skip rules that should be considered invalid.
892                            #[allow(clippy::print_stdout)]
893                            {
894                                println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
895                            }
896                            return None;
897                        } else {
898                            Err(err)
899                        }
900                    }
901                };
902                Some((config, inner))
903            })
904            .map(|(config, inner)| {
905                config.match_action.validate()?;
906                Ok(RootCompiledRule {
907                    inner: inner?,
908                    scope: config.scope.clone(),
909                    match_action: config.match_action.clone(),
910                    match_validation_type: config.get_third_party_active_checker().cloned(),
911                })
912            })
913            .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
914
915        let mut per_scanner_data = SharedData::new();
916
917        compiled_rules.iter().for_each(|rule| {
918            rule.init_per_scanner_data(&mut per_scanner_data);
919        });
920
921        let scoped_ruleset = ScopedRuleSet::new(
922            &compiled_rules
923                .iter()
924                .map(|rule| rule.scope.clone())
925                .collect::<Vec<_>>(),
926        )
927        .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
928
929        {
930            let stats = &*GLOBAL_STATS;
931            stats.scanner_creations.increment(1);
932            stats.increment_total_scanners();
933        }
934
935        Ok(Scanner {
936            rules: compiled_rules,
937            scoped_ruleset,
938            scanner_features: self.scanner_features,
939            metrics: ScannerMetrics::new(&self.labels),
940            match_validators_per_type,
941            labels: self.labels,
942            per_scanner_data,
943            async_scan_timeout: self.async_scan_timeout,
944        })
945    }
946}
947
948struct ScannerContentVisitor<'a, E: Encoding> {
949    scanner: &'a Scanner,
950    regex_caches: &'a mut RegexCaches,
951    rule_matches: &'a mut InternalRuleMatchSet<E>,
952    // Rules that shall be skipped for this scan
953    // This list shall be small (<10), so a linear search is acceptable
954    blocked_rules: &'a Vec<usize>,
955    excluded_matches: &'a mut AHashSet<String>,
956    per_event_data: SharedData,
957    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
958    async_jobs: &'a mut Vec<PendingRuleJob>,
959}
960
961impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
962    fn visit_content<'b>(
963        &'b mut self,
964        path: &Path<'a>,
965        content: &str,
966        mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
967        exclusion_check: ExclusionCheck<'b>,
968    ) -> Result<bool, ScannerError> {
969        // matches for a single path
970        let mut path_rules_matches = vec![];
971
972        // Create a map of per rule type data that can be shared between rules of the same type
973        let mut per_string_data = SharedData::new();
974        let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
975
976        rule_visitor.visit_rule_indices(|rule_index| {
977            if self.blocked_rules.contains(&rule_index) {
978                return Ok(());
979            }
980            let rule = &self.scanner.rules[rule_index];
981            {
982                // creating the emitter is basically free, it will get mostly optimized away
983                let mut emitter = |rule_match: StringMatch| {
984                    // This should never happen, but to ensure no empty match is ever generated
985                    // (which may cause an infinite loop), this will panic instead.
986                    assert_ne!(rule_match.start, rule_match.end, "empty match detected");
987                    path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
988                };
989
990                rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
991
992                // TODO: move this somewhere higher?
993                rule.init_per_event_data(&mut self.per_event_data);
994
995                let mut ctx = StringMatchesCtx {
996                    rule_index,
997                    regex_caches: self.regex_caches,
998                    exclusion_check: &exclusion_check,
999                    excluded_matches: self.excluded_matches,
1000                    match_emitter: &mut emitter,
1001                    wildcard_indices: wildcard_indices_per_path,
1002                    per_string_data: &mut per_string_data,
1003                    per_scanner_data: &self.scanner.per_scanner_data,
1004                    per_event_data: &mut self.per_event_data,
1005                };
1006
1007                let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1008
1009                match async_status {
1010                    RuleStatus::Done => {
1011                        // nothing to do
1012                    }
1013                    RuleStatus::Pending(fut) => {
1014                        self.async_jobs.push(PendingRuleJob {
1015                            fut,
1016                            path: path.into_static(),
1017                        });
1018                    }
1019                }
1020            }
1021            Ok(())
1022        })?;
1023
1024        // If there are any matches, the string will need to be accessed to check for false positives from
1025        // excluded matches, any to potentially mutate the string.
1026        // If there are any async jobs, this is also true since it's not known yet whether there
1027        // will be a match
1028        let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1029
1030        self.rule_matches
1031            .push_sync_matches(path, path_rules_matches);
1032
1033        Ok(needs_to_access_content)
1034    }
1035}
1036
1037// Calculates the next starting position for a regex match if a the previous match is a false positive
1038fn get_next_regex_start(content: &str, regex_match: &Match) -> Option<usize> {
1039    // The next valid UTF8 char after the start of the regex match is used
1040    if let Some((i, _)) = content[regex_match.start()..].char_indices().nth(1) {
1041        Some(regex_match.start() + i)
1042    } else {
1043        // There are no more chars left in the string to scan
1044        None
1045    }
1046}
1047
1048fn is_false_positive_match(
1049    regex_match: &Match,
1050    rule: &RegexCompiledRule,
1051    content: &str,
1052    check_excluded_keywords: bool,
1053) -> bool {
1054    if check_excluded_keywords {
1055        if let Some(excluded_keywords) = &rule.excluded_keywords {
1056            if excluded_keywords.is_false_positive_match(content, regex_match.start()) {
1057                return true;
1058            }
1059        }
1060    }
1061
1062    if let Some(validator) = rule.validator.as_ref() {
1063        if !validator.is_valid_match(&content[regex_match.range()]) {
1064            return true;
1065        };
1066    }
1067    false
1068}