1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::{AHashMap, AHashSet};
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// A span reported by a rule inside a scanned string.
///
/// NOTE(review): the scanner later slices the content with these offsets, so
/// they are presumably UTF-8 byte offsets with `end` exclusive — confirm
/// against `InternalRuleMatch::new`.
#[derive(Clone, Copy)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends.
    pub end: usize,
}
58
59pub trait MatchEmitter<T = ()> {
60 fn emit(&mut self, string_match: StringMatch) -> T;
61}
62
63impl<F, T> MatchEmitter<T> for F
66where
67 F: FnMut(StringMatch) -> T,
68{
69 fn emit(&mut self, string_match: StringMatch) -> T {
70 (self)(string_match)
72 }
73}
74
75#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy)]
84pub enum Precedence {
85 Catchall,
86 Generic,
87 Specific,
88}
89
90impl Default for Precedence {
91 fn default() -> Self {
92 Self::Specific
93 }
94}
95
96#[serde_as]
97#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
98pub struct RootRuleConfig<T> {
99 pub match_action: MatchAction,
100 #[serde(default)]
101 pub scope: Scope,
102 #[deprecated(note = "Use `third_party_active_checker` instead")]
103 match_validation_type: Option<MatchValidationType>,
104 third_party_active_checker: Option<MatchValidationType>,
105 suppressions: Option<Suppressions>,
106 #[serde(default)]
107 precedence: Precedence,
108 #[serde(flatten)]
109 pub inner: T,
110}
111
112impl<T> RootRuleConfig<T>
113where
114 T: RuleConfig + 'static,
115{
116 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
117 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
118 }
119
120 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
121 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
122 }
123}
124
125impl<T> RootRuleConfig<T> {
126 pub fn new(inner: T) -> Self {
127 #[allow(deprecated)]
128 Self {
129 match_action: MatchAction::None,
130 scope: Scope::all(),
131 match_validation_type: None,
132 third_party_active_checker: None,
133 suppressions: None,
134 precedence: Precedence::default(),
135 inner,
136 }
137 }
138
139 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
140 #[allow(deprecated)]
141 RootRuleConfig {
142 match_action: self.match_action,
143 scope: self.scope,
144 match_validation_type: self.match_validation_type,
145 third_party_active_checker: self.third_party_active_checker,
146 suppressions: self.suppressions,
147 precedence: self.precedence,
148 inner: func(self.inner),
149 }
150 }
151
152 pub fn match_action(mut self, action: MatchAction) -> Self {
153 self.match_action = action;
154 self
155 }
156
157 pub fn precedence(mut self, precedence: Precedence) -> Self {
158 self.precedence = precedence;
159 self
160 }
161
162 pub fn scope(mut self, scope: Scope) -> Self {
163 self.scope = scope;
164 self
165 }
166
167 pub fn third_party_active_checker(
168 mut self,
169 match_validation_type: MatchValidationType,
170 ) -> Self {
171 self.third_party_active_checker = Some(match_validation_type);
172 self
173 }
174
175 pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
176 self.suppressions = Some(suppressions);
177 self
178 }
179
180 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
181 #[allow(deprecated)]
182 self.third_party_active_checker
183 .as_ref()
184 .or(self.match_validation_type.as_ref())
185 }
186}
187
188impl<T> Deref for RootRuleConfig<T> {
189 type Target = T;
190
191 fn deref(&self) -> &Self::Target {
192 &self.inner
193 }
194}
195pub struct RootCompiledRule {
196 pub inner: Box<dyn CompiledRule>,
197 pub scope: Scope,
198 pub match_action: MatchAction,
199 pub match_validation_type: Option<MatchValidationType>,
200 pub suppressions: Option<CompiledSuppressions>,
201 pub precedence: Precedence,
202}
203
204impl RootCompiledRule {
205 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
206 self.match_validation_type
207 .as_ref()
208 .map(|x| x.get_internal_match_validation_type())
209 }
210}
211
212impl Deref for RootCompiledRule {
213 type Target = dyn CompiledRule;
214
215 fn deref(&self) -> &Self::Target {
216 self.inner.as_ref()
217 }
218}
219
220pub struct StringMatchesCtx<'a> {
221 rule_index: usize,
222 pub regex_caches: &'a mut RegexCaches,
223 pub exclusion_check: &'a ExclusionCheck<'a>,
224 pub excluded_matches: &'a mut AHashSet<String>,
225 pub match_emitter: &'a mut dyn MatchEmitter,
226 pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
227
228 pub per_string_data: &'a mut SharedData,
230 pub per_scanner_data: &'a SharedData,
231 pub per_event_data: &'a mut SharedData,
232 pub event_id: Option<&'a str>,
233}
234
235impl StringMatchesCtx<'_> {
236 pub fn process_async(
246 &self,
247 func: impl for<'a> FnOnce(
248 &'a mut AsyncStringMatchesCtx,
249 )
250 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
251 + Send
252 + 'static,
253 ) -> RuleResult {
254 let rule_index = self.rule_index;
255
256 let fut = TOKIO_RUNTIME.spawn(async move {
259 let start = Instant::now();
260 let mut ctx = AsyncStringMatchesCtx {
261 rule_matches: vec![],
262 };
263 (func)(&mut ctx).await?;
264 let io_duration = start.elapsed();
265
266 Ok(AsyncRuleInfo {
267 rule_index,
268 rule_matches: ctx.rule_matches,
269 io_duration,
270 })
271 });
272
273 Ok(RuleStatus::Pending(fut))
274 }
275}
276
277pub struct AsyncStringMatchesCtx {
278 rule_matches: Vec<StringMatch>,
279}
280
281impl AsyncStringMatchesCtx {
282 pub fn emit_match(&mut self, string_match: StringMatch) {
283 self.rule_matches.push(string_match);
284 }
285}
286
287#[must_use]
288pub enum RuleStatus {
289 Done,
290 Pending(PendingRuleResult),
291}
292
293pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
295
296pub struct PendingRuleJob {
297 fut: PendingRuleResult,
298 path: Path<'static>,
299}
300
301pub struct AsyncRuleInfo {
302 rule_index: usize,
303 rule_matches: Vec<StringMatch>,
304 io_duration: Duration,
305}
306
307pub type RuleResult = Result<RuleStatus, ScannerError>;
309
310pub trait CompiledRule: Send + Sync {
312 fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
313 }
315
316 fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
317 }
319
320 fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
321 }
323
324 fn get_string_matches(
325 &self,
326 content: &str,
327 path: &Path,
328 ctx: &mut StringMatchesCtx<'_>,
329 ) -> RuleResult;
330
331 fn should_exclude_multipass_v0(&self) -> bool {
334 false
336 }
337
338 fn on_excluded_match_multipass_v0(&self) {
339 }
341
342 fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
343 None
344 }
345
346 fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
347 None
348 }
349
350 fn allow_scanner_to_exclude_namespace(&self) -> bool {
351 true
352 }
353}
354
355impl<T> RuleConfig for Box<T>
356where
357 T: RuleConfig + ?Sized,
358{
359 fn convert_to_compiled_rule(
360 &self,
361 rule_index: usize,
362 labels: Labels,
363 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
364 self.as_ref().convert_to_compiled_rule(rule_index, labels)
365 }
366}
367
/// Feature switches chosen when a scanner is built.
#[derive(Clone, Debug, PartialEq)]
struct ScannerFeatures {
    /// Forwarded to the scoped rule set's implicit index wildcard setting.
    pub add_implicit_index_wildcards: bool,
    /// Enables dropping matches whose text is in the excluded-matches set.
    pub multipass_v0_enabled: bool,
    /// When set, the matched text is copied into each returned match.
    pub return_matches: bool,
}
374
375impl Default for ScannerFeatures {
376 fn default() -> Self {
377 Self {
378 add_implicit_index_wildcards: false,
379 multipass_v0_enabled: true,
380 return_matches: false,
381 }
382 }
383}
384
385pub struct ScanOptions {
386 pub blocked_rules_idx: Vec<usize>,
389 pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
391 pub validate_matches: bool,
394}
395
396impl Default for ScanOptions {
397 fn default() -> Self {
398 Self {
399 blocked_rules_idx: vec![],
400 wildcarded_indices: AHashMap::new(),
401 validate_matches: false,
402 }
403 }
404}
405
406pub struct ScanOptionBuilder {
407 blocked_rules_idx: Vec<usize>,
408 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
409 validate_matches: bool,
410}
411
412impl ScanOptionBuilder {
413 pub fn new() -> Self {
414 Self {
415 blocked_rules_idx: vec![],
416 wildcarded_indices: AHashMap::new(),
417 validate_matches: false,
418 }
419 }
420
421 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
422 self.blocked_rules_idx = blocked_rules_idx;
423 self
424 }
425
426 pub fn with_wildcarded_indices(
427 mut self,
428 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
429 ) -> Self {
430 self.wildcarded_indices = wildcarded_indices;
431 self
432 }
433
434 pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
435 self.validate_matches = validate_matches;
436 self
437 }
438
439 pub fn build(self) -> ScanOptions {
440 ScanOptions {
441 blocked_rules_idx: self.blocked_rules_idx,
442 wildcarded_indices: self.wildcarded_indices,
443 validate_matches: self.validate_matches,
444 }
445 }
446}
447
448pub struct Scanner {
449 rules: Vec<RootCompiledRule>,
450 scoped_ruleset: ScopedRuleSet,
451 scanner_features: ScannerFeatures,
452 metrics: ScannerMetrics,
453 labels: Labels,
454 match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
455 per_scanner_data: SharedData,
456 async_scan_timeout: Duration,
457}
458
459impl Scanner {
460 pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
461 ScannerBuilder::new(rules)
462 }
463
464 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
469 self.scan_with_options(event, ScanOptions::default())
470 }
471
472 pub fn scan_with_options<E: Event>(
477 &self,
478 event: &mut E,
479 options: ScanOptions,
480 ) -> Result<Vec<RuleMatch>, ScannerError> {
481 block_on(self.internal_scan_with_metrics(event, options))
482 }
483
484 pub async fn scan_async<E: Event>(
488 &self,
489 event: &mut E,
490 ) -> Result<Vec<RuleMatch>, ScannerError> {
491 self.scan_async_with_options(event, ScanOptions::default())
492 .await
493 }
494
495 pub async fn scan_async_with_options<E: Event>(
496 &self,
497 event: &mut E,
498 options: ScanOptions,
499 ) -> Result<Vec<RuleMatch>, ScannerError> {
500 let fut = self.internal_scan_with_metrics(event, options);
501
502 let timeout = {
505 let _tokio_guard = TOKIO_RUNTIME.enter();
506 timeout(self.async_scan_timeout, fut)
507 };
508
509 timeout.await.unwrap_or(Err(ScannerError::Transient(
510 "Async scan timeout".to_string(),
511 )))
512 }
513
514 fn record_metrics(
515 &self,
516 output_rule_matches: &[RuleMatch],
517 start: Instant,
518 io_duration: Option<Duration>,
519 ) {
520 self.metrics.num_scanned_events.increment(1);
522 self.metrics
524 .match_count
525 .increment(output_rule_matches.len() as u64);
526
527 if let Some(io_duration) = io_duration {
528 let total_duration = start.elapsed();
529 let cpu_duration = total_duration.saturating_sub(io_duration);
530 self.metrics
531 .cpu_duration
532 .increment(cpu_duration.as_nanos() as u64);
533 }
534 }
535
536 async fn internal_scan_with_metrics<E: Event>(
537 &self,
538 event: &mut E,
539 options: ScanOptions,
540 ) -> Result<Vec<RuleMatch>, ScannerError> {
541 let start = Instant::now();
542 let result = self.internal_scan(event, options).await;
543 match result {
544 Ok((rule_matches, io_duration)) => {
545 self.record_metrics(&rule_matches, start, Some(io_duration));
546 Ok(rule_matches)
547 }
548 Err(e) => {
549 self.record_metrics(&[], start, None);
550 Err(e)
551 }
552 }
553 }
554
555 fn process_rule_matches<E: Event>(
556 &self,
557 event: &mut E,
558 rule_matches: InternalRuleMatchSet<E::Encoding>,
559 excluded_matches: AHashSet<String>,
560 output_rule_matches: &mut Vec<RuleMatch>,
561 need_match_content: bool,
562 ) {
563 if rule_matches.is_empty() {
564 return;
565 }
566 access_regex_caches(|regex_caches| {
567 for (path, mut rule_matches) in rule_matches.into_iter() {
568 event.visit_string_mut(&path, |content| {
570 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
572
573 <<E as Event>::Encoding>::calculate_indices(
574 content,
575 rule_matches.iter_mut().map(
576 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
577 utf8_start: rule_match.utf8_start,
578 utf8_end: rule_match.utf8_end,
579 custom_start: &mut rule_match.custom_start,
580 custom_end: &mut rule_match.custom_end,
581 },
582 ),
583 );
584
585 if self.scanner_features.multipass_v0_enabled {
586 rule_matches.retain(|rule_match| {
589 if self.rules[rule_match.rule_index]
590 .inner
591 .should_exclude_multipass_v0()
592 {
593 let is_false_positive = excluded_matches
594 .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
595 if is_false_positive && self.scanner_features.multipass_v0_enabled {
596 self.rules[rule_match.rule_index]
597 .on_excluded_match_multipass_v0();
598 }
599 !is_false_positive
600 } else {
601 true
602 }
603 });
604 }
605
606 self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
607
608 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
609
610 let will_mutate = rule_matches.iter().any(|rule_match| {
611 self.rules[rule_match.rule_index].match_action.is_mutating()
612 });
613
614 self.apply_match_actions(
615 content,
616 &path,
617 &mut rule_matches,
618 output_rule_matches,
619 need_match_content,
620 );
621
622 will_mutate
623 });
624 }
625 });
626 }
627
628 async fn internal_scan<E: Event>(
629 &self,
630 event: &mut E,
631 options: ScanOptions,
632 ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
633 let need_match_content = self.scanner_features.return_matches || options.validate_matches;
636 let mut rule_matches = InternalRuleMatchSet::new();
638 let mut excluded_matches = AHashSet::new();
639 let mut async_jobs = vec![];
640
641 access_regex_caches(|regex_caches| {
642 self.scoped_ruleset.visit_string_rule_combinations(
643 event,
644 ScannerContentVisitor {
645 scanner: self,
646 regex_caches,
647 rule_matches: &mut rule_matches,
648 blocked_rules: &options.blocked_rules_idx,
649 excluded_matches: &mut excluded_matches,
650 per_event_data: SharedData::new(),
651 wildcarded_indexes: &options.wildcarded_indices,
652 async_jobs: &mut async_jobs,
653 event_id: event.get_id().map(|s| s.to_string()),
654 },
655 )
656 })?;
657
658 let mut total_io_duration = Duration::ZERO;
661 for job in async_jobs {
662 let rule_info = job.fut.await.unwrap()?;
663 total_io_duration += rule_info.io_duration;
664 rule_matches.push_async_matches(
665 &job.path,
666 rule_info
667 .rule_matches
668 .into_iter()
669 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
670 );
671 }
672
673 let mut output_rule_matches = vec![];
674
675 self.process_rule_matches(
676 event,
677 rule_matches,
678 excluded_matches,
679 &mut output_rule_matches,
680 need_match_content,
681 );
682
683 if options.validate_matches {
684 self.validate_matches(&mut output_rule_matches);
685 }
686
687 Ok((output_rule_matches, total_io_duration))
688 }
689
690 pub fn suppress_matches<E: Encoding>(
691 &self,
692 rule_matches: &mut Vec<InternalRuleMatch<E>>,
693 content: &str,
694 regex_caches: &mut RegexCaches,
695 ) {
696 rule_matches.retain(|rule_match| {
697 if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
698 let match_should_be_suppressed = suppressions.should_match_be_suppressed(
699 &content[rule_match.utf8_start..rule_match.utf8_end],
700 regex_caches,
701 );
702
703 if match_should_be_suppressed {
704 self.metrics.suppressed_match_count.increment(1);
705 }
706 !match_should_be_suppressed
707 } else {
708 true
709 }
710 });
711 }
712
713 pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
714 let mut match_validator_rule_match_per_type = AHashMap::new();
716
717 let mut validated_rule_matches = vec![];
718
719 for mut rule_match in rule_matches.drain(..) {
720 let rule = &self.rules[rule_match.rule_index];
721 if let Some(match_validation_type) = rule.internal_match_validation_type() {
722 match_validator_rule_match_per_type
723 .entry(match_validation_type)
724 .or_insert_with(Vec::new)
725 .push(rule_match)
726 } else {
727 rule_match.match_status.merge(MatchStatus::NotAvailable);
729 validated_rule_matches.push(rule_match);
730 }
731 }
732
733 RAYON_THREAD_POOL.install(|| {
734 use rayon::prelude::*;
735
736 match_validator_rule_match_per_type.par_iter_mut().for_each(
737 |(match_validation_type, matches_per_type)| {
738 let match_validator = self.match_validators_per_type.get(match_validation_type);
739 if let Some(match_validator) = match_validator {
740 match_validator
741 .as_ref()
742 .validate(matches_per_type, &self.rules)
743 }
744 },
745 );
746 });
747
748 for (_, mut matches) in match_validator_rule_match_per_type {
750 validated_rule_matches.append(&mut matches);
751 }
752
753 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
755 *rule_matches = validated_rule_matches;
756 }
757
758 fn apply_match_actions<E: Encoding>(
761 &self,
762 content: &mut String,
763 path: &Path<'static>,
764 rule_matches: &mut [InternalRuleMatch<E>],
765 output_rule_matches: &mut Vec<RuleMatch>,
766 need_match_content: bool,
767 ) {
768 let mut utf8_byte_delta: isize = 0;
769 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
770
771 for rule_match in rule_matches {
772 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
773 content,
774 path.clone(),
775 rule_match,
776 &mut utf8_byte_delta,
777 &mut custom_index_delta,
778 need_match_content,
779 ));
780 }
781 }
782
783 fn apply_match_actions_for_string<E: Encoding>(
785 &self,
786 content: &mut String,
787 path: Path<'static>,
788 rule_match: &InternalRuleMatch<E>,
789 utf8_byte_delta: &mut isize,
791
792 custom_index_delta: &mut <E>::IndexShift,
794 need_match_content: bool,
795 ) -> RuleMatch {
796 let rule = &self.rules[rule_match.rule_index];
797
798 let custom_start =
799 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
800 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
801
802 let mut matched_content_copy = None;
803
804 if need_match_content {
805 let mutated_utf8_match_start =
807 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
808 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
809
810 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
812 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
813
814 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
815 matched_content_copy = Some(matched_content.to_string());
816 }
817
818 if rule.match_action.is_mutating() {
819 let mutated_utf8_match_start =
820 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
821 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
822
823 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
825 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
826
827 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
828 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
829 let before_replacement = &matched_content[replacement.start..replacement.end];
830
831 <E>::adjust_shift(
833 custom_index_delta,
834 before_replacement,
835 &replacement.replacement,
836 );
837 *utf8_byte_delta +=
838 replacement.replacement.len() as isize - before_replacement.len() as isize;
839
840 let replacement_start = mutated_utf8_match_start + replacement.start;
841 let replacement_end = mutated_utf8_match_start + replacement.end;
842 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
843 }
844 }
845
846 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
847 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
848 + shift_offset) as usize;
849
850 let rule = &self.rules[rule_match.rule_index];
851
852 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
853 MatchStatus::NotChecked
854 } else {
855 MatchStatus::NotAvailable
856 };
857
858 RuleMatch {
859 rule_index: rule_match.rule_index,
860 path,
861 replacement_type: rule.match_action.replacement_type(),
862 start_index: custom_start,
863 end_index_exclusive: custom_end,
864 shift_offset,
865 match_value: matched_content_copy,
866 match_status,
867 }
868 }
869
870 fn sort_and_remove_overlapping_rules<E: Encoding>(
871 &self,
872 rule_matches: &mut Vec<InternalRuleMatch<E>>,
873 ) {
874 rule_matches.sort_unstable_by(|a, b| {
878 let ord = self.rules[a.rule_index]
880 .match_action
881 .is_mutating()
882 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
883 .reverse();
884
885 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
887
888 let ord = ord.then(a.len().cmp(&b.len()).reverse());
890
891 let ord = ord.then(
893 self.rules[a.rule_index]
894 .precedence
895 .cmp(&self.rules[b.rule_index].precedence)
896 .reverse(),
897 );
898
899 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
901
902 ord.reverse()
904 });
905
906 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
907
908 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
909 if self.rules[rule_match.rule_index].match_action.is_mutating() {
910 if let Some(last) = retained_rules.last()
912 && last.utf8_end > rule_match.utf8_start
913 {
914 continue;
915 }
916 } else {
917 for retained_rule in &retained_rules {
920 if retained_rule.utf8_start < rule_match.utf8_end
921 && retained_rule.utf8_end > rule_match.utf8_start
922 {
923 continue 'rule_matches;
924 }
925 }
926 };
927 retained_rules.push(rule_match);
928 }
929
930 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
932
933 *rule_matches = retained_rules;
934 }
935}
936
937impl Drop for Scanner {
938 fn drop(&mut self) {
939 let stats = &*GLOBAL_STATS;
940 stats.scanner_deletions.increment(1);
941 stats.decrement_total_scanners();
942 }
943}
944
945#[derive(Default)]
946pub struct ScannerBuilder<'a> {
947 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
948 labels: Labels,
949 scanner_features: ScannerFeatures,
950 async_scan_timeout: Duration,
951}
952
953impl ScannerBuilder<'_> {
954 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
955 ScannerBuilder {
956 rules,
957 labels: Labels::empty(),
958 scanner_features: ScannerFeatures::default(),
959 async_scan_timeout: Duration::from_secs(60 * 5),
960 }
961 }
962
963 pub fn labels(mut self, labels: Labels) -> Self {
964 self.labels = labels;
965 self
966 }
967
968 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
969 self.async_scan_timeout = duration;
970 self
971 }
972
973 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
974 self.scanner_features.add_implicit_index_wildcards = value;
975 self
976 }
977
978 pub fn with_return_matches(mut self, value: bool) -> Self {
979 self.scanner_features.return_matches = value;
980 self
981 }
982
983 pub fn with_multipass_v0(mut self, value: bool) -> Self {
987 self.scanner_features.multipass_v0_enabled = value;
988 self
989 }
990
991 pub fn build(self) -> Result<Scanner, CreateScannerError> {
992 let mut match_validators_per_type = AHashMap::new();
993
994 for rule in self.rules.iter() {
995 if let Some(match_validation_type) = &rule.get_third_party_active_checker()
996 && match_validation_type.can_create_match_validator()
997 {
998 let internal_type = match_validation_type.get_internal_match_validation_type();
999 let match_validator = match_validation_type.into_match_validator();
1000 if let Ok(match_validator) = match_validator {
1001 if !match_validators_per_type.contains_key(&internal_type) {
1002 match_validators_per_type.insert(internal_type, match_validator);
1003 }
1004 } else {
1005 return Err(CreateScannerError::InvalidMatchValidator(
1006 MatchValidatorCreationError::InternalError,
1007 ));
1008 }
1009 }
1010 }
1011
1012 let compiled_rules = self
1013 .rules
1014 .iter()
1015 .enumerate()
1016 .map(|(rule_index, config)| {
1017 let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1018 config.match_action.validate()?;
1019 let compiled_suppressions = match &config.suppressions {
1020 Some(s) => s.compile()?,
1021 None => None,
1022 };
1023 Ok(RootCompiledRule {
1024 inner,
1025 scope: config.scope.clone(),
1026 match_action: config.match_action.clone(),
1027 match_validation_type: config.get_third_party_active_checker().cloned(),
1028 suppressions: compiled_suppressions,
1029 precedence: config.precedence,
1030 })
1031 })
1032 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1033
1034 let mut per_scanner_data = SharedData::new();
1035
1036 compiled_rules.iter().for_each(|rule| {
1037 rule.init_per_scanner_data(&mut per_scanner_data);
1038 });
1039
1040 let scoped_ruleset = ScopedRuleSet::new(
1041 &compiled_rules
1042 .iter()
1043 .map(|rule| rule.scope.clone())
1044 .collect::<Vec<_>>(),
1045 )
1046 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1047
1048 {
1049 let stats = &*GLOBAL_STATS;
1050 stats.scanner_creations.increment(1);
1051 stats.increment_total_scanners();
1052 }
1053
1054 Ok(Scanner {
1055 rules: compiled_rules,
1056 scoped_ruleset,
1057 scanner_features: self.scanner_features,
1058 metrics: ScannerMetrics::new(&self.labels),
1059 match_validators_per_type,
1060 labels: self.labels,
1061 per_scanner_data,
1062 async_scan_timeout: self.async_scan_timeout,
1063 })
1064 }
1065}
1066
1067struct ScannerContentVisitor<'a, E: Encoding> {
1068 scanner: &'a Scanner,
1069 regex_caches: &'a mut RegexCaches,
1070 rule_matches: &'a mut InternalRuleMatchSet<E>,
1071 blocked_rules: &'a Vec<usize>,
1074 excluded_matches: &'a mut AHashSet<String>,
1075 per_event_data: SharedData,
1076 wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1077 async_jobs: &'a mut Vec<PendingRuleJob>,
1078 event_id: Option<String>,
1079}
1080
1081impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1082 fn visit_content<'b>(
1083 &'b mut self,
1084 path: &Path<'a>,
1085 content: &str,
1086 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1087 exclusion_check: ExclusionCheck<'b>,
1088 ) -> Result<bool, ScannerError> {
1089 let mut path_rules_matches = vec![];
1091
1092 let mut per_string_data = SharedData::new();
1094 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1095
1096 rule_visitor.visit_rule_indices(|rule_index| {
1097 if self.blocked_rules.contains(&rule_index) {
1098 return Ok(());
1099 }
1100 let rule = &self.scanner.rules[rule_index];
1101 {
1102 if rule.inner.allow_scanner_to_exclude_namespace() {
1103 if exclusion_check.is_excluded(rule_index) {
1105 return Ok(());
1106 }
1107 }
1108 let mut emitter = |rule_match: StringMatch| {
1110 assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1113 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1114 };
1115
1116 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1117
1118 rule.init_per_event_data(&mut self.per_event_data);
1120
1121 let mut ctx = StringMatchesCtx {
1122 rule_index,
1123 regex_caches: self.regex_caches,
1124 exclusion_check: &exclusion_check,
1125 excluded_matches: self.excluded_matches,
1126 match_emitter: &mut emitter,
1127 wildcard_indices: wildcard_indices_per_path,
1128 per_string_data: &mut per_string_data,
1129 per_scanner_data: &self.scanner.per_scanner_data,
1130 per_event_data: &mut self.per_event_data,
1131 event_id: self.event_id.as_deref(),
1132 };
1133
1134 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1135
1136 match async_status {
1137 RuleStatus::Done => {
1138 }
1140 RuleStatus::Pending(fut) => {
1141 self.async_jobs.push(PendingRuleJob {
1142 fut,
1143 path: path.into_static(),
1144 });
1145 }
1146 }
1147 }
1148 Ok(())
1149 })?;
1150
1151 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1156
1157 self.rule_matches
1158 .push_sync_matches(path, path_rules_matches);
1159
1160 Ok(needs_to_access_content)
1161 }
1162}
1163
/// Returns the byte offset of the character following the one at which
/// `regex_match` starts, or `None` when the match starts on the last
/// character of `content` (or `content` is exhausted).
///
/// Only the start of `regex_match` is used.
///
/// # Panics
///
/// Panics if the match start is past the end of `content` or not on a
/// character boundary.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;
    content[match_start..]
        .char_indices()
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1174
1175fn is_false_positive_match(
1176 regex_match_range: (usize, usize),
1177 rule: &RegexCompiledRule,
1178 content: &str,
1179 check_excluded_keywords: bool,
1180) -> bool {
1181 if check_excluded_keywords
1182 && let Some(excluded_keywords) = &rule.excluded_keywords
1183 && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1184 {
1185 return true;
1186 }
1187
1188 if let Some(validator) = rule.validator.as_ref()
1189 && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1190 {
1191 return true;
1192 }
1193 false
1194}