1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::AHashMap;
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// One match reported by a rule for a scanned string.
///
/// `start` and `end` delimit the matched range (presumably UTF-8 byte offsets into the
/// scanned content, with `end` exclusive — confirm against the rule implementations).
#[derive(Clone)]
pub struct StringMatch {
    /// Offset where the match begins.
    pub start: usize,
    /// Offset where the match ends.
    pub end: usize,
    /// Keyword attached to this match, when the rule provides one.
    pub keyword: Option<String>,
}
60
61pub trait MatchEmitter<T = ()> {
62 fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65impl<F, T> MatchEmitter<T> for F
68where
69 F: FnMut(StringMatch) -> T,
70{
71 fn emit(&mut self, string_match: StringMatch) -> T {
72 (self)(string_match)
74 }
75}
76
77#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Default)]
86pub enum Precedence {
87 Catchall,
88 Generic,
89 #[default]
90 Specific,
91}
92
93#[serde_as]
94#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
95pub struct RootRuleConfig<T> {
96 pub match_action: MatchAction,
97 #[serde(default)]
98 pub scope: Scope,
99 #[deprecated(note = "Use `third_party_active_checker` instead")]
100 match_validation_type: Option<MatchValidationType>,
101 third_party_active_checker: Option<MatchValidationType>,
102 suppressions: Option<Suppressions>,
103 #[serde(default)]
104 precedence: Precedence,
105 #[serde(default)]
106 pub is_supporting_rule: bool,
107 #[serde(flatten)]
108 pub inner: T,
109}
110
111impl<T> RootRuleConfig<T>
112where
113 T: RuleConfig + 'static,
114{
115 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
116 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
117 }
118
119 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
120 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
121 }
122}
123
124impl<T> RootRuleConfig<T> {
125 pub fn new(inner: T) -> Self {
126 #[allow(deprecated)]
127 Self {
128 match_action: MatchAction::None,
129 scope: Scope::all(),
130 match_validation_type: None,
131 third_party_active_checker: None,
132 suppressions: None,
133 precedence: Precedence::default(),
134 is_supporting_rule: false,
135 inner,
136 }
137 }
138
139 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
140 #[allow(deprecated)]
141 RootRuleConfig {
142 match_action: self.match_action,
143 scope: self.scope,
144 match_validation_type: self.match_validation_type,
145 third_party_active_checker: self.third_party_active_checker,
146 suppressions: self.suppressions,
147 precedence: self.precedence,
148 is_supporting_rule: self.is_supporting_rule,
149 inner: func(self.inner),
150 }
151 }
152
153 pub fn match_action(mut self, action: MatchAction) -> Self {
154 self.match_action = action;
155 self
156 }
157
158 pub fn precedence(mut self, precedence: Precedence) -> Self {
159 self.precedence = precedence;
160 self
161 }
162
163 pub fn scope(mut self, scope: Scope) -> Self {
164 self.scope = scope;
165 self
166 }
167
168 pub fn third_party_active_checker(
169 mut self,
170 match_validation_type: MatchValidationType,
171 ) -> Self {
172 self.third_party_active_checker = Some(match_validation_type);
173 self
174 }
175
176 pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
177 self.suppressions = Some(suppressions);
178 self
179 }
180
181 pub fn is_supporting_rule(mut self, value: bool) -> Self {
182 self.is_supporting_rule = value;
183 self
184 }
185
186 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
187 #[allow(deprecated)]
188 self.third_party_active_checker
189 .as_ref()
190 .or(self.match_validation_type.as_ref())
191 }
192}
193
194impl<T> Deref for RootRuleConfig<T> {
195 type Target = T;
196
197 fn deref(&self) -> &Self::Target {
198 &self.inner
199 }
200}
201pub struct RootCompiledRule {
202 pub inner: Box<dyn CompiledRule>,
203 pub scope: Scope,
204 pub match_action: MatchAction,
205 pub match_validation_type: Option<MatchValidationType>,
206 pub suppressions: Option<CompiledSuppressions>,
207 pub precedence: Precedence,
208 pub is_supporting_rule: bool,
209}
210
211impl RootCompiledRule {
212 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
213 self.match_validation_type
214 .as_ref()
215 .map(|x| x.get_internal_match_validation_type())
216 }
217}
218
219impl Deref for RootCompiledRule {
220 type Target = dyn CompiledRule;
221
222 fn deref(&self) -> &Self::Target {
223 self.inner.as_ref()
224 }
225}
226
227pub struct StringMatchesCtx<'a> {
228 rule_index: usize,
229 pub regex_caches: &'a mut RegexCaches,
230 pub exclusion_check: &'a ExclusionCheck<'a>,
231 pub excluded_matches: &'a mut AHashMap<String, String>,
232 pub match_emitter: &'a mut dyn MatchEmitter,
233 pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
234 pub enable_debug_observability: bool,
235
236 pub per_string_data: &'a mut SharedData,
238 pub per_scanner_data: &'a SharedData,
239 pub per_event_data: &'a mut SharedData,
240 pub event_id: Option<&'a str>,
241}
242
243impl StringMatchesCtx<'_> {
244 pub fn process_async(
254 &self,
255 func: impl for<'a> FnOnce(
256 &'a mut AsyncStringMatchesCtx,
257 )
258 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
259 + Send
260 + 'static,
261 ) -> RuleResult {
262 let rule_index = self.rule_index;
263
264 let fut = TOKIO_RUNTIME.spawn(async move {
267 let start = Instant::now();
268 let mut ctx = AsyncStringMatchesCtx {
269 rule_matches: vec![],
270 };
271 (func)(&mut ctx).await?;
272 let io_duration = start.elapsed();
273
274 Ok(AsyncRuleInfo {
275 rule_index,
276 rule_matches: ctx.rule_matches,
277 io_duration,
278 })
279 });
280
281 Ok(RuleStatus::Pending(fut))
282 }
283}
284
285pub struct AsyncStringMatchesCtx {
286 rule_matches: Vec<StringMatch>,
287}
288
289impl AsyncStringMatchesCtx {
290 pub fn emit_match(&mut self, string_match: StringMatch) {
291 self.rule_matches.push(string_match);
292 }
293}
294
295#[must_use]
296pub enum RuleStatus {
297 Done,
298 Pending(PendingRuleResult),
299}
300
301pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
303
304pub struct PendingRuleJob {
305 fut: PendingRuleResult,
306 path: Path<'static>,
307}
308
309pub struct AsyncRuleInfo {
310 rule_index: usize,
311 rule_matches: Vec<StringMatch>,
312 io_duration: Duration,
313}
314
315pub type RuleResult = Result<RuleStatus, ScannerError>;
317
318pub trait CompiledRule: Send + Sync {
320 fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
321 }
323
324 fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
325 }
327
328 fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
329 }
331
332 fn get_string_matches(
333 &self,
334 content: &str,
335 path: &Path,
336 ctx: &mut StringMatchesCtx<'_>,
337 ) -> RuleResult;
338
339 fn should_exclude_multipass_v0(&self) -> bool {
342 false
344 }
345
346 fn on_excluded_match_multipass_v0(
347 &self,
348 _path: &Path,
349 _excluded_path: &str,
350 _enable_debug_observability: bool,
351 ) {
352 }
354
355 fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
356 None
357 }
358
359 fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
360 None
361 }
362
363 fn allow_scanner_to_exclude_namespace(&self) -> bool {
364 true
365 }
366}
367
368impl<T> RuleConfig for Box<T>
369where
370 T: RuleConfig + ?Sized,
371{
372 fn convert_to_compiled_rule(
373 &self,
374 rule_index: usize,
375 labels: Labels,
376 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
377 self.as_ref().convert_to_compiled_rule(rule_index, labels)
378 }
379}
380
/// Feature toggles chosen when a scanner is built.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Passed to the scoped ruleset to enable implicit index wildcards.
    pub add_implicit_index_wildcards: bool,
    /// Enables the multipass v0 exclusion step during match processing.
    pub multipass_v0_enabled: bool,
    /// When set, returned matches carry a copy of the matched text.
    pub return_matches: bool,
    /// Forwarded to rules as the debug-observability flag.
    pub enable_debug_observability: bool,
}
388
389impl Default for ScannerFeatures {
390 fn default() -> Self {
391 Self {
392 add_implicit_index_wildcards: false,
393 multipass_v0_enabled: true,
394 return_matches: false,
395 enable_debug_observability: false,
396 }
397 }
398}
399
400pub struct ScanOptions {
401 pub blocked_rules_idx: Vec<usize>,
404 pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
406 pub validate_matches: bool,
409}
410
411impl Default for ScanOptions {
412 fn default() -> Self {
413 Self {
414 blocked_rules_idx: vec![],
415 wildcarded_indices: AHashMap::new(),
416 validate_matches: false,
417 }
418 }
419}
420
421pub struct ScanOptionBuilder {
422 blocked_rules_idx: Vec<usize>,
423 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
424 validate_matches: bool,
425}
426
427impl ScanOptionBuilder {
428 pub fn new() -> Self {
429 Self {
430 blocked_rules_idx: vec![],
431 wildcarded_indices: AHashMap::new(),
432 validate_matches: false,
433 }
434 }
435
436 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
437 self.blocked_rules_idx = blocked_rules_idx;
438 self
439 }
440
441 pub fn with_wildcarded_indices(
442 mut self,
443 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
444 ) -> Self {
445 self.wildcarded_indices = wildcarded_indices;
446 self
447 }
448
449 pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
450 self.validate_matches = validate_matches;
451 self
452 }
453
454 pub fn build(self) -> ScanOptions {
455 ScanOptions {
456 blocked_rules_idx: self.blocked_rules_idx,
457 wildcarded_indices: self.wildcarded_indices,
458 validate_matches: self.validate_matches,
459 }
460 }
461}
462
463pub struct Scanner {
464 rules: Vec<RootCompiledRule>,
465 scoped_ruleset: ScopedRuleSet,
466 scanner_features: ScannerFeatures,
467 metrics: ScannerMetrics,
468 labels: Labels,
469 match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
470 per_scanner_data: SharedData,
471 async_scan_timeout: Duration,
472}
473
474impl Scanner {
475 pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
476 ScannerBuilder::new(rules)
477 }
478
479 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
484 self.scan_with_options(event, ScanOptions::default())
485 }
486
487 pub fn scan_with_options<E: Event>(
492 &self,
493 event: &mut E,
494 options: ScanOptions,
495 ) -> Result<Vec<RuleMatch>, ScannerError> {
496 let start = Instant::now();
497 let validate = options.validate_matches;
498 let result = block_on(self.internal_scan_collect(event, options));
501 match result {
502 Ok((mut rule_matches, io_duration)) => {
503 self.finalize_matches(&mut rule_matches, validate);
504 self.record_metrics(&rule_matches, start, Some(io_duration));
505 Ok(rule_matches)
506 }
507 Err(e) => {
508 self.record_metrics(&[], start, None);
509 Err(e)
510 }
511 }
512 }
513
514 pub async fn scan_async<E: Event>(
518 &self,
519 event: &mut E,
520 ) -> Result<Vec<RuleMatch>, ScannerError> {
521 self.scan_async_with_options(event, ScanOptions::default())
522 .await
523 }
524
525 pub async fn scan_async_with_options<E: Event>(
526 &self,
527 event: &mut E,
528 options: ScanOptions,
529 ) -> Result<Vec<RuleMatch>, ScannerError> {
530 let start = Instant::now();
531 let validate = options.validate_matches;
532 let fut = self.internal_scan_collect(event, options);
533
534 let timeout_result = {
537 let _tokio_guard = TOKIO_RUNTIME.enter();
538 timeout(self.async_scan_timeout, fut)
539 };
540
541 let result = timeout_result.await.unwrap_or(Err(ScannerError::Transient(
542 "Async scan timeout".to_string(),
543 )));
544
545 match result {
546 Ok((mut rule_matches, io_duration)) => {
547 self.finalize_matches(&mut rule_matches, validate);
548 self.record_metrics(&rule_matches, start, Some(io_duration));
549 Ok(rule_matches)
550 }
551 Err(e) => {
552 self.record_metrics(&[], start, None);
553 Err(e)
554 }
555 }
556 }
557
558 fn record_metrics(
559 &self,
560 output_rule_matches: &[RuleMatch],
561 start: Instant,
562 io_duration: Option<Duration>,
563 ) {
564 self.metrics.num_scanned_events.increment(1);
566 self.metrics
568 .match_count
569 .increment(output_rule_matches.len() as u64);
570
571 if let Some(io_duration) = io_duration {
572 let total_duration = start.elapsed();
573 let cpu_duration = total_duration.saturating_sub(io_duration);
574 self.metrics
575 .cpu_duration
576 .increment(cpu_duration.as_nanos() as u64);
577 }
578 }
579
580 fn process_rule_matches<E: Event>(
581 &self,
582 event: &mut E,
583 rule_matches: InternalRuleMatchSet<E::Encoding>,
584 excluded_matches: AHashMap<String, String>,
585 output_rule_matches: &mut Vec<RuleMatch>,
586 need_match_content: bool,
587 ) {
588 if rule_matches.is_empty() {
589 return;
590 }
591 access_regex_caches(|regex_caches| {
592 for (path, mut rule_matches) in rule_matches.into_iter() {
593 event.visit_string_mut(&path, |content| {
595 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
597
598 <<E as Event>::Encoding>::calculate_indices(
599 content,
600 rule_matches.iter_mut().map(
601 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
602 utf8_start: rule_match.utf8_start,
603 utf8_end: rule_match.utf8_end,
604 custom_start: &mut rule_match.custom_start,
605 custom_end: &mut rule_match.custom_end,
606 },
607 ),
608 );
609
610 if self.scanner_features.multipass_v0_enabled {
611 rule_matches.retain(|rule_match| {
614 if self.rules[rule_match.rule_index]
615 .inner
616 .should_exclude_multipass_v0()
617 {
618 let match_content =
619 &content[rule_match.utf8_start..rule_match.utf8_end];
620 let excluded_path = excluded_matches.get(match_content);
621 if let Some(excluded_path) = excluded_path {
622 self.rules[rule_match.rule_index]
623 .on_excluded_match_multipass_v0(
624 &path,
625 excluded_path,
626 self.scanner_features.enable_debug_observability,
627 );
628 }
629 excluded_path.is_none()
630 } else {
631 true
632 }
633 });
634 }
635
636 self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
637
638 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
639
640 let will_mutate = rule_matches.iter().any(|rule_match| {
641 self.rules[rule_match.rule_index].match_action.is_mutating()
642 });
643
644 self.apply_match_actions(
645 content,
646 &path,
647 rule_matches,
648 output_rule_matches,
649 need_match_content,
650 );
651
652 will_mutate
653 });
654 }
655 });
656 }
657
658 async fn internal_scan_collect<E: Event>(
659 &self,
660 event: &mut E,
661 options: ScanOptions,
662 ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
663 let need_match_content = self.scanner_features.return_matches || options.validate_matches;
666 let mut rule_matches = InternalRuleMatchSet::new();
668 let mut excluded_matches = AHashMap::new();
669 let mut async_jobs = vec![];
670
671 access_regex_caches(|regex_caches| {
672 self.scoped_ruleset.visit_string_rule_combinations(
673 event,
674 ScannerContentVisitor {
675 scanner: self,
676 regex_caches,
677 rule_matches: &mut rule_matches,
678 blocked_rules: &options.blocked_rules_idx,
679 excluded_matches: &mut excluded_matches,
680 per_event_data: SharedData::new(),
681 wildcarded_indexes: &options.wildcarded_indices,
682 async_jobs: &mut async_jobs,
683 event_id: event.get_id().map(|s| s.to_string()),
684 },
685 )
686 })?;
687
688 let mut total_io_duration = Duration::ZERO;
691 for job in async_jobs {
692 let rule_info = job.fut.await.unwrap()?;
693 total_io_duration += rule_info.io_duration;
694 rule_matches.push_async_matches(
695 &job.path,
696 rule_info
697 .rule_matches
698 .into_iter()
699 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
700 );
701 }
702
703 let mut output_rule_matches = vec![];
704
705 self.process_rule_matches(
706 event,
707 rule_matches,
708 excluded_matches,
709 &mut output_rule_matches,
710 need_match_content,
711 );
712
713 Ok((output_rule_matches, total_io_duration))
714 }
715
716 pub fn suppress_matches<E: Encoding>(
717 &self,
718 rule_matches: &mut Vec<InternalRuleMatch<E>>,
719 content: &str,
720 regex_caches: &mut RegexCaches,
721 ) {
722 rule_matches.retain(|rule_match| {
723 if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
724 let match_should_be_suppressed = suppressions.should_match_be_suppressed(
725 &content[rule_match.utf8_start..rule_match.utf8_end],
726 regex_caches,
727 );
728
729 if match_should_be_suppressed {
730 self.metrics.suppressed_match_count.increment(1);
731 }
732 !match_should_be_suppressed
733 } else {
734 true
735 }
736 });
737 }
738
739 pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
740 let mut match_validator_rule_match_per_type = AHashMap::new();
742
743 let mut validated_rule_matches = vec![];
744
745 for mut rule_match in rule_matches.drain(..) {
746 let rule = &self.rules[rule_match.rule_index];
747 if let Some(match_validation_type) = rule.internal_match_validation_type() {
748 match_validator_rule_match_per_type
749 .entry(match_validation_type)
750 .or_insert_with(Vec::new)
751 .push(rule_match)
752 } else {
753 rule_match.match_status.merge(MatchStatus::NotAvailable);
755 validated_rule_matches.push(rule_match);
756 }
757 }
758
759 RAYON_THREAD_POOL.install(|| {
760 use rayon::prelude::*;
761
762 match_validator_rule_match_per_type.par_iter_mut().for_each(
763 |(match_validation_type, matches_per_type)| {
764 let match_validator = self.match_validators_per_type.get(match_validation_type);
765 if let Some(match_validator) = match_validator {
766 match_validator
767 .as_ref()
768 .validate(matches_per_type, &self.rules)
769 }
770 },
771 );
772 });
773
774 for (_, mut matches) in match_validator_rule_match_per_type {
776 validated_rule_matches.append(&mut matches);
777 }
778
779 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
781 *rule_matches = validated_rule_matches;
782 }
783
784 fn finalize_matches(&self, rule_matches: &mut Vec<RuleMatch>, validate: bool) {
789 if validate {
790 self.validate_matches(rule_matches);
791 }
792 rule_matches.retain(|rule_match| !self.rules[rule_match.rule_index].is_supporting_rule);
796 }
797
798 fn apply_match_actions<E: Encoding>(
801 &self,
802 content: &mut String,
803 path: &Path<'static>,
804 rule_matches: Vec<InternalRuleMatch<E>>,
805 output_rule_matches: &mut Vec<RuleMatch>,
806 need_match_content: bool,
807 ) {
808 let mut utf8_byte_delta: isize = 0;
809 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
810
811 for rule_match in rule_matches {
812 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
813 content,
814 path.clone(),
815 rule_match,
816 &mut utf8_byte_delta,
817 &mut custom_index_delta,
818 need_match_content,
819 ));
820 }
821 }
822
823 fn apply_match_actions_for_string<E: Encoding>(
825 &self,
826 content: &mut String,
827 path: Path<'static>,
828 rule_match: InternalRuleMatch<E>,
829 utf8_byte_delta: &mut isize,
831
832 custom_index_delta: &mut <E>::IndexShift,
834 need_match_content: bool,
835 ) -> RuleMatch {
836 let rule = &self.rules[rule_match.rule_index];
837
838 let custom_start =
839 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
840 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
841
842 let mut matched_content_copy = None;
843
844 if need_match_content {
845 let mutated_utf8_match_start =
847 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
848 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
849
850 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
852 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
853
854 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
855 matched_content_copy = Some(matched_content.to_string());
856 }
857
858 if rule.match_action.is_mutating() {
859 let mutated_utf8_match_start =
860 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
861 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
862
863 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
865 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
866
867 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
868 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
869 let before_replacement = &matched_content[replacement.start..replacement.end];
870
871 <E>::adjust_shift(
873 custom_index_delta,
874 before_replacement,
875 &replacement.replacement,
876 );
877 *utf8_byte_delta +=
878 replacement.replacement.len() as isize - before_replacement.len() as isize;
879
880 let replacement_start = mutated_utf8_match_start + replacement.start;
881 let replacement_end = mutated_utf8_match_start + replacement.end;
882 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
883 }
884 }
885
886 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
887 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
888 + shift_offset) as usize;
889
890 let rule = &self.rules[rule_match.rule_index];
891
892 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
893 MatchStatus::NotChecked
894 } else {
895 MatchStatus::NotAvailable
896 };
897
898 RuleMatch {
899 rule_index: rule_match.rule_index,
900 path,
901 replacement_type: rule.match_action.replacement_type(),
902 start_index: custom_start,
903 end_index_exclusive: custom_end,
904 shift_offset,
905 match_value: matched_content_copy,
906 match_status,
907 keyword: rule_match.keyword,
908 }
909 }
910
911 fn sort_and_remove_overlapping_rules<E: Encoding>(
912 &self,
913 rule_matches: &mut Vec<InternalRuleMatch<E>>,
914 ) {
915 rule_matches.sort_unstable_by(|a, b| {
919 let ord = self.rules[a.rule_index]
921 .match_action
922 .is_mutating()
923 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
924 .reverse();
925
926 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
928
929 let ord = ord.then(a.len().cmp(&b.len()).reverse());
931
932 let ord = ord.then(
934 self.rules[a.rule_index]
935 .precedence
936 .cmp(&self.rules[b.rule_index].precedence)
937 .reverse(),
938 );
939
940 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
942
943 ord.reverse()
945 });
946
947 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
948
949 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
950 if self.rules[rule_match.rule_index].match_action.is_mutating() {
951 if let Some(last) = retained_rules.last()
953 && last.utf8_end > rule_match.utf8_start
954 {
955 continue;
956 }
957 } else {
958 for retained_rule in &retained_rules {
961 if retained_rule.utf8_start < rule_match.utf8_end
962 && retained_rule.utf8_end > rule_match.utf8_start
963 {
964 continue 'rule_matches;
965 }
966 }
967 };
968 retained_rules.push(rule_match);
969 }
970
971 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
973
974 *rule_matches = retained_rules;
975 }
976}
977
978impl Drop for Scanner {
979 fn drop(&mut self) {
980 let stats = &*GLOBAL_STATS;
981 stats.scanner_deletions.increment(1);
982 stats.decrement_total_scanners();
983 }
984}
985
986#[derive(Default)]
987pub struct ScannerBuilder<'a> {
988 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
989 labels: Labels,
990 scanner_features: ScannerFeatures,
991 async_scan_timeout: Duration,
992}
993
994impl ScannerBuilder<'_> {
995 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
996 ScannerBuilder {
997 rules,
998 labels: Labels::empty(),
999 scanner_features: ScannerFeatures::default(),
1000 async_scan_timeout: Duration::from_secs(60 * 5),
1001 }
1002 }
1003
1004 pub fn labels(mut self, labels: Labels) -> Self {
1005 self.labels = labels;
1006 self
1007 }
1008
1009 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
1010 self.async_scan_timeout = duration;
1011 self
1012 }
1013
1014 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
1015 self.scanner_features.add_implicit_index_wildcards = value;
1016 self
1017 }
1018
1019 pub fn with_return_matches(mut self, value: bool) -> Self {
1020 self.scanner_features.return_matches = value;
1021 self
1022 }
1023
1024 pub fn with_multipass_v0(mut self, value: bool) -> Self {
1028 self.scanner_features.multipass_v0_enabled = value;
1029 self
1030 }
1031
1032 pub fn with_debug_observability(mut self, value: bool) -> Self {
1036 self.scanner_features.enable_debug_observability = value;
1037 self
1038 }
1039
1040 pub fn build(self) -> Result<Scanner, CreateScannerError> {
1041 let mut match_validators_per_type = AHashMap::new();
1042
1043 for rule in self.rules.iter() {
1044 if let Some(match_validation_type) = &rule.get_third_party_active_checker()
1045 && match_validation_type.can_create_match_validator()
1046 {
1047 let internal_type = match_validation_type.get_internal_match_validation_type();
1048 let match_validator = match_validation_type.into_match_validator();
1049 if let Ok(match_validator) = match_validator {
1050 if !match_validators_per_type.contains_key(&internal_type) {
1051 match_validators_per_type.insert(internal_type, match_validator);
1052 }
1053 } else {
1054 return Err(CreateScannerError::InvalidMatchValidator(
1055 MatchValidatorCreationError::InternalError,
1056 ));
1057 }
1058 }
1059 }
1060
1061 let compiled_rules = self
1062 .rules
1063 .iter()
1064 .enumerate()
1065 .map(|(rule_index, config)| {
1066 if config.is_supporting_rule && config.match_action != MatchAction::None {
1067 return Err(CreateScannerError::SupportingRuleHasMatchAction);
1068 }
1069 let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1070 config.match_action.validate()?;
1071 let compiled_suppressions = match &config.suppressions {
1072 Some(s) => s.compile()?,
1073 None => None,
1074 };
1075 Ok(RootCompiledRule {
1076 inner,
1077 scope: config.scope.clone(),
1078 match_action: config.match_action.clone(),
1079 match_validation_type: config.get_third_party_active_checker().cloned(),
1080 suppressions: compiled_suppressions,
1081 precedence: config.precedence,
1082 is_supporting_rule: config.is_supporting_rule,
1083 })
1084 })
1085 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1086
1087 let mut per_scanner_data = SharedData::new();
1088
1089 compiled_rules.iter().for_each(|rule| {
1090 rule.init_per_scanner_data(&mut per_scanner_data);
1091 });
1092
1093 let scoped_ruleset = ScopedRuleSet::new(
1094 &compiled_rules
1095 .iter()
1096 .map(|rule| rule.scope.clone())
1097 .collect::<Vec<_>>(),
1098 )
1099 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1100
1101 {
1102 let stats = &*GLOBAL_STATS;
1103 stats.scanner_creations.increment(1);
1104 stats.increment_total_scanners();
1105 }
1106
1107 Ok(Scanner {
1108 rules: compiled_rules,
1109 scoped_ruleset,
1110 scanner_features: self.scanner_features,
1111 metrics: ScannerMetrics::new(&self.labels),
1112 match_validators_per_type,
1113 labels: self.labels,
1114 per_scanner_data,
1115 async_scan_timeout: self.async_scan_timeout,
1116 })
1117 }
1118}
1119
1120struct ScannerContentVisitor<'a, E: Encoding> {
1121 scanner: &'a Scanner,
1122 regex_caches: &'a mut RegexCaches,
1123 rule_matches: &'a mut InternalRuleMatchSet<E>,
1124 blocked_rules: &'a Vec<usize>,
1127 excluded_matches: &'a mut AHashMap<String, String>,
1128 per_event_data: SharedData,
1129 wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1130 async_jobs: &'a mut Vec<PendingRuleJob>,
1131 event_id: Option<String>,
1132}
1133
1134impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1135 fn visit_content<'b>(
1136 &'b mut self,
1137 path: &Path<'a>,
1138 content: &str,
1139 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1140 exclusion_check: ExclusionCheck<'b>,
1141 ) -> Result<bool, ScannerError> {
1142 let mut path_rules_matches = vec![];
1144
1145 let mut per_string_data = SharedData::new();
1147 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1148
1149 rule_visitor.visit_rule_indices(|rule_index| {
1150 if self.blocked_rules.contains(&rule_index) {
1151 return Ok(());
1152 }
1153 let rule = &self.scanner.rules[rule_index];
1154 {
1155 if rule.inner.allow_scanner_to_exclude_namespace() {
1156 if exclusion_check.is_excluded(rule_index) {
1158 return Ok(());
1159 }
1160 }
1161 let mut emitter = |rule_match: StringMatch| {
1163 assert_ne!(
1166 rule_match.start, rule_match.end,
1167 "empty match detected on rule with index {rule_index}"
1168 );
1169 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1170 };
1171
1172 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1173
1174 rule.init_per_event_data(&mut self.per_event_data);
1176
1177 let mut ctx = StringMatchesCtx {
1178 rule_index,
1179 regex_caches: self.regex_caches,
1180 exclusion_check: &exclusion_check,
1181 excluded_matches: self.excluded_matches,
1182 match_emitter: &mut emitter,
1183 wildcard_indices: wildcard_indices_per_path,
1184 enable_debug_observability: self
1185 .scanner
1186 .scanner_features
1187 .enable_debug_observability,
1188 per_string_data: &mut per_string_data,
1189 per_scanner_data: &self.scanner.per_scanner_data,
1190 per_event_data: &mut self.per_event_data,
1191 event_id: self.event_id.as_deref(),
1192 };
1193
1194 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1195
1196 match async_status {
1197 RuleStatus::Done => {
1198 }
1200 RuleStatus::Pending(fut) => {
1201 self.async_jobs.push(PendingRuleJob {
1202 fut,
1203 path: path.into_static(),
1204 });
1205 }
1206 }
1207 }
1208 Ok(())
1209 })?;
1210
1211 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1216
1217 self.rule_matches
1218 .push_sync_matches(path, path_rules_matches);
1219
1220 Ok(needs_to_access_content)
1221 }
1222}
1223
/// Returns the byte offset of the character that follows the first character of a
/// match starting at `regex_match.0`, or `None` when no character follows it.
///
/// # Panics
/// Panics if `regex_match.0` is past the end of `content` or not on a character
/// boundary.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;
    content[match_start..]
        .char_indices()
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1234
1235fn is_false_positive_match(
1236 regex_match_range: (usize, usize),
1237 rule: &RegexCompiledRule,
1238 content: &str,
1239 check_excluded_keywords: bool,
1240) -> bool {
1241 if check_excluded_keywords
1242 && let Some(excluded_keywords) = &rule.excluded_keywords
1243 && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1244 {
1245 return true;
1246 }
1247
1248 if let Some(validator) = rule.validator.as_ref()
1249 && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1250 {
1251 return true;
1252 }
1253 false
1254}