1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::AHashMap;
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// One match reported by a rule while scanning a single string.
///
/// The scanner later uses `start..end` to slice the scanned content, so the
/// two offsets are expected to delimit a non-empty range (the synchronous
/// emitter asserts `start != end`).
#[derive(Clone)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset just past the last matched position (exclusive).
    pub end: usize,
    /// Keyword attached to the match by the rule, if any.
    pub keyword: Option<String>,
}
60
61pub trait MatchEmitter<T = ()> {
62 fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65impl<F, T> MatchEmitter<T> for F
68where
69 F: FnMut(StringMatch) -> T,
70{
71 fn emit(&mut self, string_match: StringMatch) -> T {
72 (self)(string_match)
74 }
75}
76
77#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Default)]
86pub enum Precedence {
87 Catchall,
88 Generic,
89 #[default]
90 Specific,
91}
92
93#[serde_as]
94#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
95pub struct RootRuleConfig<T> {
96 pub match_action: MatchAction,
97 #[serde(default)]
98 pub scope: Scope,
99 #[deprecated(note = "Use `third_party_active_checker` instead")]
100 match_validation_type: Option<MatchValidationType>,
101 third_party_active_checker: Option<MatchValidationType>,
102 suppressions: Option<Suppressions>,
103 #[serde(default)]
104 precedence: Precedence,
105 #[serde(flatten)]
106 pub inner: T,
107}
108
109impl<T> RootRuleConfig<T>
110where
111 T: RuleConfig + 'static,
112{
113 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
114 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
115 }
116
117 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
118 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
119 }
120}
121
122impl<T> RootRuleConfig<T> {
123 pub fn new(inner: T) -> Self {
124 #[allow(deprecated)]
125 Self {
126 match_action: MatchAction::None,
127 scope: Scope::all(),
128 match_validation_type: None,
129 third_party_active_checker: None,
130 suppressions: None,
131 precedence: Precedence::default(),
132 inner,
133 }
134 }
135
136 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
137 #[allow(deprecated)]
138 RootRuleConfig {
139 match_action: self.match_action,
140 scope: self.scope,
141 match_validation_type: self.match_validation_type,
142 third_party_active_checker: self.third_party_active_checker,
143 suppressions: self.suppressions,
144 precedence: self.precedence,
145 inner: func(self.inner),
146 }
147 }
148
149 pub fn match_action(mut self, action: MatchAction) -> Self {
150 self.match_action = action;
151 self
152 }
153
154 pub fn precedence(mut self, precedence: Precedence) -> Self {
155 self.precedence = precedence;
156 self
157 }
158
159 pub fn scope(mut self, scope: Scope) -> Self {
160 self.scope = scope;
161 self
162 }
163
164 pub fn third_party_active_checker(
165 mut self,
166 match_validation_type: MatchValidationType,
167 ) -> Self {
168 self.third_party_active_checker = Some(match_validation_type);
169 self
170 }
171
172 pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
173 self.suppressions = Some(suppressions);
174 self
175 }
176
177 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
178 #[allow(deprecated)]
179 self.third_party_active_checker
180 .as_ref()
181 .or(self.match_validation_type.as_ref())
182 }
183}
184
185impl<T> Deref for RootRuleConfig<T> {
186 type Target = T;
187
188 fn deref(&self) -> &Self::Target {
189 &self.inner
190 }
191}
192pub struct RootCompiledRule {
193 pub inner: Box<dyn CompiledRule>,
194 pub scope: Scope,
195 pub match_action: MatchAction,
196 pub match_validation_type: Option<MatchValidationType>,
197 pub suppressions: Option<CompiledSuppressions>,
198 pub precedence: Precedence,
199}
200
201impl RootCompiledRule {
202 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
203 self.match_validation_type
204 .as_ref()
205 .map(|x| x.get_internal_match_validation_type())
206 }
207}
208
209impl Deref for RootCompiledRule {
210 type Target = dyn CompiledRule;
211
212 fn deref(&self) -> &Self::Target {
213 self.inner.as_ref()
214 }
215}
216
217pub struct StringMatchesCtx<'a> {
218 rule_index: usize,
219 pub regex_caches: &'a mut RegexCaches,
220 pub exclusion_check: &'a ExclusionCheck<'a>,
221 pub excluded_matches: &'a mut AHashMap<String, String>,
222 pub match_emitter: &'a mut dyn MatchEmitter,
223 pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
224 pub enable_debug_observability: bool,
225
226 pub per_string_data: &'a mut SharedData,
228 pub per_scanner_data: &'a SharedData,
229 pub per_event_data: &'a mut SharedData,
230 pub event_id: Option<&'a str>,
231}
232
233impl StringMatchesCtx<'_> {
234 pub fn process_async(
244 &self,
245 func: impl for<'a> FnOnce(
246 &'a mut AsyncStringMatchesCtx,
247 )
248 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
249 + Send
250 + 'static,
251 ) -> RuleResult {
252 let rule_index = self.rule_index;
253
254 let fut = TOKIO_RUNTIME.spawn(async move {
257 let start = Instant::now();
258 let mut ctx = AsyncStringMatchesCtx {
259 rule_matches: vec![],
260 };
261 (func)(&mut ctx).await?;
262 let io_duration = start.elapsed();
263
264 Ok(AsyncRuleInfo {
265 rule_index,
266 rule_matches: ctx.rule_matches,
267 io_duration,
268 })
269 });
270
271 Ok(RuleStatus::Pending(fut))
272 }
273}
274
275pub struct AsyncStringMatchesCtx {
276 rule_matches: Vec<StringMatch>,
277}
278
279impl AsyncStringMatchesCtx {
280 pub fn emit_match(&mut self, string_match: StringMatch) {
281 self.rule_matches.push(string_match);
282 }
283}
284
285#[must_use]
286pub enum RuleStatus {
287 Done,
288 Pending(PendingRuleResult),
289}
290
291pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
293
294pub struct PendingRuleJob {
295 fut: PendingRuleResult,
296 path: Path<'static>,
297}
298
299pub struct AsyncRuleInfo {
300 rule_index: usize,
301 rule_matches: Vec<StringMatch>,
302 io_duration: Duration,
303}
304
305pub type RuleResult = Result<RuleStatus, ScannerError>;
307
308pub trait CompiledRule: Send + Sync {
310 fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
311 }
313
314 fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
315 }
317
318 fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
319 }
321
322 fn get_string_matches(
323 &self,
324 content: &str,
325 path: &Path,
326 ctx: &mut StringMatchesCtx<'_>,
327 ) -> RuleResult;
328
329 fn should_exclude_multipass_v0(&self) -> bool {
332 false
334 }
335
336 fn on_excluded_match_multipass_v0(
337 &self,
338 _path: &Path,
339 _excluded_path: &str,
340 _enable_debug_observability: bool,
341 ) {
342 }
344
345 fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
346 None
347 }
348
349 fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
350 None
351 }
352
353 fn allow_scanner_to_exclude_namespace(&self) -> bool {
354 true
355 }
356}
357
358impl<T> RuleConfig for Box<T>
359where
360 T: RuleConfig + ?Sized,
361{
362 fn convert_to_compiled_rule(
363 &self,
364 rule_index: usize,
365 labels: Labels,
366 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
367 self.as_ref().convert_to_compiled_rule(rule_index, labels)
368 }
369}
370
/// Feature switches of a [`Scanner`], set through `ScannerBuilder`.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Forwarded to `ScopedRuleSet::with_implicit_index_wildcards`.
    pub add_implicit_index_wildcards: bool,
    /// Enables dropping matches listed in the excluded-matches map for rules
    /// that opt in through `should_exclude_multipass_v0`.
    pub multipass_v0_enabled: bool,
    /// When set, the matched text is copied into every returned match.
    pub return_matches: bool,
    /// Passed to rules through `StringMatchesCtx` and the exclusion callback.
    pub enable_debug_observability: bool,
}
378
379impl Default for ScannerFeatures {
380 fn default() -> Self {
381 Self {
382 add_implicit_index_wildcards: false,
383 multipass_v0_enabled: true,
384 return_matches: false,
385 enable_debug_observability: false,
386 }
387 }
388}
389
390pub struct ScanOptions {
391 pub blocked_rules_idx: Vec<usize>,
394 pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
396 pub validate_matches: bool,
399}
400
401impl Default for ScanOptions {
402 fn default() -> Self {
403 Self {
404 blocked_rules_idx: vec![],
405 wildcarded_indices: AHashMap::new(),
406 validate_matches: false,
407 }
408 }
409}
410
411pub struct ScanOptionBuilder {
412 blocked_rules_idx: Vec<usize>,
413 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
414 validate_matches: bool,
415}
416
417impl ScanOptionBuilder {
418 pub fn new() -> Self {
419 Self {
420 blocked_rules_idx: vec![],
421 wildcarded_indices: AHashMap::new(),
422 validate_matches: false,
423 }
424 }
425
426 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
427 self.blocked_rules_idx = blocked_rules_idx;
428 self
429 }
430
431 pub fn with_wildcarded_indices(
432 mut self,
433 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
434 ) -> Self {
435 self.wildcarded_indices = wildcarded_indices;
436 self
437 }
438
439 pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
440 self.validate_matches = validate_matches;
441 self
442 }
443
444 pub fn build(self) -> ScanOptions {
445 ScanOptions {
446 blocked_rules_idx: self.blocked_rules_idx,
447 wildcarded_indices: self.wildcarded_indices,
448 validate_matches: self.validate_matches,
449 }
450 }
451}
452
453pub struct Scanner {
454 rules: Vec<RootCompiledRule>,
455 scoped_ruleset: ScopedRuleSet,
456 scanner_features: ScannerFeatures,
457 metrics: ScannerMetrics,
458 labels: Labels,
459 match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
460 per_scanner_data: SharedData,
461 async_scan_timeout: Duration,
462}
463
464impl Scanner {
465 pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
466 ScannerBuilder::new(rules)
467 }
468
469 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
474 self.scan_with_options(event, ScanOptions::default())
475 }
476
477 pub fn scan_with_options<E: Event>(
482 &self,
483 event: &mut E,
484 options: ScanOptions,
485 ) -> Result<Vec<RuleMatch>, ScannerError> {
486 block_on(self.internal_scan_with_metrics(event, options))
487 }
488
489 pub async fn scan_async<E: Event>(
493 &self,
494 event: &mut E,
495 ) -> Result<Vec<RuleMatch>, ScannerError> {
496 self.scan_async_with_options(event, ScanOptions::default())
497 .await
498 }
499
500 pub async fn scan_async_with_options<E: Event>(
501 &self,
502 event: &mut E,
503 options: ScanOptions,
504 ) -> Result<Vec<RuleMatch>, ScannerError> {
505 let fut = self.internal_scan_with_metrics(event, options);
506
507 let timeout = {
510 let _tokio_guard = TOKIO_RUNTIME.enter();
511 timeout(self.async_scan_timeout, fut)
512 };
513
514 timeout.await.unwrap_or(Err(ScannerError::Transient(
515 "Async scan timeout".to_string(),
516 )))
517 }
518
519 fn record_metrics(
520 &self,
521 output_rule_matches: &[RuleMatch],
522 start: Instant,
523 io_duration: Option<Duration>,
524 ) {
525 self.metrics.num_scanned_events.increment(1);
527 self.metrics
529 .match_count
530 .increment(output_rule_matches.len() as u64);
531
532 if let Some(io_duration) = io_duration {
533 let total_duration = start.elapsed();
534 let cpu_duration = total_duration.saturating_sub(io_duration);
535 self.metrics
536 .cpu_duration
537 .increment(cpu_duration.as_nanos() as u64);
538 }
539 }
540
541 async fn internal_scan_with_metrics<E: Event>(
542 &self,
543 event: &mut E,
544 options: ScanOptions,
545 ) -> Result<Vec<RuleMatch>, ScannerError> {
546 let start = Instant::now();
547 let result = self.internal_scan(event, options).await;
548 match result {
549 Ok((rule_matches, io_duration)) => {
550 self.record_metrics(&rule_matches, start, Some(io_duration));
551 Ok(rule_matches)
552 }
553 Err(e) => {
554 self.record_metrics(&[], start, None);
555 Err(e)
556 }
557 }
558 }
559
560 fn process_rule_matches<E: Event>(
561 &self,
562 event: &mut E,
563 rule_matches: InternalRuleMatchSet<E::Encoding>,
564 excluded_matches: AHashMap<String, String>,
565 output_rule_matches: &mut Vec<RuleMatch>,
566 need_match_content: bool,
567 ) {
568 if rule_matches.is_empty() {
569 return;
570 }
571 access_regex_caches(|regex_caches| {
572 for (path, mut rule_matches) in rule_matches.into_iter() {
573 event.visit_string_mut(&path, |content| {
575 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
577
578 <<E as Event>::Encoding>::calculate_indices(
579 content,
580 rule_matches.iter_mut().map(
581 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
582 utf8_start: rule_match.utf8_start,
583 utf8_end: rule_match.utf8_end,
584 custom_start: &mut rule_match.custom_start,
585 custom_end: &mut rule_match.custom_end,
586 },
587 ),
588 );
589
590 if self.scanner_features.multipass_v0_enabled {
591 rule_matches.retain(|rule_match| {
594 if self.rules[rule_match.rule_index]
595 .inner
596 .should_exclude_multipass_v0()
597 {
598 let match_content =
599 &content[rule_match.utf8_start..rule_match.utf8_end];
600 let excluded_path = excluded_matches.get(match_content);
601 if let Some(excluded_path) = excluded_path {
602 self.rules[rule_match.rule_index]
603 .on_excluded_match_multipass_v0(
604 &path,
605 excluded_path,
606 self.scanner_features.enable_debug_observability,
607 );
608 }
609 excluded_path.is_none()
610 } else {
611 true
612 }
613 });
614 }
615
616 self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
617
618 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
619
620 let will_mutate = rule_matches.iter().any(|rule_match| {
621 self.rules[rule_match.rule_index].match_action.is_mutating()
622 });
623
624 self.apply_match_actions(
625 content,
626 &path,
627 rule_matches,
628 output_rule_matches,
629 need_match_content,
630 );
631
632 will_mutate
633 });
634 }
635 });
636 }
637
638 async fn internal_scan<E: Event>(
639 &self,
640 event: &mut E,
641 options: ScanOptions,
642 ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
643 let need_match_content = self.scanner_features.return_matches || options.validate_matches;
646 let mut rule_matches = InternalRuleMatchSet::new();
648 let mut excluded_matches = AHashMap::new();
649 let mut async_jobs = vec![];
650
651 access_regex_caches(|regex_caches| {
652 self.scoped_ruleset.visit_string_rule_combinations(
653 event,
654 ScannerContentVisitor {
655 scanner: self,
656 regex_caches,
657 rule_matches: &mut rule_matches,
658 blocked_rules: &options.blocked_rules_idx,
659 excluded_matches: &mut excluded_matches,
660 per_event_data: SharedData::new(),
661 wildcarded_indexes: &options.wildcarded_indices,
662 async_jobs: &mut async_jobs,
663 event_id: event.get_id().map(|s| s.to_string()),
664 },
665 )
666 })?;
667
668 let mut total_io_duration = Duration::ZERO;
671 for job in async_jobs {
672 let rule_info = job.fut.await.unwrap()?;
673 total_io_duration += rule_info.io_duration;
674 rule_matches.push_async_matches(
675 &job.path,
676 rule_info
677 .rule_matches
678 .into_iter()
679 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
680 );
681 }
682
683 let mut output_rule_matches = vec![];
684
685 self.process_rule_matches(
686 event,
687 rule_matches,
688 excluded_matches,
689 &mut output_rule_matches,
690 need_match_content,
691 );
692
693 if options.validate_matches {
694 self.validate_matches(&mut output_rule_matches);
695 }
696
697 Ok((output_rule_matches, total_io_duration))
698 }
699
700 pub fn suppress_matches<E: Encoding>(
701 &self,
702 rule_matches: &mut Vec<InternalRuleMatch<E>>,
703 content: &str,
704 regex_caches: &mut RegexCaches,
705 ) {
706 rule_matches.retain(|rule_match| {
707 if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
708 let match_should_be_suppressed = suppressions.should_match_be_suppressed(
709 &content[rule_match.utf8_start..rule_match.utf8_end],
710 regex_caches,
711 );
712
713 if match_should_be_suppressed {
714 self.metrics.suppressed_match_count.increment(1);
715 }
716 !match_should_be_suppressed
717 } else {
718 true
719 }
720 });
721 }
722
723 pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
724 let mut match_validator_rule_match_per_type = AHashMap::new();
726
727 let mut validated_rule_matches = vec![];
728
729 for mut rule_match in rule_matches.drain(..) {
730 let rule = &self.rules[rule_match.rule_index];
731 if let Some(match_validation_type) = rule.internal_match_validation_type() {
732 match_validator_rule_match_per_type
733 .entry(match_validation_type)
734 .or_insert_with(Vec::new)
735 .push(rule_match)
736 } else {
737 rule_match.match_status.merge(MatchStatus::NotAvailable);
739 validated_rule_matches.push(rule_match);
740 }
741 }
742
743 RAYON_THREAD_POOL.install(|| {
744 use rayon::prelude::*;
745
746 match_validator_rule_match_per_type.par_iter_mut().for_each(
747 |(match_validation_type, matches_per_type)| {
748 let match_validator = self.match_validators_per_type.get(match_validation_type);
749 if let Some(match_validator) = match_validator {
750 match_validator
751 .as_ref()
752 .validate(matches_per_type, &self.rules)
753 }
754 },
755 );
756 });
757
758 for (_, mut matches) in match_validator_rule_match_per_type {
760 validated_rule_matches.append(&mut matches);
761 }
762
763 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
765 *rule_matches = validated_rule_matches;
766 }
767
768 fn apply_match_actions<E: Encoding>(
771 &self,
772 content: &mut String,
773 path: &Path<'static>,
774 rule_matches: Vec<InternalRuleMatch<E>>,
775 output_rule_matches: &mut Vec<RuleMatch>,
776 need_match_content: bool,
777 ) {
778 let mut utf8_byte_delta: isize = 0;
779 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
780
781 for rule_match in rule_matches {
782 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
783 content,
784 path.clone(),
785 rule_match,
786 &mut utf8_byte_delta,
787 &mut custom_index_delta,
788 need_match_content,
789 ));
790 }
791 }
792
793 fn apply_match_actions_for_string<E: Encoding>(
795 &self,
796 content: &mut String,
797 path: Path<'static>,
798 rule_match: InternalRuleMatch<E>,
799 utf8_byte_delta: &mut isize,
801
802 custom_index_delta: &mut <E>::IndexShift,
804 need_match_content: bool,
805 ) -> RuleMatch {
806 let rule = &self.rules[rule_match.rule_index];
807
808 let custom_start =
809 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
810 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
811
812 let mut matched_content_copy = None;
813
814 if need_match_content {
815 let mutated_utf8_match_start =
817 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
818 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
819
820 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
822 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
823
824 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
825 matched_content_copy = Some(matched_content.to_string());
826 }
827
828 if rule.match_action.is_mutating() {
829 let mutated_utf8_match_start =
830 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
831 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
832
833 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
835 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
836
837 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
838 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
839 let before_replacement = &matched_content[replacement.start..replacement.end];
840
841 <E>::adjust_shift(
843 custom_index_delta,
844 before_replacement,
845 &replacement.replacement,
846 );
847 *utf8_byte_delta +=
848 replacement.replacement.len() as isize - before_replacement.len() as isize;
849
850 let replacement_start = mutated_utf8_match_start + replacement.start;
851 let replacement_end = mutated_utf8_match_start + replacement.end;
852 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
853 }
854 }
855
856 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
857 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
858 + shift_offset) as usize;
859
860 let rule = &self.rules[rule_match.rule_index];
861
862 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
863 MatchStatus::NotChecked
864 } else {
865 MatchStatus::NotAvailable
866 };
867
868 RuleMatch {
869 rule_index: rule_match.rule_index,
870 path,
871 replacement_type: rule.match_action.replacement_type(),
872 start_index: custom_start,
873 end_index_exclusive: custom_end,
874 shift_offset,
875 match_value: matched_content_copy,
876 match_status,
877 keyword: rule_match.keyword,
878 }
879 }
880
881 fn sort_and_remove_overlapping_rules<E: Encoding>(
882 &self,
883 rule_matches: &mut Vec<InternalRuleMatch<E>>,
884 ) {
885 rule_matches.sort_unstable_by(|a, b| {
889 let ord = self.rules[a.rule_index]
891 .match_action
892 .is_mutating()
893 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
894 .reverse();
895
896 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
898
899 let ord = ord.then(a.len().cmp(&b.len()).reverse());
901
902 let ord = ord.then(
904 self.rules[a.rule_index]
905 .precedence
906 .cmp(&self.rules[b.rule_index].precedence)
907 .reverse(),
908 );
909
910 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
912
913 ord.reverse()
915 });
916
917 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
918
919 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
920 if self.rules[rule_match.rule_index].match_action.is_mutating() {
921 if let Some(last) = retained_rules.last()
923 && last.utf8_end > rule_match.utf8_start
924 {
925 continue;
926 }
927 } else {
928 for retained_rule in &retained_rules {
931 if retained_rule.utf8_start < rule_match.utf8_end
932 && retained_rule.utf8_end > rule_match.utf8_start
933 {
934 continue 'rule_matches;
935 }
936 }
937 };
938 retained_rules.push(rule_match);
939 }
940
941 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
943
944 *rule_matches = retained_rules;
945 }
946}
947
948impl Drop for Scanner {
949 fn drop(&mut self) {
950 let stats = &*GLOBAL_STATS;
951 stats.scanner_deletions.increment(1);
952 stats.decrement_total_scanners();
953 }
954}
955
956#[derive(Default)]
957pub struct ScannerBuilder<'a> {
958 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
959 labels: Labels,
960 scanner_features: ScannerFeatures,
961 async_scan_timeout: Duration,
962}
963
964impl ScannerBuilder<'_> {
965 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
966 ScannerBuilder {
967 rules,
968 labels: Labels::empty(),
969 scanner_features: ScannerFeatures::default(),
970 async_scan_timeout: Duration::from_secs(60 * 5),
971 }
972 }
973
974 pub fn labels(mut self, labels: Labels) -> Self {
975 self.labels = labels;
976 self
977 }
978
979 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
980 self.async_scan_timeout = duration;
981 self
982 }
983
984 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
985 self.scanner_features.add_implicit_index_wildcards = value;
986 self
987 }
988
989 pub fn with_return_matches(mut self, value: bool) -> Self {
990 self.scanner_features.return_matches = value;
991 self
992 }
993
994 pub fn with_multipass_v0(mut self, value: bool) -> Self {
998 self.scanner_features.multipass_v0_enabled = value;
999 self
1000 }
1001
1002 pub fn with_debug_observability(mut self, value: bool) -> Self {
1006 self.scanner_features.enable_debug_observability = value;
1007 self
1008 }
1009
1010 pub fn build(self) -> Result<Scanner, CreateScannerError> {
1011 let mut match_validators_per_type = AHashMap::new();
1012
1013 for rule in self.rules.iter() {
1014 if let Some(match_validation_type) = &rule.get_third_party_active_checker()
1015 && match_validation_type.can_create_match_validator()
1016 {
1017 let internal_type = match_validation_type.get_internal_match_validation_type();
1018 let match_validator = match_validation_type.into_match_validator();
1019 if let Ok(match_validator) = match_validator {
1020 if !match_validators_per_type.contains_key(&internal_type) {
1021 match_validators_per_type.insert(internal_type, match_validator);
1022 }
1023 } else {
1024 return Err(CreateScannerError::InvalidMatchValidator(
1025 MatchValidatorCreationError::InternalError,
1026 ));
1027 }
1028 }
1029 }
1030
1031 let compiled_rules = self
1032 .rules
1033 .iter()
1034 .enumerate()
1035 .map(|(rule_index, config)| {
1036 let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1037 config.match_action.validate()?;
1038 let compiled_suppressions = match &config.suppressions {
1039 Some(s) => s.compile()?,
1040 None => None,
1041 };
1042 Ok(RootCompiledRule {
1043 inner,
1044 scope: config.scope.clone(),
1045 match_action: config.match_action.clone(),
1046 match_validation_type: config.get_third_party_active_checker().cloned(),
1047 suppressions: compiled_suppressions,
1048 precedence: config.precedence,
1049 })
1050 })
1051 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1052
1053 let mut per_scanner_data = SharedData::new();
1054
1055 compiled_rules.iter().for_each(|rule| {
1056 rule.init_per_scanner_data(&mut per_scanner_data);
1057 });
1058
1059 let scoped_ruleset = ScopedRuleSet::new(
1060 &compiled_rules
1061 .iter()
1062 .map(|rule| rule.scope.clone())
1063 .collect::<Vec<_>>(),
1064 )
1065 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1066
1067 {
1068 let stats = &*GLOBAL_STATS;
1069 stats.scanner_creations.increment(1);
1070 stats.increment_total_scanners();
1071 }
1072
1073 Ok(Scanner {
1074 rules: compiled_rules,
1075 scoped_ruleset,
1076 scanner_features: self.scanner_features,
1077 metrics: ScannerMetrics::new(&self.labels),
1078 match_validators_per_type,
1079 labels: self.labels,
1080 per_scanner_data,
1081 async_scan_timeout: self.async_scan_timeout,
1082 })
1083 }
1084}
1085
1086struct ScannerContentVisitor<'a, E: Encoding> {
1087 scanner: &'a Scanner,
1088 regex_caches: &'a mut RegexCaches,
1089 rule_matches: &'a mut InternalRuleMatchSet<E>,
1090 blocked_rules: &'a Vec<usize>,
1093 excluded_matches: &'a mut AHashMap<String, String>,
1094 per_event_data: SharedData,
1095 wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1096 async_jobs: &'a mut Vec<PendingRuleJob>,
1097 event_id: Option<String>,
1098}
1099
1100impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1101 fn visit_content<'b>(
1102 &'b mut self,
1103 path: &Path<'a>,
1104 content: &str,
1105 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1106 exclusion_check: ExclusionCheck<'b>,
1107 ) -> Result<bool, ScannerError> {
1108 let mut path_rules_matches = vec![];
1110
1111 let mut per_string_data = SharedData::new();
1113 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1114
1115 rule_visitor.visit_rule_indices(|rule_index| {
1116 if self.blocked_rules.contains(&rule_index) {
1117 return Ok(());
1118 }
1119 let rule = &self.scanner.rules[rule_index];
1120 {
1121 if rule.inner.allow_scanner_to_exclude_namespace() {
1122 if exclusion_check.is_excluded(rule_index) {
1124 return Ok(());
1125 }
1126 }
1127 let mut emitter = |rule_match: StringMatch| {
1129 assert_ne!(
1132 rule_match.start, rule_match.end,
1133 "empty match detected on rule with index {rule_index}"
1134 );
1135 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1136 };
1137
1138 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1139
1140 rule.init_per_event_data(&mut self.per_event_data);
1142
1143 let mut ctx = StringMatchesCtx {
1144 rule_index,
1145 regex_caches: self.regex_caches,
1146 exclusion_check: &exclusion_check,
1147 excluded_matches: self.excluded_matches,
1148 match_emitter: &mut emitter,
1149 wildcard_indices: wildcard_indices_per_path,
1150 enable_debug_observability: self
1151 .scanner
1152 .scanner_features
1153 .enable_debug_observability,
1154 per_string_data: &mut per_string_data,
1155 per_scanner_data: &self.scanner.per_scanner_data,
1156 per_event_data: &mut self.per_event_data,
1157 event_id: self.event_id.as_deref(),
1158 };
1159
1160 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1161
1162 match async_status {
1163 RuleStatus::Done => {
1164 }
1166 RuleStatus::Pending(fut) => {
1167 self.async_jobs.push(PendingRuleJob {
1168 fut,
1169 path: path.into_static(),
1170 });
1171 }
1172 }
1173 }
1174 Ok(())
1175 })?;
1176
1177 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1182
1183 self.rule_matches
1184 .push_sync_matches(path, path_rules_matches);
1185
1186 Ok(needs_to_access_content)
1187 }
1188}
1189
/// Returns the byte offset of the character following the one at
/// `regex_match.0`, or `None` when that character is the last one of
/// `content` (or there is none).
///
/// Only the start of `regex_match` is used.
///
/// # Panics
///
/// Panics if `regex_match.0` is past the end of `content` or not on a
/// character boundary.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let match_start = regex_match.0;
    content[match_start..]
        .char_indices()
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1200
1201fn is_false_positive_match(
1202 regex_match_range: (usize, usize),
1203 rule: &RegexCompiledRule,
1204 content: &str,
1205 check_excluded_keywords: bool,
1206) -> bool {
1207 if check_excluded_keywords
1208 && let Some(excluded_keywords) = &rule.excluded_keywords
1209 && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1210 {
1211 return true;
1212 }
1213
1214 if let Some(validator) = rule.validator.as_ref()
1215 && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1216 {
1217 return true;
1218 }
1219 false
1220}