1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{CreateScannerError, EncodeIndices, MatchAction, Path, ScannerError};
28use ahash::{AHashMap, AHashSet};
29use futures::executor::block_on;
30use serde::{Deserialize, Serialize};
31use serde_with::serde_as;
32use std::ops::Deref;
33use std::pin::Pin;
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::task::JoinHandle;
37use tokio::time::timeout;
38
39pub mod config;
40pub mod debug_scan;
41pub mod error;
42pub mod metrics;
43pub mod regex_rule;
44pub mod scope;
45pub mod shared_data;
46pub mod shared_pool;
47pub mod suppression;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// A single match reported by a rule for one string.
///
/// The scanner rejects empty matches (`start == end`) when collecting them. The offsets are
/// presumably UTF-8 byte positions with an exclusive `end`, since they are later used to
/// slice the scanned content — confirm against `InternalRuleMatch::new`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct StringMatch {
    /// Position in the scanned string where the match begins.
    pub start: usize,
    /// Position in the scanned string where the match ends.
    pub end: usize,
    /// Keyword associated with the match, when the rule reports one.
    pub keyword: Option<String>,
}
60
61pub trait MatchEmitter<T = ()> {
62 fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65impl<F, T> MatchEmitter<T> for F
68where
69 F: FnMut(StringMatch) -> T,
70{
71 fn emit(&mut self, string_match: StringMatch) -> T {
72 (self)(string_match)
74 }
75}
76
/// Relative priority of a rule.
///
/// The derived `Ord` follows declaration order (`Catchall < Generic < Specific`). The scanner
/// compares precedences as one of the tie-breakers when choosing between overlapping matches.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Default)]
pub enum Precedence {
    /// Lowest value in the derived ordering.
    Catchall,
    /// Middle value in the derived ordering.
    Generic,
    /// Highest value in the derived ordering; also the default.
    #[default]
    Specific,
}
92
/// Settings shared by every kind of rule, wrapping the rule-specific configuration `T`.
///
/// `inner` is marked `#[serde(flatten)]`, so the rule-specific fields are (de)serialized at
/// the same level as the shared ones.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Action applied to the matches of this rule.
    pub match_action: MatchAction,
    /// Scope of the rule; falls back to `Scope::default()` when missing from the input.
    #[serde(default)]
    pub scope: Scope,
    /// Legacy field, only consulted when `third_party_active_checker` is `None`.
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    /// Match validation configuration; takes priority over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    /// Optional suppressions, compiled when the scanner is built.
    suppressions: Option<Suppressions>,
    /// Rule precedence; falls back to `Precedence::default()` when missing from the input.
    #[serde(default)]
    precedence: Precedence,
    /// Rule-specific configuration.
    #[serde(flatten)]
    pub inner: T,
}
108
109impl<T> RootRuleConfig<T>
110where
111 T: RuleConfig + 'static,
112{
113 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
114 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
115 }
116
117 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
118 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
119 }
120}
121
122impl<T> RootRuleConfig<T> {
123 pub fn new(inner: T) -> Self {
124 #[allow(deprecated)]
125 Self {
126 match_action: MatchAction::None,
127 scope: Scope::all(),
128 match_validation_type: None,
129 third_party_active_checker: None,
130 suppressions: None,
131 precedence: Precedence::default(),
132 inner,
133 }
134 }
135
136 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
137 #[allow(deprecated)]
138 RootRuleConfig {
139 match_action: self.match_action,
140 scope: self.scope,
141 match_validation_type: self.match_validation_type,
142 third_party_active_checker: self.third_party_active_checker,
143 suppressions: self.suppressions,
144 precedence: self.precedence,
145 inner: func(self.inner),
146 }
147 }
148
149 pub fn match_action(mut self, action: MatchAction) -> Self {
150 self.match_action = action;
151 self
152 }
153
154 pub fn precedence(mut self, precedence: Precedence) -> Self {
155 self.precedence = precedence;
156 self
157 }
158
159 pub fn scope(mut self, scope: Scope) -> Self {
160 self.scope = scope;
161 self
162 }
163
164 pub fn third_party_active_checker(
165 mut self,
166 match_validation_type: MatchValidationType,
167 ) -> Self {
168 self.third_party_active_checker = Some(match_validation_type);
169 self
170 }
171
172 pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
173 self.suppressions = Some(suppressions);
174 self
175 }
176
177 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
178 #[allow(deprecated)]
179 self.third_party_active_checker
180 .as_ref()
181 .or(self.match_validation_type.as_ref())
182 }
183}
184
185impl<T> Deref for RootRuleConfig<T> {
186 type Target = T;
187
188 fn deref(&self) -> &Self::Target {
189 &self.inner
190 }
191}
/// A compiled rule together with the shared settings copied from its `RootRuleConfig`.
pub struct RootCompiledRule {
    /// Rule-specific compiled implementation.
    pub inner: Box<dyn CompiledRule>,
    /// Scope of the rule.
    pub scope: Scope,
    /// Action applied to the matches of this rule.
    pub match_action: MatchAction,
    /// Match validation configuration, if any.
    pub match_validation_type: Option<MatchValidationType>,
    /// Compiled suppressions, if any.
    pub suppressions: Option<CompiledSuppressions>,
    /// Precedence used when resolving overlapping matches.
    pub precedence: Precedence,
}
200
201impl RootCompiledRule {
202 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
203 self.match_validation_type
204 .as_ref()
205 .map(|x| x.get_internal_match_validation_type())
206 }
207}
208
209impl Deref for RootCompiledRule {
210 type Target = dyn CompiledRule;
211
212 fn deref(&self) -> &Self::Target {
213 self.inner.as_ref()
214 }
215}
216
/// Everything a rule receives when asked for the matches of one string.
pub struct StringMatchesCtx<'a> {
    /// Index of the rule being run; copied into the result of asynchronous work.
    rule_index: usize,
    /// Regex caches made available to the rule.
    pub regex_caches: &'a mut RegexCaches,
    /// Exclusion check for the string being visited.
    pub exclusion_check: &'a ExclusionCheck<'a>,
    /// Set of excluded match strings; the scanner later drops matches whose text is in it
    /// (for rules that opt in through `should_exclude_multipass_v0`).
    pub excluded_matches: &'a mut AHashSet<String>,
    /// Receives the matches found synchronously.
    pub match_emitter: &'a mut dyn MatchEmitter,
    /// Wildcarded index ranges registered for the current path, if any.
    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,

    /// Data created fresh for each visited string.
    pub per_string_data: &'a mut SharedData,
    /// Data initialized once when the scanner is built.
    pub per_scanner_data: &'a SharedData,
    /// Data created fresh for each scanned event.
    pub per_event_data: &'a mut SharedData,
    /// Identifier of the event being scanned, when the event provides one.
    pub event_id: Option<&'a str>,
}
231
232impl StringMatchesCtx<'_> {
233 pub fn process_async(
243 &self,
244 func: impl for<'a> FnOnce(
245 &'a mut AsyncStringMatchesCtx,
246 )
247 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
248 + Send
249 + 'static,
250 ) -> RuleResult {
251 let rule_index = self.rule_index;
252
253 let fut = TOKIO_RUNTIME.spawn(async move {
256 let start = Instant::now();
257 let mut ctx = AsyncStringMatchesCtx {
258 rule_matches: vec![],
259 };
260 (func)(&mut ctx).await?;
261 let io_duration = start.elapsed();
262
263 Ok(AsyncRuleInfo {
264 rule_index,
265 rule_matches: ctx.rule_matches,
266 io_duration,
267 })
268 });
269
270 Ok(RuleStatus::Pending(fut))
271 }
272}
273
/// Context handed to asynchronous rule work; collects the matches it emits.
pub struct AsyncStringMatchesCtx {
    /// Matches emitted so far, in emission order.
    rule_matches: Vec<StringMatch>,
}
277
278impl AsyncStringMatchesCtx {
279 pub fn emit_match(&mut self, string_match: StringMatch) {
280 self.rule_matches.push(string_match);
281 }
282}
283
/// Outcome of asking a rule for the matches of one string.
#[must_use]
pub enum RuleStatus {
    /// The rule finished; no asynchronous work is outstanding.
    Done,
    /// The rule started asynchronous work whose result must be awaited.
    Pending(PendingRuleResult),
}
289
/// Handle to a spawned rule task that resolves to its matches or a scanner error.
pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
292
/// A pending rule task paired with the path of the string it was started for.
pub struct PendingRuleJob {
    /// Handle of the spawned task.
    fut: PendingRuleResult,
    /// Path of the string the task is scanning.
    path: Path<'static>,
}
297
/// Result produced by a completed asynchronous rule task.
pub struct AsyncRuleInfo {
    /// Index of the rule that produced the matches.
    rule_index: usize,
    /// Matches emitted by the task.
    rule_matches: Vec<StringMatch>,
    /// Time the task spent running the rule's asynchronous work.
    io_duration: Duration,
}
303
/// Result of `CompiledRule::get_string_matches`: a status on success, a scanner error otherwise.
pub type RuleResult = Result<RuleStatus, ScannerError>;
306
/// Behaviour every compiled rule exposes to the scanner.
///
/// All methods except `get_string_matches` have default implementations.
pub trait CompiledRule: Send + Sync {
    /// Called once per rule while the scanner is built. Does nothing by default.
    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
    }

    /// Called before the rule runs on a string, with that string's fresh data. Does nothing
    /// by default.
    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
    }

    /// Called before the rule runs on a string, with the current event's data. Does nothing
    /// by default.
    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
    }

    /// Finds the matches of this rule in `content` (located at `path`).
    ///
    /// Synchronous matches are reported through `ctx.match_emitter`; asynchronous work is
    /// reported by returning `RuleStatus::Pending`.
    fn get_string_matches(
        &self,
        content: &str,
        path: &Path,
        ctx: &mut StringMatchesCtx<'_>,
    ) -> RuleResult;

    /// Whether the scanner should drop this rule's matches whose text is in the excluded
    /// matches set. `false` by default.
    fn should_exclude_multipass_v0(&self) -> bool {
        false
    }

    /// Called each time one of this rule's matches is dropped by that exclusion. Does nothing
    /// by default.
    fn on_excluded_match_multipass_v0(&self) {
    }

    /// Returns the rule as a regex rule, if it is one. `None` by default.
    fn as_regex_rule(&self) -> Option<&RegexCompiledRule> {
        None
    }

    /// Mutable variant of `as_regex_rule`. `None` by default.
    fn as_regex_rule_mut(&mut self) -> Option<&mut RegexCompiledRule> {
        None
    }

    /// Whether the scanner may skip this rule when the exclusion check reports it as
    /// excluded. `true` by default.
    fn allow_scanner_to_exclude_namespace(&self) -> bool {
        true
    }
}
351
352impl<T> RuleConfig for Box<T>
353where
354 T: RuleConfig + ?Sized,
355{
356 fn convert_to_compiled_rule(
357 &self,
358 rule_index: usize,
359 labels: Labels,
360 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
361 self.as_ref().convert_to_compiled_rule(rule_index, labels)
362 }
363}
364
/// Feature switches of a scanner, set through `ScannerBuilder`.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Forwarded to `ScopedRuleSet::with_implicit_index_wildcards` at build time.
    pub add_implicit_index_wildcards: bool,
    /// Enables dropping matches whose text is in the excluded matches set.
    pub multipass_v0_enabled: bool,
    /// When set, the matched text is copied into each returned `RuleMatch`.
    pub return_matches: bool,
}
371
372impl Default for ScannerFeatures {
373 fn default() -> Self {
374 Self {
375 add_implicit_index_wildcards: false,
376 multipass_v0_enabled: true,
377 return_matches: false,
378 }
379 }
380}
381
/// Per-scan options.
pub struct ScanOptions {
    /// Indices of the rules to skip for this scan.
    pub blocked_rules_idx: Vec<usize>,
    /// Wildcarded index ranges per path, handed to the rules through `StringMatchesCtx`.
    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
    /// Runs match validation on the results; also causes matched text to be captured.
    pub validate_matches: bool,
}
392
393impl Default for ScanOptions {
394 fn default() -> Self {
395 Self {
396 blocked_rules_idx: vec![],
397 wildcarded_indices: AHashMap::new(),
398 validate_matches: false,
399 }
400 }
401}
402
403pub struct ScanOptionBuilder {
404 blocked_rules_idx: Vec<usize>,
405 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
406 validate_matches: bool,
407}
408
409impl ScanOptionBuilder {
410 pub fn new() -> Self {
411 Self {
412 blocked_rules_idx: vec![],
413 wildcarded_indices: AHashMap::new(),
414 validate_matches: false,
415 }
416 }
417
418 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
419 self.blocked_rules_idx = blocked_rules_idx;
420 self
421 }
422
423 pub fn with_wildcarded_indices(
424 mut self,
425 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
426 ) -> Self {
427 self.wildcarded_indices = wildcarded_indices;
428 self
429 }
430
431 pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
432 self.validate_matches = validate_matches;
433 self
434 }
435
436 pub fn build(self) -> ScanOptions {
437 ScanOptions {
438 blocked_rules_idx: self.blocked_rules_idx,
439 wildcarded_indices: self.wildcarded_indices,
440 validate_matches: self.validate_matches,
441 }
442 }
443}
444
/// Scans events with a fixed set of compiled rules.
pub struct Scanner {
    /// Compiled rules, indexed by rule index.
    rules: Vec<RootCompiledRule>,
    /// Built from the rule scopes; drives which rules visit which strings.
    scoped_ruleset: ScopedRuleSet,
    /// Feature switches chosen at build time.
    scanner_features: ScannerFeatures,
    /// Metrics recorded while scanning.
    metrics: ScannerMetrics,
    /// Labels given at build time; passed to rules and used to create the metrics.
    labels: Labels,
    /// One match validator per internal validation type used by the rules.
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    /// Data initialized by the rules once, at build time.
    per_scanner_data: SharedData,
    /// Upper bound on the duration of an asynchronous scan.
    async_scan_timeout: Duration,
}
455
456impl Scanner {
457 pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
458 ScannerBuilder::new(rules)
459 }
460
461 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
466 self.scan_with_options(event, ScanOptions::default())
467 }
468
469 pub fn scan_with_options<E: Event>(
474 &self,
475 event: &mut E,
476 options: ScanOptions,
477 ) -> Result<Vec<RuleMatch>, ScannerError> {
478 block_on(self.internal_scan_with_metrics(event, options))
479 }
480
481 pub async fn scan_async<E: Event>(
485 &self,
486 event: &mut E,
487 ) -> Result<Vec<RuleMatch>, ScannerError> {
488 self.scan_async_with_options(event, ScanOptions::default())
489 .await
490 }
491
492 pub async fn scan_async_with_options<E: Event>(
493 &self,
494 event: &mut E,
495 options: ScanOptions,
496 ) -> Result<Vec<RuleMatch>, ScannerError> {
497 let fut = self.internal_scan_with_metrics(event, options);
498
499 let timeout = {
502 let _tokio_guard = TOKIO_RUNTIME.enter();
503 timeout(self.async_scan_timeout, fut)
504 };
505
506 timeout.await.unwrap_or(Err(ScannerError::Transient(
507 "Async scan timeout".to_string(),
508 )))
509 }
510
511 fn record_metrics(
512 &self,
513 output_rule_matches: &[RuleMatch],
514 start: Instant,
515 io_duration: Option<Duration>,
516 ) {
517 self.metrics.num_scanned_events.increment(1);
519 self.metrics
521 .match_count
522 .increment(output_rule_matches.len() as u64);
523
524 if let Some(io_duration) = io_duration {
525 let total_duration = start.elapsed();
526 let cpu_duration = total_duration.saturating_sub(io_duration);
527 self.metrics
528 .cpu_duration
529 .increment(cpu_duration.as_nanos() as u64);
530 }
531 }
532
533 async fn internal_scan_with_metrics<E: Event>(
534 &self,
535 event: &mut E,
536 options: ScanOptions,
537 ) -> Result<Vec<RuleMatch>, ScannerError> {
538 let start = Instant::now();
539 let result = self.internal_scan(event, options).await;
540 match result {
541 Ok((rule_matches, io_duration)) => {
542 self.record_metrics(&rule_matches, start, Some(io_duration));
543 Ok(rule_matches)
544 }
545 Err(e) => {
546 self.record_metrics(&[], start, None);
547 Err(e)
548 }
549 }
550 }
551
552 fn process_rule_matches<E: Event>(
553 &self,
554 event: &mut E,
555 rule_matches: InternalRuleMatchSet<E::Encoding>,
556 excluded_matches: AHashSet<String>,
557 output_rule_matches: &mut Vec<RuleMatch>,
558 need_match_content: bool,
559 ) {
560 if rule_matches.is_empty() {
561 return;
562 }
563 access_regex_caches(|regex_caches| {
564 for (path, mut rule_matches) in rule_matches.into_iter() {
565 event.visit_string_mut(&path, |content| {
567 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
569
570 <<E as Event>::Encoding>::calculate_indices(
571 content,
572 rule_matches.iter_mut().map(
573 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
574 utf8_start: rule_match.utf8_start,
575 utf8_end: rule_match.utf8_end,
576 custom_start: &mut rule_match.custom_start,
577 custom_end: &mut rule_match.custom_end,
578 },
579 ),
580 );
581
582 if self.scanner_features.multipass_v0_enabled {
583 rule_matches.retain(|rule_match| {
586 if self.rules[rule_match.rule_index]
587 .inner
588 .should_exclude_multipass_v0()
589 {
590 let is_false_positive = excluded_matches
591 .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
592 if is_false_positive && self.scanner_features.multipass_v0_enabled {
593 self.rules[rule_match.rule_index]
594 .on_excluded_match_multipass_v0();
595 }
596 !is_false_positive
597 } else {
598 true
599 }
600 });
601 }
602
603 self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
604
605 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
606
607 let will_mutate = rule_matches.iter().any(|rule_match| {
608 self.rules[rule_match.rule_index].match_action.is_mutating()
609 });
610
611 self.apply_match_actions(
612 content,
613 &path,
614 rule_matches,
615 output_rule_matches,
616 need_match_content,
617 );
618
619 will_mutate
620 });
621 }
622 });
623 }
624
625 async fn internal_scan<E: Event>(
626 &self,
627 event: &mut E,
628 options: ScanOptions,
629 ) -> Result<(Vec<RuleMatch>, Duration), ScannerError> {
630 let need_match_content = self.scanner_features.return_matches || options.validate_matches;
633 let mut rule_matches = InternalRuleMatchSet::new();
635 let mut excluded_matches = AHashSet::new();
636 let mut async_jobs = vec![];
637
638 access_regex_caches(|regex_caches| {
639 self.scoped_ruleset.visit_string_rule_combinations(
640 event,
641 ScannerContentVisitor {
642 scanner: self,
643 regex_caches,
644 rule_matches: &mut rule_matches,
645 blocked_rules: &options.blocked_rules_idx,
646 excluded_matches: &mut excluded_matches,
647 per_event_data: SharedData::new(),
648 wildcarded_indexes: &options.wildcarded_indices,
649 async_jobs: &mut async_jobs,
650 event_id: event.get_id().map(|s| s.to_string()),
651 },
652 )
653 })?;
654
655 let mut total_io_duration = Duration::ZERO;
658 for job in async_jobs {
659 let rule_info = job.fut.await.unwrap()?;
660 total_io_duration += rule_info.io_duration;
661 rule_matches.push_async_matches(
662 &job.path,
663 rule_info
664 .rule_matches
665 .into_iter()
666 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
667 );
668 }
669
670 let mut output_rule_matches = vec![];
671
672 self.process_rule_matches(
673 event,
674 rule_matches,
675 excluded_matches,
676 &mut output_rule_matches,
677 need_match_content,
678 );
679
680 if options.validate_matches {
681 self.validate_matches(&mut output_rule_matches);
682 }
683
684 Ok((output_rule_matches, total_io_duration))
685 }
686
687 pub fn suppress_matches<E: Encoding>(
688 &self,
689 rule_matches: &mut Vec<InternalRuleMatch<E>>,
690 content: &str,
691 regex_caches: &mut RegexCaches,
692 ) {
693 rule_matches.retain(|rule_match| {
694 if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
695 let match_should_be_suppressed = suppressions.should_match_be_suppressed(
696 &content[rule_match.utf8_start..rule_match.utf8_end],
697 regex_caches,
698 );
699
700 if match_should_be_suppressed {
701 self.metrics.suppressed_match_count.increment(1);
702 }
703 !match_should_be_suppressed
704 } else {
705 true
706 }
707 });
708 }
709
710 pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
711 let mut match_validator_rule_match_per_type = AHashMap::new();
713
714 let mut validated_rule_matches = vec![];
715
716 for mut rule_match in rule_matches.drain(..) {
717 let rule = &self.rules[rule_match.rule_index];
718 if let Some(match_validation_type) = rule.internal_match_validation_type() {
719 match_validator_rule_match_per_type
720 .entry(match_validation_type)
721 .or_insert_with(Vec::new)
722 .push(rule_match)
723 } else {
724 rule_match.match_status.merge(MatchStatus::NotAvailable);
726 validated_rule_matches.push(rule_match);
727 }
728 }
729
730 RAYON_THREAD_POOL.install(|| {
731 use rayon::prelude::*;
732
733 match_validator_rule_match_per_type.par_iter_mut().for_each(
734 |(match_validation_type, matches_per_type)| {
735 let match_validator = self.match_validators_per_type.get(match_validation_type);
736 if let Some(match_validator) = match_validator {
737 match_validator
738 .as_ref()
739 .validate(matches_per_type, &self.rules)
740 }
741 },
742 );
743 });
744
745 for (_, mut matches) in match_validator_rule_match_per_type {
747 validated_rule_matches.append(&mut matches);
748 }
749
750 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
752 *rule_matches = validated_rule_matches;
753 }
754
755 fn apply_match_actions<E: Encoding>(
758 &self,
759 content: &mut String,
760 path: &Path<'static>,
761 rule_matches: Vec<InternalRuleMatch<E>>,
762 output_rule_matches: &mut Vec<RuleMatch>,
763 need_match_content: bool,
764 ) {
765 let mut utf8_byte_delta: isize = 0;
766 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
767
768 for rule_match in rule_matches {
769 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
770 content,
771 path.clone(),
772 rule_match,
773 &mut utf8_byte_delta,
774 &mut custom_index_delta,
775 need_match_content,
776 ));
777 }
778 }
779
780 fn apply_match_actions_for_string<E: Encoding>(
782 &self,
783 content: &mut String,
784 path: Path<'static>,
785 rule_match: InternalRuleMatch<E>,
786 utf8_byte_delta: &mut isize,
788
789 custom_index_delta: &mut <E>::IndexShift,
791 need_match_content: bool,
792 ) -> RuleMatch {
793 let rule = &self.rules[rule_match.rule_index];
794
795 let custom_start =
796 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
797 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
798
799 let mut matched_content_copy = None;
800
801 if need_match_content {
802 let mutated_utf8_match_start =
804 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
805 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
806
807 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
809 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
810
811 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
812 matched_content_copy = Some(matched_content.to_string());
813 }
814
815 if rule.match_action.is_mutating() {
816 let mutated_utf8_match_start =
817 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
818 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
819
820 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
822 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
823
824 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
825 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
826 let before_replacement = &matched_content[replacement.start..replacement.end];
827
828 <E>::adjust_shift(
830 custom_index_delta,
831 before_replacement,
832 &replacement.replacement,
833 );
834 *utf8_byte_delta +=
835 replacement.replacement.len() as isize - before_replacement.len() as isize;
836
837 let replacement_start = mutated_utf8_match_start + replacement.start;
838 let replacement_end = mutated_utf8_match_start + replacement.end;
839 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
840 }
841 }
842
843 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
844 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
845 + shift_offset) as usize;
846
847 let rule = &self.rules[rule_match.rule_index];
848
849 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
850 MatchStatus::NotChecked
851 } else {
852 MatchStatus::NotAvailable
853 };
854
855 RuleMatch {
856 rule_index: rule_match.rule_index,
857 path,
858 replacement_type: rule.match_action.replacement_type(),
859 start_index: custom_start,
860 end_index_exclusive: custom_end,
861 shift_offset,
862 match_value: matched_content_copy,
863 match_status,
864 keyword: rule_match.keyword,
865 }
866 }
867
868 fn sort_and_remove_overlapping_rules<E: Encoding>(
869 &self,
870 rule_matches: &mut Vec<InternalRuleMatch<E>>,
871 ) {
872 rule_matches.sort_unstable_by(|a, b| {
876 let ord = self.rules[a.rule_index]
878 .match_action
879 .is_mutating()
880 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
881 .reverse();
882
883 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
885
886 let ord = ord.then(a.len().cmp(&b.len()).reverse());
888
889 let ord = ord.then(
891 self.rules[a.rule_index]
892 .precedence
893 .cmp(&self.rules[b.rule_index].precedence)
894 .reverse(),
895 );
896
897 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
899
900 ord.reverse()
902 });
903
904 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
905
906 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
907 if self.rules[rule_match.rule_index].match_action.is_mutating() {
908 if let Some(last) = retained_rules.last()
910 && last.utf8_end > rule_match.utf8_start
911 {
912 continue;
913 }
914 } else {
915 for retained_rule in &retained_rules {
918 if retained_rule.utf8_start < rule_match.utf8_end
919 && retained_rule.utf8_end > rule_match.utf8_start
920 {
921 continue 'rule_matches;
922 }
923 }
924 };
925 retained_rules.push(rule_match);
926 }
927
928 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
930
931 *rule_matches = retained_rules;
932 }
933}
934
935impl Drop for Scanner {
936 fn drop(&mut self) {
937 let stats = &*GLOBAL_STATS;
938 stats.scanner_deletions.increment(1);
939 stats.decrement_total_scanners();
940 }
941}
942
943#[derive(Default)]
944pub struct ScannerBuilder<'a> {
945 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
946 labels: Labels,
947 scanner_features: ScannerFeatures,
948 async_scan_timeout: Duration,
949}
950
951impl ScannerBuilder<'_> {
952 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
953 ScannerBuilder {
954 rules,
955 labels: Labels::empty(),
956 scanner_features: ScannerFeatures::default(),
957 async_scan_timeout: Duration::from_secs(60 * 5),
958 }
959 }
960
961 pub fn labels(mut self, labels: Labels) -> Self {
962 self.labels = labels;
963 self
964 }
965
966 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
967 self.async_scan_timeout = duration;
968 self
969 }
970
971 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
972 self.scanner_features.add_implicit_index_wildcards = value;
973 self
974 }
975
976 pub fn with_return_matches(mut self, value: bool) -> Self {
977 self.scanner_features.return_matches = value;
978 self
979 }
980
981 pub fn with_multipass_v0(mut self, value: bool) -> Self {
985 self.scanner_features.multipass_v0_enabled = value;
986 self
987 }
988
989 pub fn build(self) -> Result<Scanner, CreateScannerError> {
990 let mut match_validators_per_type = AHashMap::new();
991
992 for rule in self.rules.iter() {
993 if let Some(match_validation_type) = &rule.get_third_party_active_checker()
994 && match_validation_type.can_create_match_validator()
995 {
996 let internal_type = match_validation_type.get_internal_match_validation_type();
997 let match_validator = match_validation_type.into_match_validator();
998 if let Ok(match_validator) = match_validator {
999 if !match_validators_per_type.contains_key(&internal_type) {
1000 match_validators_per_type.insert(internal_type, match_validator);
1001 }
1002 } else {
1003 return Err(CreateScannerError::InvalidMatchValidator(
1004 MatchValidatorCreationError::InternalError,
1005 ));
1006 }
1007 }
1008 }
1009
1010 let compiled_rules = self
1011 .rules
1012 .iter()
1013 .enumerate()
1014 .map(|(rule_index, config)| {
1015 let inner = config.convert_to_compiled_rule(rule_index, self.labels.clone())?;
1016 config.match_action.validate()?;
1017 let compiled_suppressions = match &config.suppressions {
1018 Some(s) => s.compile()?,
1019 None => None,
1020 };
1021 Ok(RootCompiledRule {
1022 inner,
1023 scope: config.scope.clone(),
1024 match_action: config.match_action.clone(),
1025 match_validation_type: config.get_third_party_active_checker().cloned(),
1026 suppressions: compiled_suppressions,
1027 precedence: config.precedence,
1028 })
1029 })
1030 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1031
1032 let mut per_scanner_data = SharedData::new();
1033
1034 compiled_rules.iter().for_each(|rule| {
1035 rule.init_per_scanner_data(&mut per_scanner_data);
1036 });
1037
1038 let scoped_ruleset = ScopedRuleSet::new(
1039 &compiled_rules
1040 .iter()
1041 .map(|rule| rule.scope.clone())
1042 .collect::<Vec<_>>(),
1043 )
1044 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1045
1046 {
1047 let stats = &*GLOBAL_STATS;
1048 stats.scanner_creations.increment(1);
1049 stats.increment_total_scanners();
1050 }
1051
1052 Ok(Scanner {
1053 rules: compiled_rules,
1054 scoped_ruleset,
1055 scanner_features: self.scanner_features,
1056 metrics: ScannerMetrics::new(&self.labels),
1057 match_validators_per_type,
1058 labels: self.labels,
1059 per_scanner_data,
1060 async_scan_timeout: self.async_scan_timeout,
1061 })
1062 }
1063}
1064
/// Visitor that runs the applicable rules on each string of an event during a scan.
struct ScannerContentVisitor<'a, E: Encoding> {
    /// Scanner whose rules are being run.
    scanner: &'a Scanner,
    /// Regex caches handed to the rules.
    regex_caches: &'a mut RegexCaches,
    /// Receives the matches found synchronously, per path.
    rule_matches: &'a mut InternalRuleMatchSet<E>,
    /// Indices of the rules to skip for this scan.
    blocked_rules: &'a Vec<usize>,
    /// Set of excluded match strings, shared across the strings of the event.
    excluded_matches: &'a mut AHashSet<String>,
    /// Data shared by the rules for the duration of this event.
    per_event_data: SharedData,
    /// Wildcarded index ranges per path, taken from the scan options.
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
    /// Receives the pending jobs of rules that started asynchronous work.
    async_jobs: &'a mut Vec<PendingRuleJob>,
    /// Identifier of the event being scanned, when the event provides one.
    event_id: Option<String>,
}
1078
1079impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1080 fn visit_content<'b>(
1081 &'b mut self,
1082 path: &Path<'a>,
1083 content: &str,
1084 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1085 exclusion_check: ExclusionCheck<'b>,
1086 ) -> Result<bool, ScannerError> {
1087 let mut path_rules_matches = vec![];
1089
1090 let mut per_string_data = SharedData::new();
1092 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1093
1094 rule_visitor.visit_rule_indices(|rule_index| {
1095 if self.blocked_rules.contains(&rule_index) {
1096 return Ok(());
1097 }
1098 let rule = &self.scanner.rules[rule_index];
1099 {
1100 if rule.inner.allow_scanner_to_exclude_namespace() {
1101 if exclusion_check.is_excluded(rule_index) {
1103 return Ok(());
1104 }
1105 }
1106 let mut emitter = |rule_match: StringMatch| {
1108 assert_ne!(
1111 rule_match.start, rule_match.end,
1112 "empty match detected on rule with index {rule_index}"
1113 );
1114 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1115 };
1116
1117 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1118
1119 rule.init_per_event_data(&mut self.per_event_data);
1121
1122 let mut ctx = StringMatchesCtx {
1123 rule_index,
1124 regex_caches: self.regex_caches,
1125 exclusion_check: &exclusion_check,
1126 excluded_matches: self.excluded_matches,
1127 match_emitter: &mut emitter,
1128 wildcard_indices: wildcard_indices_per_path,
1129 per_string_data: &mut per_string_data,
1130 per_scanner_data: &self.scanner.per_scanner_data,
1131 per_event_data: &mut self.per_event_data,
1132 event_id: self.event_id.as_deref(),
1133 };
1134
1135 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1136
1137 match async_status {
1138 RuleStatus::Done => {
1139 }
1141 RuleStatus::Pending(fut) => {
1142 self.async_jobs.push(PendingRuleJob {
1143 fut,
1144 path: path.into_static(),
1145 });
1146 }
1147 }
1148 }
1149 Ok(())
1150 })?;
1151
1152 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1157
1158 self.rule_matches
1159 .push_sync_matches(path, path_rules_matches);
1160
1161 Ok(needs_to_access_content)
1162 }
1163}
1164
/// Returns the byte offset of the character following the first character of a match, i.e.
/// the next position from which a search can resume.
///
/// Only the start of `regex_match` is used. Returns `None` when fewer than two characters
/// remain from that start.
///
/// # Panics
///
/// Panics if the match start is not on a character boundary of `content` or is out of range.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;
    content[match_start..]
        .char_indices()
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1175
1176fn is_false_positive_match(
1177 regex_match_range: (usize, usize),
1178 rule: &RegexCompiledRule,
1179 content: &str,
1180 check_excluded_keywords: bool,
1181) -> bool {
1182 if check_excluded_keywords
1183 && let Some(excluded_keywords) = &rule.excluded_keywords
1184 && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1185 {
1186 return true;
1187 }
1188
1189 if let Some(validator) = rule.validator.as_ref()
1190 && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1191 {
1192 return true;
1193 }
1194 false
1195}