1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{
28 CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
29};
30use ahash::{AHashMap, AHashSet};
31use futures::executor::block_on;
32use serde::{Deserialize, Serialize};
33use serde_with::serde_as;
34use std::ops::Deref;
35use std::pin::Pin;
36use std::sync::Arc;
37use std::time::{Duration, Instant};
38use tokio::task::JoinHandle;
39use tokio::time::timeout;
40
41pub mod config;
42pub mod error;
43pub mod metrics;
44pub mod regex_rule;
45pub mod scope;
46pub mod shared_data;
47pub mod shared_pool;
48pub mod suppression;
49
50mod internal_rule_match_set;
51#[cfg(test)]
52mod test;
53
/// Position of one match within a scanned string, expressed as a pair of offsets.
///
/// NOTE(review): presumably UTF-8 byte offsets with `end` exclusive — confirm against
/// `InternalRuleMatch::new`, which consumes this type.
#[derive(Copy, Clone)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends.
    pub end: usize,
}
59
/// Sink that receives each [`StringMatch`] a rule reports.
///
/// `T` is whatever the sink hands back to the reporting rule for each match; it defaults to `()`.
pub trait MatchEmitter<T = ()> {
    /// Reports one match to the sink and returns the sink's response.
    fn emit(&mut self, string_match: StringMatch) -> T;
}
63
64impl<F, T> MatchEmitter<T> for F
67where
68 F: FnMut(StringMatch) -> T,
69{
70 fn emit(&mut self, string_match: StringMatch) -> T {
71 (self)(string_match)
73 }
74}
75
/// Settings common to every rule, wrapped around the rule-specific configuration `T`.
///
/// When (de)serialized, the fields of `inner` are flattened into the same object as the
/// common fields.
#[serde_as]
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct RootRuleConfig<T> {
    /// Action applied to the matches of this rule.
    pub match_action: MatchAction,
    /// Scope of the rule; falls back to `Scope::default()` when absent from serialized input.
    #[serde(default)]
    pub scope: Scope,
    /// Legacy name of `third_party_active_checker`; only consulted when that field is `None`.
    #[deprecated(note = "Use `third_party_active_checker` instead")]
    match_validation_type: Option<MatchValidationType>,
    /// Match validation configuration; takes precedence over `match_validation_type`.
    third_party_active_checker: Option<MatchValidationType>,
    /// Optional suppressions, compiled when the scanner is built.
    suppressions: Option<Suppressions>,
    /// Rule-specific configuration.
    #[serde(flatten)]
    pub inner: T,
}
89
90impl<T> RootRuleConfig<T>
91where
92 T: RuleConfig + 'static,
93{
94 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
95 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
96 }
97
98 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
99 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
100 }
101}
102
103impl<T> RootRuleConfig<T> {
104 pub fn new(inner: T) -> Self {
105 #[allow(deprecated)]
106 Self {
107 match_action: MatchAction::None,
108 scope: Scope::all(),
109 match_validation_type: None,
110 third_party_active_checker: None,
111 suppressions: None,
112 inner,
113 }
114 }
115
116 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
117 #[allow(deprecated)]
118 RootRuleConfig {
119 match_action: self.match_action,
120 scope: self.scope,
121 match_validation_type: self.match_validation_type,
122 third_party_active_checker: self.third_party_active_checker,
123 suppressions: self.suppressions,
124 inner: func(self.inner),
125 }
126 }
127
128 pub fn match_action(mut self, action: MatchAction) -> Self {
129 self.match_action = action;
130 self
131 }
132
133 pub fn scope(mut self, scope: Scope) -> Self {
134 self.scope = scope;
135 self
136 }
137
138 pub fn third_party_active_checker(
139 mut self,
140 match_validation_type: MatchValidationType,
141 ) -> Self {
142 self.third_party_active_checker = Some(match_validation_type);
143 self
144 }
145
146 pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
147 self.suppressions = Some(suppressions);
148 self
149 }
150
151 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
152 #[allow(deprecated)]
153 self.third_party_active_checker
154 .as_ref()
155 .or(self.match_validation_type.as_ref())
156 }
157}
158
159impl<T> Deref for RootRuleConfig<T> {
160 type Target = T;
161
162 fn deref(&self) -> &Self::Target {
163 &self.inner
164 }
165}
/// A compiled rule together with the common settings copied from its [`RootRuleConfig`].
pub struct RootCompiledRule {
    /// The rule-specific compiled implementation.
    pub inner: Box<dyn CompiledRule>,
    /// Scope of the rule.
    pub scope: Scope,
    /// Action applied to matches of the rule.
    pub match_action: MatchAction,
    /// Match validation configuration, if any.
    pub match_validation_type: Option<MatchValidationType>,
    /// Compiled suppressions, if any.
    pub suppressions: Option<CompiledSuppressions>,
}
173
174impl RootCompiledRule {
175 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
176 self.match_validation_type
177 .as_ref()
178 .map(|x| x.get_internal_match_validation_type())
179 }
180}
181
182impl Deref for RootCompiledRule {
183 type Target = dyn CompiledRule;
184
185 fn deref(&self) -> &Self::Target {
186 self.inner.as_ref()
187 }
188}
189
/// Everything a [`CompiledRule`] receives while it looks for matches in one string.
pub struct StringMatchesCtx<'a> {
    /// Index of the rule being run; forwarded to asynchronous results by `process_async`.
    rule_index: usize,
    /// Regex caches made available to the rule.
    pub regex_caches: &'a mut RegexCaches,
    /// Exclusion information for the string currently being visited.
    pub exclusion_check: &'a ExclusionCheck<'a>,
    /// Set of strings the scanner later uses to drop matches of rules that opt in through
    /// `should_exclude_multipass_v0`.
    pub excluded_matches: &'a mut AHashSet<String>,
    /// Sink through which the rule reports synchronous matches.
    pub match_emitter: &'a mut dyn MatchEmitter,
    /// Entry of `ScanOptions::wildcarded_indices` for the current path, if there is one.
    pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,

    /// Data created fresh for each visited string.
    pub per_string_data: &'a mut SharedData,
    /// Data owned by the scanner, initialized when the scanner is built.
    pub per_scanner_data: &'a SharedData,
    /// Data created fresh for each scanned event.
    pub per_event_data: &'a mut SharedData,
    /// Identifier of the event being scanned, when the event provides one.
    pub event_id: Option<&'a str>,
}
204
205impl StringMatchesCtx<'_> {
206 pub fn process_async(
216 &self,
217 func: impl for<'a> FnOnce(
218 &'a mut AsyncStringMatchesCtx,
219 )
220 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
221 + Send
222 + 'static,
223 ) -> RuleResult {
224 let rule_index = self.rule_index;
225
226 let fut = TOKIO_RUNTIME.spawn(async move {
229 let mut ctx = AsyncStringMatchesCtx {
230 rule_matches: vec![],
231 };
232 (func)(&mut ctx).await?;
233
234 Ok(AsyncRuleInfo {
235 rule_index,
236 rule_matches: ctx.rule_matches,
237 })
238 });
239
240 Ok(RuleStatus::Pending(fut))
241 }
242}
243
/// Context handed to the asynchronous part of a rule; it accumulates the matches that part emits.
pub struct AsyncStringMatchesCtx {
    /// Matches emitted so far, in emission order.
    rule_matches: Vec<StringMatch>,
}
247
impl AsyncStringMatchesCtx {
    /// Records `string_match` as a match found by the asynchronous rule.
    pub fn emit_match(&mut self, string_match: StringMatch) {
        self.rule_matches.push(string_match);
    }
}
253
/// Outcome of asking a rule for its matches in one string.
#[must_use]
pub enum RuleStatus {
    /// The rule finished synchronously; nothing further has to be awaited.
    Done,
    /// The rule spawned asynchronous work whose result must be awaited through the handle.
    Pending(PendingRuleResult),
}
259
/// Join handle of a spawned asynchronous rule task.
pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
262
/// An asynchronous rule task paired with the path of the string it was started for.
pub struct PendingRuleJob {
    /// Handle used to await the task's result.
    fut: PendingRuleResult,
    /// Path of the string the task is scanning.
    path: Path<'static>,
}
267
/// Successful result of an asynchronous rule task.
pub struct AsyncRuleInfo {
    /// Index of the rule that produced the matches.
    rule_index: usize,
    /// Matches emitted by the task.
    rule_matches: Vec<StringMatch>,
}
272
/// Result returned by [`CompiledRule::get_string_matches`].
pub type RuleResult = Result<RuleStatus, ScannerError>;
275
/// Interface every compiled rule implements so the [`Scanner`] can run it against strings.
pub trait CompiledRule: Send + Sync {
    /// Called once per rule while the scanner is built, with the scanner-wide [`SharedData`].
    /// The default implementation does nothing.
    fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
    }

    /// Called before the rule runs on a string, with the scanner's labels and the data created
    /// for that string. The default implementation does nothing.
    fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
    }

    /// Called before the rule runs on a string, with the data created for the current event.
    /// The default implementation does nothing.
    fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
    }

    /// Looks for matches of this rule in `content`, located at `path` in the event.
    ///
    /// Returns [`RuleStatus::Done`] when finished, or [`RuleStatus::Pending`] when asynchronous
    /// work was spawned (see [`StringMatchesCtx::process_async`]).
    fn get_string_matches(
        &self,
        content: &str,
        path: &Path,
        ctx: &mut StringMatchesCtx<'_>,
    ) -> RuleResult;

    /// Whether the scanner should drop matches of this rule whose text is in the set of excluded
    /// matches (only consulted when the multipass v0 feature is enabled). Defaults to `false`.
    fn should_exclude_multipass_v0(&self) -> bool {
        false
    }

    /// Called each time a match of this rule is dropped by the multipass v0 exclusion.
    /// The default implementation does nothing.
    fn on_excluded_match_multipass_v0(&self) {
    }
}
308
309impl<T> RuleConfig for Box<T>
310where
311 T: RuleConfig + ?Sized,
312{
313 fn convert_to_compiled_rule(
314 &self,
315 rule_index: usize,
316 labels: Labels,
317 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
318 self.as_ref().convert_to_compiled_rule(rule_index, labels)
319 }
320}
321
/// Feature switches selected through [`ScannerBuilder`].
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Forwarded to `ScopedRuleSet::with_implicit_index_wildcards` when the scanner is built.
    pub add_implicit_index_wildcards: bool,
    /// Enables dropping matches found in the excluded-matches set for rules that opt in.
    pub multipass_v0_enabled: bool,
    /// When true, the matched text is copied into each returned `RuleMatch`.
    pub return_matches: bool,
    /// When true, rules rejected because their regex matches the empty string are skipped
    /// instead of failing scanner creation.
    pub skip_rules_with_regex_matching_empty_string: bool,
}
331
/// Default feature set: only multipass v0 is enabled.
impl Default for ScannerFeatures {
    fn default() -> Self {
        Self {
            add_implicit_index_wildcards: false,
            // The only feature that is on unless explicitly disabled.
            multipass_v0_enabled: true,
            return_matches: false,
            skip_rules_with_regex_matching_empty_string: false,
        }
    }
}
342
/// Per-scan options accepted by `Scanner::scan_with_options` and
/// `Scanner::scan_async_with_options`.
pub struct ScanOptions {
    /// Indices of rules that are skipped for this scan.
    pub blocked_rules_idx: Vec<usize>,
    /// Index pairs keyed by path; the entry for a string's path is handed to rules through
    /// `StringMatchesCtx::wildcard_indices`.
    pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
    /// When true, matches are run through `Scanner::validate_matches` after the scan (this also
    /// causes the matched text to be captured).
    pub validate_matches: bool,
}
353
354impl Default for ScanOptions {
355 fn default() -> Self {
356 Self {
357 blocked_rules_idx: vec![],
358 wildcarded_indices: AHashMap::new(),
359 validate_matches: false,
360 }
361 }
362}
363
/// Builder for [`ScanOptions`]; each field mirrors the option of the same name.
pub struct ScanOptionBuilder {
    /// Indices of rules to skip.
    blocked_rules_idx: Vec<usize>,
    /// Index pairs keyed by path.
    wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
    /// Whether matches are validated after the scan.
    validate_matches: bool,
}
369
370impl ScanOptionBuilder {
371 pub fn new() -> Self {
372 Self {
373 blocked_rules_idx: vec![],
374 wildcarded_indices: AHashMap::new(),
375 validate_matches: false,
376 }
377 }
378
379 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
380 self.blocked_rules_idx = blocked_rules_idx;
381 self
382 }
383
384 pub fn with_wildcarded_indices(
385 mut self,
386 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
387 ) -> Self {
388 self.wildcarded_indices = wildcarded_indices;
389 self
390 }
391
392 pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
393 self.validate_matches = validate_matches;
394 self
395 }
396
397 pub fn build(self) -> ScanOptions {
398 ScanOptions {
399 blocked_rules_idx: self.blocked_rules_idx,
400 wildcarded_indices: self.wildcarded_indices,
401 validate_matches: self.validate_matches,
402 }
403 }
404}
405
/// A set of compiled rules ready to scan events. Created through [`ScannerBuilder`].
pub struct Scanner {
    /// Compiled rules; `rule_index` values used during a scan index into this vector.
    rules: Vec<RootCompiledRule>,
    /// Rule scopes, used to visit each string of an event together with the rules to run on it.
    scoped_ruleset: ScopedRuleSet,
    /// Feature switches chosen at build time.
    scanner_features: ScannerFeatures,
    /// Counters updated while scanning.
    metrics: ScannerMetrics,
    /// Labels given to the builder; passed to rules when per-string data is initialized.
    labels: Labels,
    /// One match validator per internal validation type required by the rules.
    match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
    /// Data initialized once by the rules when the scanner is built.
    per_scanner_data: SharedData,
    /// Upper bound on the duration of an asynchronous scan.
    async_scan_timeout: Duration,
}
416
417impl Scanner {
    /// Starts building a scanner for `rules`; equivalent to [`ScannerBuilder::new`].
    pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
        ScannerBuilder::new(rules)
    }
421
422 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
427 self.scan_with_options(event, ScanOptions::default())
428 }
429
430 pub fn scan_with_options<E: Event>(
435 &self,
436 event: &mut E,
437 options: ScanOptions,
438 ) -> Result<Vec<RuleMatch>, ScannerError> {
439 block_on(self.internal_scan_with_metrics(event, options))
440 }
441
442 pub async fn scan_async<E: Event>(
446 &self,
447 event: &mut E,
448 ) -> Result<Vec<RuleMatch>, ScannerError> {
449 self.scan_async_with_options(event, ScanOptions::default())
450 .await
451 }
452
453 pub async fn scan_async_with_options<E: Event>(
454 &self,
455 event: &mut E,
456 options: ScanOptions,
457 ) -> Result<Vec<RuleMatch>, ScannerError> {
458 let fut = self.internal_scan_with_metrics(event, options);
459
460 let timeout = {
463 let _tokio_guard = TOKIO_RUNTIME.enter();
464 timeout(self.async_scan_timeout, fut)
465 };
466
467 timeout.await.unwrap_or(Err(ScannerError::Transient(
468 "Async scan timeout".to_string(),
469 )))
470 }
471
472 fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
473 self.metrics
475 .duration_ns
476 .increment(start.elapsed().as_nanos() as u64);
477 self.metrics.num_scanned_events.increment(1);
479 self.metrics
481 .match_count
482 .increment(output_rule_matches.len() as u64);
483 }
484
485 async fn internal_scan_with_metrics<E: Event>(
486 &self,
487 event: &mut E,
488 options: ScanOptions,
489 ) -> Result<Vec<RuleMatch>, ScannerError> {
490 let start = Instant::now();
491 let result = self.internal_scan(event, options).await;
492 match &result {
493 Ok(rule_matches) => {
494 self.record_metrics(rule_matches, start);
495 }
496 Err(_) => {
497 self.record_metrics(&[], start);
498 }
499 }
500 result
501 }
502
503 fn process_rule_matches<E: Event>(
504 &self,
505 event: &mut E,
506 rule_matches: InternalRuleMatchSet<E::Encoding>,
507 excluded_matches: AHashSet<String>,
508 output_rule_matches: &mut Vec<RuleMatch>,
509 need_match_content: bool,
510 ) {
511 if rule_matches.is_empty() {
512 return;
513 }
514 access_regex_caches(|regex_caches| {
515 for (path, mut rule_matches) in rule_matches.into_iter() {
516 event.visit_string_mut(&path, |content| {
518 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
520
521 <<E as Event>::Encoding>::calculate_indices(
522 content,
523 rule_matches.iter_mut().map(
524 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
525 utf8_start: rule_match.utf8_start,
526 utf8_end: rule_match.utf8_end,
527 custom_start: &mut rule_match.custom_start,
528 custom_end: &mut rule_match.custom_end,
529 },
530 ),
531 );
532
533 if self.scanner_features.multipass_v0_enabled {
534 rule_matches.retain(|rule_match| {
537 if self.rules[rule_match.rule_index]
538 .inner
539 .should_exclude_multipass_v0()
540 {
541 let is_false_positive = excluded_matches
542 .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
543 if is_false_positive && self.scanner_features.multipass_v0_enabled {
544 self.rules[rule_match.rule_index]
545 .on_excluded_match_multipass_v0();
546 }
547 !is_false_positive
548 } else {
549 true
550 }
551 });
552 }
553
554 self.suppress_matches::<E::Encoding>(&mut rule_matches, content, regex_caches);
555
556 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
557
558 let will_mutate = rule_matches.iter().any(|rule_match| {
559 self.rules[rule_match.rule_index].match_action.is_mutating()
560 });
561
562 self.apply_match_actions(
563 content,
564 &path,
565 &mut rule_matches,
566 output_rule_matches,
567 need_match_content,
568 );
569
570 will_mutate
571 });
572 }
573 });
574 }
575
576 async fn internal_scan<E: Event>(
577 &self,
578 event: &mut E,
579 options: ScanOptions,
580 ) -> Result<Vec<RuleMatch>, ScannerError> {
581 let need_match_content = self.scanner_features.return_matches || options.validate_matches;
584 let mut rule_matches = InternalRuleMatchSet::new();
586 let mut excluded_matches = AHashSet::new();
587 let mut async_jobs = vec![];
588
589 access_regex_caches(|regex_caches| {
590 self.scoped_ruleset.visit_string_rule_combinations(
591 event,
592 ScannerContentVisitor {
593 scanner: self,
594 regex_caches,
595 rule_matches: &mut rule_matches,
596 blocked_rules: &options.blocked_rules_idx,
597 excluded_matches: &mut excluded_matches,
598 per_event_data: SharedData::new(),
599 wildcarded_indexes: &options.wildcarded_indices,
600 async_jobs: &mut async_jobs,
601 event_id: event.get_id().map(|s| s.to_string()),
602 },
603 )
604 })?;
605
606 for job in async_jobs {
609 let rule_info = job.fut.await.unwrap()?;
610 rule_matches.push_async_matches(
611 &job.path,
612 rule_info
613 .rule_matches
614 .into_iter()
615 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
616 );
617 }
618
619 let mut output_rule_matches = vec![];
620
621 self.process_rule_matches(
622 event,
623 rule_matches,
624 excluded_matches,
625 &mut output_rule_matches,
626 need_match_content,
627 );
628
629 if options.validate_matches {
630 self.validate_matches(&mut output_rule_matches);
631 }
632
633 Ok(output_rule_matches)
634 }
635
636 pub fn suppress_matches<E: Encoding>(
637 &self,
638 rule_matches: &mut Vec<InternalRuleMatch<E>>,
639 content: &str,
640 regex_caches: &mut RegexCaches,
641 ) {
642 rule_matches.retain(|rule_match| {
643 if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
644 let match_should_be_suppressed = suppressions.should_match_be_suppressed(
645 &content[rule_match.utf8_start..rule_match.utf8_end],
646 regex_caches,
647 );
648
649 if match_should_be_suppressed {
650 self.metrics.suppressed_match_count.increment(1);
651 }
652 !match_should_be_suppressed
653 } else {
654 true
655 }
656 });
657 }
658
659 pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
660 let mut match_validator_rule_match_per_type = AHashMap::new();
662
663 let mut validated_rule_matches = vec![];
664
665 for mut rule_match in rule_matches.drain(..) {
666 let rule = &self.rules[rule_match.rule_index];
667 if let Some(match_validation_type) = rule.internal_match_validation_type() {
668 match_validator_rule_match_per_type
669 .entry(match_validation_type)
670 .or_insert_with(Vec::new)
671 .push(rule_match)
672 } else {
673 rule_match.match_status.merge(MatchStatus::NotAvailable);
675 validated_rule_matches.push(rule_match);
676 }
677 }
678
679 RAYON_THREAD_POOL.install(|| {
680 use rayon::prelude::*;
681
682 match_validator_rule_match_per_type.par_iter_mut().for_each(
683 |(match_validation_type, matches_per_type)| {
684 let match_validator = self.match_validators_per_type.get(match_validation_type);
685 if let Some(match_validator) = match_validator {
686 match_validator
687 .as_ref()
688 .validate(matches_per_type, &self.rules)
689 }
690 },
691 );
692 });
693
694 for (_, mut matches) in match_validator_rule_match_per_type {
696 validated_rule_matches.append(&mut matches);
697 }
698
699 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
701 *rule_matches = validated_rule_matches;
702 }
703
704 fn apply_match_actions<E: Encoding>(
707 &self,
708 content: &mut String,
709 path: &Path<'static>,
710 rule_matches: &mut [InternalRuleMatch<E>],
711 output_rule_matches: &mut Vec<RuleMatch>,
712 need_match_content: bool,
713 ) {
714 let mut utf8_byte_delta: isize = 0;
715 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
716
717 for rule_match in rule_matches {
718 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
719 content,
720 path.clone(),
721 rule_match,
722 &mut utf8_byte_delta,
723 &mut custom_index_delta,
724 need_match_content,
725 ));
726 }
727 }
728
729 fn apply_match_actions_for_string<E: Encoding>(
731 &self,
732 content: &mut String,
733 path: Path<'static>,
734 rule_match: &InternalRuleMatch<E>,
735 utf8_byte_delta: &mut isize,
737
738 custom_index_delta: &mut <E>::IndexShift,
740 need_match_content: bool,
741 ) -> RuleMatch {
742 let rule = &self.rules[rule_match.rule_index];
743
744 let custom_start =
745 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
746 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
747
748 let mut matched_content_copy = None;
749
750 if need_match_content {
751 let mutated_utf8_match_start =
753 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
754 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
755
756 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
758 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
759
760 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
761 matched_content_copy = Some(matched_content.to_string());
762 }
763
764 if rule.match_action.is_mutating() {
765 let mutated_utf8_match_start =
766 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
767 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
768
769 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
771 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
772
773 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
774 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
775 let before_replacement = &matched_content[replacement.start..replacement.end];
776
777 <E>::adjust_shift(
779 custom_index_delta,
780 before_replacement,
781 &replacement.replacement,
782 );
783 *utf8_byte_delta +=
784 replacement.replacement.len() as isize - before_replacement.len() as isize;
785
786 let replacement_start = mutated_utf8_match_start + replacement.start;
787 let replacement_end = mutated_utf8_match_start + replacement.end;
788 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
789 }
790 }
791
792 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
793 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
794 + shift_offset) as usize;
795
796 let rule = &self.rules[rule_match.rule_index];
797
798 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
799 MatchStatus::NotChecked
800 } else {
801 MatchStatus::NotAvailable
802 };
803
804 RuleMatch {
805 rule_index: rule_match.rule_index,
806 path,
807 replacement_type: rule.match_action.replacement_type(),
808 start_index: custom_start,
809 end_index_exclusive: custom_end,
810 shift_offset,
811 match_value: matched_content_copy,
812 match_status,
813 }
814 }
815
816 fn sort_and_remove_overlapping_rules<E: Encoding>(
817 &self,
818 rule_matches: &mut Vec<InternalRuleMatch<E>>,
819 ) {
820 rule_matches.sort_unstable_by(|a, b| {
824 let ord = self.rules[a.rule_index]
826 .match_action
827 .is_mutating()
828 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
829 .reverse();
830
831 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
833
834 let ord = ord.then(a.len().cmp(&b.len()).reverse());
836
837 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
839
840 ord.reverse()
842 });
843
844 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
845
846 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
847 if self.rules[rule_match.rule_index].match_action.is_mutating() {
848 if let Some(last) = retained_rules.last()
850 && last.utf8_end > rule_match.utf8_start
851 {
852 continue;
853 }
854 } else {
855 for retained_rule in &retained_rules {
858 if retained_rule.utf8_start < rule_match.utf8_end
859 && retained_rule.utf8_end > rule_match.utf8_start
860 {
861 continue 'rule_matches;
862 }
863 }
864 };
865 retained_rules.push(rule_match);
866 }
867
868 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
870
871 *rule_matches = retained_rules;
872 }
873}
874
875impl Drop for Scanner {
876 fn drop(&mut self) {
877 let stats = &*GLOBAL_STATS;
878 stats.scanner_deletions.increment(1);
879 stats.decrement_total_scanners();
880 }
881}
882
883#[derive(Default)]
884pub struct ScannerBuilder<'a> {
885 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
886 labels: Labels,
887 scanner_features: ScannerFeatures,
888 async_scan_timeout: Duration,
889}
890
891impl ScannerBuilder<'_> {
892 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
893 ScannerBuilder {
894 rules,
895 labels: Labels::empty(),
896 scanner_features: ScannerFeatures::default(),
897 async_scan_timeout: Duration::from_secs(60 * 5),
898 }
899 }
900
901 pub fn labels(mut self, labels: Labels) -> Self {
902 self.labels = labels;
903 self
904 }
905
906 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
907 self.async_scan_timeout = duration;
908 self
909 }
910
911 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
912 self.scanner_features.add_implicit_index_wildcards = value;
913 self
914 }
915
916 pub fn with_return_matches(mut self, value: bool) -> Self {
917 self.scanner_features.return_matches = value;
918 self
919 }
920
921 pub fn with_multipass_v0(mut self, value: bool) -> Self {
925 self.scanner_features.multipass_v0_enabled = value;
926 self
927 }
928
929 pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
930 self.scanner_features
931 .skip_rules_with_regex_matching_empty_string = value;
932 self
933 }
934
935 pub fn build(self) -> Result<Scanner, CreateScannerError> {
936 let mut match_validators_per_type = AHashMap::new();
937
938 for rule in self.rules.iter() {
939 if let Some(match_validation_type) = &rule.get_third_party_active_checker()
940 && match_validation_type.can_create_match_validator()
941 {
942 let internal_type = match_validation_type.get_internal_match_validation_type();
943 let match_validator = match_validation_type.into_match_validator();
944 if let Ok(match_validator) = match_validator {
945 if !match_validators_per_type.contains_key(&internal_type) {
946 match_validators_per_type.insert(internal_type, match_validator);
947 }
948 } else {
949 return Err(CreateScannerError::InvalidMatchValidator(
950 MatchValidatorCreationError::InternalError,
951 ));
952 }
953 }
954 }
955
956 let compiled_rules = self
957 .rules
958 .iter()
959 .enumerate()
960 .filter_map(|(rule_index, config)| {
961 let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
962 Ok(inner) => Ok(inner),
963 Err(err) => {
964 if self
965 .scanner_features
966 .skip_rules_with_regex_matching_empty_string
967 && err
968 == CreateScannerError::InvalidRegex(
969 RegexValidationError::MatchesEmptyString,
970 )
971 {
972 #[allow(clippy::print_stdout)]
974 {
975 println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
976 }
977 return None;
978 } else {
979 Err(err)
980 }
981 }
982 };
983 Some((config, inner))
984 })
985 .map(|(config, inner)| {
986 config.match_action.validate()?;
987 let compiled_suppressions = match &config.suppressions {
988 Some(s) => s.compile()?,
989 None => None,
990 };
991 Ok(RootCompiledRule {
992 inner: inner?,
993 scope: config.scope.clone(),
994 match_action: config.match_action.clone(),
995 match_validation_type: config.get_third_party_active_checker().cloned(),
996 suppressions: compiled_suppressions,
997 })
998 })
999 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
1000
1001 let mut per_scanner_data = SharedData::new();
1002
1003 compiled_rules.iter().for_each(|rule| {
1004 rule.init_per_scanner_data(&mut per_scanner_data);
1005 });
1006
1007 let scoped_ruleset = ScopedRuleSet::new(
1008 &compiled_rules
1009 .iter()
1010 .map(|rule| rule.scope.clone())
1011 .collect::<Vec<_>>(),
1012 )
1013 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
1014
1015 {
1016 let stats = &*GLOBAL_STATS;
1017 stats.scanner_creations.increment(1);
1018 stats.increment_total_scanners();
1019 }
1020
1021 Ok(Scanner {
1022 rules: compiled_rules,
1023 scoped_ruleset,
1024 scanner_features: self.scanner_features,
1025 metrics: ScannerMetrics::new(&self.labels),
1026 match_validators_per_type,
1027 labels: self.labels,
1028 per_scanner_data,
1029 async_scan_timeout: self.async_scan_timeout,
1030 })
1031 }
1032}
1033
/// Visitor driven by the scoped rule set: it runs the applicable rules on each visited string
/// and collects what they report.
struct ScannerContentVisitor<'a, E: Encoding> {
    /// Scanner whose rules are being run.
    scanner: &'a Scanner,
    /// Regex caches handed to the rules.
    regex_caches: &'a mut RegexCaches,
    /// Destination for the matches reported synchronously.
    rule_matches: &'a mut InternalRuleMatchSet<E>,
    /// Indices of the rules to skip (`ScanOptions::blocked_rules_idx`).
    blocked_rules: &'a Vec<usize>,
    /// Set of excluded strings, shared with the rules through `StringMatchesCtx`.
    excluded_matches: &'a mut AHashSet<String>,
    /// Data shared by all rules for the duration of the current event.
    per_event_data: SharedData,
    /// Per-path index pairs (`ScanOptions::wildcarded_indices`).
    wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
    /// Asynchronous rule tasks spawned so far for this event.
    async_jobs: &'a mut Vec<PendingRuleJob>,
    /// Identifier of the event being scanned, if it has one.
    event_id: Option<String>,
}
1047
1048impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1049 fn visit_content<'b>(
1050 &'b mut self,
1051 path: &Path<'a>,
1052 content: &str,
1053 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1054 exclusion_check: ExclusionCheck<'b>,
1055 ) -> Result<bool, ScannerError> {
1056 let mut path_rules_matches = vec![];
1058
1059 let mut per_string_data = SharedData::new();
1061 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1062
1063 rule_visitor.visit_rule_indices(|rule_index| {
1064 if self.blocked_rules.contains(&rule_index) {
1065 return Ok(());
1066 }
1067 let rule = &self.scanner.rules[rule_index];
1068 {
1069 let mut emitter = |rule_match: StringMatch| {
1071 assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1074 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1075 };
1076
1077 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1078
1079 rule.init_per_event_data(&mut self.per_event_data);
1081
1082 let mut ctx = StringMatchesCtx {
1083 rule_index,
1084 regex_caches: self.regex_caches,
1085 exclusion_check: &exclusion_check,
1086 excluded_matches: self.excluded_matches,
1087 match_emitter: &mut emitter,
1088 wildcard_indices: wildcard_indices_per_path,
1089 per_string_data: &mut per_string_data,
1090 per_scanner_data: &self.scanner.per_scanner_data,
1091 per_event_data: &mut self.per_event_data,
1092 event_id: self.event_id.as_deref(),
1093 };
1094
1095 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1096
1097 match async_status {
1098 RuleStatus::Done => {
1099 }
1101 RuleStatus::Pending(fut) => {
1102 self.async_jobs.push(PendingRuleJob {
1103 fut,
1104 path: path.into_static(),
1105 });
1106 }
1107 }
1108 }
1109 Ok(())
1110 })?;
1111
1112 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1117
1118 self.rule_matches
1119 .push_sync_matches(path, path_rules_matches);
1120
1121 Ok(needs_to_access_content)
1122 }
1123}
1124
/// Returns the byte offset of the character that follows the first character of a match, i.e.
/// the position one character past `regex_match.0`.
///
/// Only the start of `regex_match` is used. Returns `None` when fewer than two characters
/// remain from that start, or when the start is out of bounds or not on a character boundary
/// of `content`.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _match_end) = regex_match;
    // `get` rejects an invalid start instead of panicking like a plain slice index would.
    let remaining = content.get(match_start..)?;
    remaining
        .char_indices()
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1135
1136fn is_false_positive_match(
1137 regex_match_range: (usize, usize),
1138 rule: &RegexCompiledRule,
1139 content: &str,
1140 check_excluded_keywords: bool,
1141) -> bool {
1142 if check_excluded_keywords
1143 && let Some(excluded_keywords) = &rule.excluded_keywords
1144 && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1145 {
1146 return true;
1147 }
1148
1149 if let Some(validator) = rule.validator.as_ref()
1150 && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1151 {
1152 return true;
1153 }
1154 false
1155}