1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, SuppressionValidationError, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{
28 CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
29};
30use ahash::{AHashMap, AHashSet};
31use futures::executor::block_on;
32use serde::{Deserialize, Serialize};
33use serde_with::serde_as;
34use std::ops::Deref;
35use std::pin::Pin;
36use std::sync::Arc;
37use std::time::{Duration, Instant};
38use tokio::task::JoinHandle;
39use tokio::time::timeout;
40
41pub mod config;
42pub mod error;
43pub mod metrics;
44pub mod regex_rule;
45pub mod scope;
46pub mod shared_data;
47pub mod shared_pool;
48pub mod suppression;
49
50mod internal_rule_match_set;
51#[cfg(test)]
52mod test;
53
/// Location of a single match reported by a rule, expressed as a pair of offsets.
///
/// NOTE(review): the scanner later slices the scanned content with the offsets stored in the
/// derived `InternalRuleMatch`, so these are presumably UTF-8 byte offsets with `end`
/// exclusive — confirm against `InternalRuleMatch::new`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends.
    pub end: usize,
}
59
60pub trait MatchEmitter<T = ()> {
61 fn emit(&mut self, string_match: StringMatch) -> T;
62}
63
64impl<F, T> MatchEmitter<T> for F
67where
68 F: FnMut(StringMatch) -> T,
69{
70 fn emit(&mut self, string_match: StringMatch) -> T {
71 (self)(string_match)
73 }
74}
75
76#[serde_as]
77#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
78pub struct RootRuleConfig<T> {
79 pub match_action: MatchAction,
80 #[serde(default)]
81 pub scope: Scope,
82 #[deprecated(note = "Use `third_party_active_checker` instead")]
83 match_validation_type: Option<MatchValidationType>,
84 third_party_active_checker: Option<MatchValidationType>,
85 suppressions: Option<Suppressions>,
86 #[serde(flatten)]
87 pub inner: T,
88}
89
90impl<T> RootRuleConfig<T>
91where
92 T: RuleConfig + 'static,
93{
94 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
95 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
96 }
97
98 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
99 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
100 }
101}
102
103impl<T> RootRuleConfig<T> {
104 pub fn new(inner: T) -> Self {
105 #[allow(deprecated)]
106 Self {
107 match_action: MatchAction::None,
108 scope: Scope::all(),
109 match_validation_type: None,
110 third_party_active_checker: None,
111 suppressions: None,
112 inner,
113 }
114 }
115
116 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
117 #[allow(deprecated)]
118 RootRuleConfig {
119 match_action: self.match_action,
120 scope: self.scope,
121 match_validation_type: self.match_validation_type,
122 third_party_active_checker: self.third_party_active_checker,
123 suppressions: self.suppressions,
124 inner: func(self.inner),
125 }
126 }
127
128 pub fn match_action(mut self, action: MatchAction) -> Self {
129 self.match_action = action;
130 self
131 }
132
133 pub fn scope(mut self, scope: Scope) -> Self {
134 self.scope = scope;
135 self
136 }
137
138 pub fn third_party_active_checker(
139 mut self,
140 match_validation_type: MatchValidationType,
141 ) -> Self {
142 self.third_party_active_checker = Some(match_validation_type);
143 self
144 }
145
146 pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
147 self.suppressions = Some(suppressions);
148 self
149 }
150
151 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
152 #[allow(deprecated)]
153 self.third_party_active_checker
154 .as_ref()
155 .or(self.match_validation_type.as_ref())
156 }
157}
158
159impl<T> Deref for RootRuleConfig<T> {
160 type Target = T;
161
162 fn deref(&self) -> &Self::Target {
163 &self.inner
164 }
165}
166pub struct RootCompiledRule {
167 pub inner: Box<dyn CompiledRule>,
168 pub scope: Scope,
169 pub match_action: MatchAction,
170 pub match_validation_type: Option<MatchValidationType>,
171 pub suppressions: Option<CompiledSuppressions>,
172}
173
174impl RootCompiledRule {
175 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
176 self.match_validation_type
177 .as_ref()
178 .map(|x| x.get_internal_match_validation_type())
179 }
180}
181
182impl Deref for RootCompiledRule {
183 type Target = dyn CompiledRule;
184
185 fn deref(&self) -> &Self::Target {
186 self.inner.as_ref()
187 }
188}
189
190pub struct StringMatchesCtx<'a> {
191 rule_index: usize,
192 pub regex_caches: &'a mut RegexCaches,
193 pub exclusion_check: &'a ExclusionCheck<'a>,
194 pub excluded_matches: &'a mut AHashSet<String>,
195 pub match_emitter: &'a mut dyn MatchEmitter,
196 pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
197
198 pub per_string_data: &'a mut SharedData,
200 pub per_scanner_data: &'a SharedData,
201 pub per_event_data: &'a mut SharedData,
202}
203
204impl StringMatchesCtx<'_> {
205 pub fn process_async(
215 &self,
216 func: impl for<'a> FnOnce(
217 &'a mut AsyncStringMatchesCtx,
218 )
219 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
220 + Send
221 + 'static,
222 ) -> RuleResult {
223 let rule_index = self.rule_index;
224
225 let fut = TOKIO_RUNTIME.spawn(async move {
228 let mut ctx = AsyncStringMatchesCtx {
229 rule_matches: vec![],
230 };
231 (func)(&mut ctx).await?;
232
233 Ok(AsyncRuleInfo {
234 rule_index,
235 rule_matches: ctx.rule_matches,
236 })
237 });
238
239 Ok(RuleStatus::Pending(fut))
240 }
241}
242
243pub struct AsyncStringMatchesCtx {
244 rule_matches: Vec<StringMatch>,
245}
246
247impl AsyncStringMatchesCtx {
248 pub fn emit_match(&mut self, string_match: StringMatch) {
249 self.rule_matches.push(string_match);
250 }
251}
252
253#[must_use]
254pub enum RuleStatus {
255 Done,
256 Pending(PendingRuleResult),
257}
258
259pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
261
262pub struct PendingRuleJob {
263 fut: PendingRuleResult,
264 path: Path<'static>,
265}
266
267pub struct AsyncRuleInfo {
268 rule_index: usize,
269 rule_matches: Vec<StringMatch>,
270}
271
272pub type RuleResult = Result<RuleStatus, ScannerError>;
274
275pub trait CompiledRule: Send + Sync {
277 fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
278 }
280
281 fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
282 }
284
285 fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
286 }
288
289 fn get_string_matches(
290 &self,
291 content: &str,
292 path: &Path,
293 ctx: &mut StringMatchesCtx<'_>,
294 ) -> RuleResult;
295
296 fn should_exclude_multipass_v0(&self) -> bool {
299 false
301 }
302
303 fn on_excluded_match_multipass_v0(&self) {
304 }
306}
307
308impl<T> RuleConfig for Box<T>
309where
310 T: RuleConfig + ?Sized,
311{
312 fn convert_to_compiled_rule(
313 &self,
314 rule_index: usize,
315 labels: Labels,
316 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
317 self.as_ref().convert_to_compiled_rule(rule_index, labels)
318 }
319}
320
/// Feature switches fixed when a scanner is built.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Forwarded to `ScopedRuleSet::with_implicit_index_wildcards`.
    pub add_implicit_index_wildcards: bool,
    /// Enables dropping matches whose text is in the scan's set of excluded matches.
    pub multipass_v0_enabled: bool,
    /// When set, the matched text is copied into each returned `RuleMatch`.
    pub return_matches: bool,
    /// When set, rules rejected because their regex matches the empty string are skipped
    /// instead of failing scanner creation.
    pub skip_rules_with_regex_matching_empty_string: bool,
}
330
331impl Default for ScannerFeatures {
332 fn default() -> Self {
333 Self {
334 add_implicit_index_wildcards: false,
335 multipass_v0_enabled: true,
336 return_matches: false,
337 skip_rules_with_regex_matching_empty_string: false,
338 }
339 }
340}
341
342pub struct ScanOptions {
343 pub blocked_rules_idx: Vec<usize>,
346 pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
348 pub validate_matches: bool,
351}
352
353impl Default for ScanOptions {
354 fn default() -> Self {
355 Self {
356 blocked_rules_idx: vec![],
357 wildcarded_indices: AHashMap::new(),
358 validate_matches: false,
359 }
360 }
361}
362
363pub struct ScanOptionBuilder {
364 blocked_rules_idx: Vec<usize>,
365 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
366 validate_matches: bool,
367}
368
369impl ScanOptionBuilder {
370 pub fn new() -> Self {
371 Self {
372 blocked_rules_idx: vec![],
373 wildcarded_indices: AHashMap::new(),
374 validate_matches: false,
375 }
376 }
377
378 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
379 self.blocked_rules_idx = blocked_rules_idx;
380 self
381 }
382
383 pub fn with_wildcarded_indices(
384 mut self,
385 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
386 ) -> Self {
387 self.wildcarded_indices = wildcarded_indices;
388 self
389 }
390
391 pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
392 self.validate_matches = validate_matches;
393 self
394 }
395
396 pub fn build(self) -> ScanOptions {
397 ScanOptions {
398 blocked_rules_idx: self.blocked_rules_idx,
399 wildcarded_indices: self.wildcarded_indices,
400 validate_matches: self.validate_matches,
401 }
402 }
403}
404
405pub struct Scanner {
406 rules: Vec<RootCompiledRule>,
407 scoped_ruleset: ScopedRuleSet,
408 scanner_features: ScannerFeatures,
409 metrics: ScannerMetrics,
410 labels: Labels,
411 match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
412 per_scanner_data: SharedData,
413 async_scan_timeout: Duration,
414}
415
416impl Scanner {
417 pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
418 ScannerBuilder::new(rules)
419 }
420
421 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
426 self.scan_with_options(event, ScanOptions::default())
427 }
428
429 pub fn scan_with_options<E: Event>(
434 &self,
435 event: &mut E,
436 options: ScanOptions,
437 ) -> Result<Vec<RuleMatch>, ScannerError> {
438 block_on(self.internal_scan_with_metrics(event, options))
439 }
440
441 pub async fn scan_async<E: Event>(
445 &self,
446 event: &mut E,
447 ) -> Result<Vec<RuleMatch>, ScannerError> {
448 self.scan_async_with_options(event, ScanOptions::default())
449 .await
450 }
451
452 pub async fn scan_async_with_options<E: Event>(
453 &self,
454 event: &mut E,
455 options: ScanOptions,
456 ) -> Result<Vec<RuleMatch>, ScannerError> {
457 let fut = self.internal_scan_with_metrics(event, options);
458
459 let timeout = {
462 let _tokio_guard = TOKIO_RUNTIME.enter();
463 timeout(self.async_scan_timeout, fut)
464 };
465
466 timeout.await.unwrap_or(Err(ScannerError::Transient(
467 "Async scan timeout".to_string(),
468 )))
469 }
470
471 fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
472 self.metrics
474 .duration_ns
475 .increment(start.elapsed().as_nanos() as u64);
476 self.metrics.num_scanned_events.increment(1);
478 self.metrics
480 .match_count
481 .increment(output_rule_matches.len() as u64);
482 }
483
484 async fn internal_scan_with_metrics<E: Event>(
485 &self,
486 event: &mut E,
487 options: ScanOptions,
488 ) -> Result<Vec<RuleMatch>, ScannerError> {
489 let start = Instant::now();
490 let result = self.internal_scan(event, options).await;
491 match &result {
492 Ok(rule_matches) => {
493 self.record_metrics(rule_matches, start);
494 }
495 Err(_) => {
496 self.record_metrics(&[], start);
497 }
498 }
499 result
500 }
501
502 async fn internal_scan<E: Event>(
503 &self,
504 event: &mut E,
505 options: ScanOptions,
506 ) -> Result<Vec<RuleMatch>, ScannerError> {
507 let need_match_content = self.scanner_features.return_matches || options.validate_matches;
510 let mut rule_matches = InternalRuleMatchSet::new();
512 let mut excluded_matches = AHashSet::new();
513 let mut async_jobs = vec![];
514
515 access_regex_caches(|regex_caches| {
516 self.scoped_ruleset.visit_string_rule_combinations(
517 event,
518 ScannerContentVisitor {
519 scanner: self,
520 regex_caches,
521 rule_matches: &mut rule_matches,
522 blocked_rules: &options.blocked_rules_idx,
523 excluded_matches: &mut excluded_matches,
524 per_event_data: SharedData::new(),
525 wildcarded_indexes: &options.wildcarded_indices,
526 async_jobs: &mut async_jobs,
527 },
528 )
529 })?;
530
531 for job in async_jobs {
534 let rule_info = job.fut.await.unwrap()?;
535 rule_matches.push_async_matches(
536 &job.path,
537 rule_info
538 .rule_matches
539 .into_iter()
540 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
541 );
542 }
543
544 let mut output_rule_matches = vec![];
545
546 for (path, mut rule_matches) in rule_matches.into_iter() {
547 event.visit_string_mut(&path, |content| {
549 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
551
552 <<E as Event>::Encoding>::calculate_indices(
553 content,
554 rule_matches.iter_mut().map(
555 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
556 utf8_start: rule_match.utf8_start,
557 utf8_end: rule_match.utf8_end,
558 custom_start: &mut rule_match.custom_start,
559 custom_end: &mut rule_match.custom_end,
560 },
561 ),
562 );
563
564 if self.scanner_features.multipass_v0_enabled {
565 rule_matches.retain(|rule_match| {
568 if self.rules[rule_match.rule_index]
569 .inner
570 .should_exclude_multipass_v0()
571 {
572 let is_false_positive = excluded_matches
573 .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
574 if is_false_positive && self.scanner_features.multipass_v0_enabled {
575 self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
576 }
577 !is_false_positive
578 } else {
579 true
580 }
581 });
582 }
583
584 self.suppress_matches::<E::Encoding>(&mut rule_matches, content);
585
586 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
587
588 let will_mutate = rule_matches
589 .iter()
590 .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
591
592 self.apply_match_actions(
593 content,
594 &path,
595 &mut rule_matches,
596 &mut output_rule_matches,
597 need_match_content,
598 );
599
600 will_mutate
601 });
602 }
603
604 if options.validate_matches {
605 self.validate_matches(&mut output_rule_matches);
606 }
607
608 Ok(output_rule_matches)
609 }
610
611 pub fn suppress_matches<E: Encoding>(
612 &self,
613 rule_matches: &mut Vec<InternalRuleMatch<E>>,
614 content: &str,
615 ) {
616 rule_matches.retain(|rule_match| {
617 if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
618 let match_should_be_suppressed = suppressions.should_match_be_suppressed(
619 &content[rule_match.utf8_start..rule_match.utf8_end],
620 );
621 if match_should_be_suppressed {
622 self.metrics.suppressed_match_count.increment(1);
623 }
624 !match_should_be_suppressed
625 } else {
626 true
627 }
628 });
629 }
630
631 pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
632 let mut match_validator_rule_match_per_type = AHashMap::new();
634
635 let mut validated_rule_matches = vec![];
636
637 for mut rule_match in rule_matches.drain(..) {
638 let rule = &self.rules[rule_match.rule_index];
639 if let Some(match_validation_type) = rule.internal_match_validation_type() {
640 match_validator_rule_match_per_type
641 .entry(match_validation_type)
642 .or_insert_with(Vec::new)
643 .push(rule_match)
644 } else {
645 rule_match.match_status.merge(MatchStatus::NotAvailable);
647 validated_rule_matches.push(rule_match);
648 }
649 }
650
651 RAYON_THREAD_POOL.install(|| {
652 use rayon::prelude::*;
653
654 match_validator_rule_match_per_type.par_iter_mut().for_each(
655 |(match_validation_type, matches_per_type)| {
656 let match_validator = self.match_validators_per_type.get(match_validation_type);
657 if let Some(match_validator) = match_validator {
658 match_validator
659 .as_ref()
660 .validate(matches_per_type, &self.rules)
661 }
662 },
663 );
664 });
665
666 for (_, mut matches) in match_validator_rule_match_per_type {
668 validated_rule_matches.append(&mut matches);
669 }
670
671 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
673 *rule_matches = validated_rule_matches;
674 }
675
676 fn apply_match_actions<E: Encoding>(
679 &self,
680 content: &mut String,
681 path: &Path<'static>,
682 rule_matches: &mut [InternalRuleMatch<E>],
683 output_rule_matches: &mut Vec<RuleMatch>,
684 need_match_content: bool,
685 ) {
686 let mut utf8_byte_delta: isize = 0;
687 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
688
689 for rule_match in rule_matches {
690 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
691 content,
692 path.clone(),
693 rule_match,
694 &mut utf8_byte_delta,
695 &mut custom_index_delta,
696 need_match_content,
697 ));
698 }
699 }
700
701 fn apply_match_actions_for_string<E: Encoding>(
703 &self,
704 content: &mut String,
705 path: Path<'static>,
706 rule_match: &InternalRuleMatch<E>,
707 utf8_byte_delta: &mut isize,
709
710 custom_index_delta: &mut <E>::IndexShift,
712 need_match_content: bool,
713 ) -> RuleMatch {
714 let rule = &self.rules[rule_match.rule_index];
715
716 let custom_start =
717 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
718 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
719
720 let mut matched_content_copy = None;
721
722 if need_match_content {
723 let mutated_utf8_match_start =
725 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
726 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
727
728 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
730 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
731
732 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
733 matched_content_copy = Some(matched_content.to_string());
734 }
735
736 if rule.match_action.is_mutating() {
737 let mutated_utf8_match_start =
738 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
739 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
740
741 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
743 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
744
745 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
746 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
747 let before_replacement = &matched_content[replacement.start..replacement.end];
748
749 <E>::adjust_shift(
751 custom_index_delta,
752 before_replacement,
753 &replacement.replacement,
754 );
755 *utf8_byte_delta +=
756 replacement.replacement.len() as isize - before_replacement.len() as isize;
757
758 let replacement_start = mutated_utf8_match_start + replacement.start;
759 let replacement_end = mutated_utf8_match_start + replacement.end;
760 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
761 }
762 }
763
764 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
765 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
766 + shift_offset) as usize;
767
768 let rule = &self.rules[rule_match.rule_index];
769
770 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
771 MatchStatus::NotChecked
772 } else {
773 MatchStatus::NotAvailable
774 };
775
776 RuleMatch {
777 rule_index: rule_match.rule_index,
778 path,
779 replacement_type: rule.match_action.replacement_type(),
780 start_index: custom_start,
781 end_index_exclusive: custom_end,
782 shift_offset,
783 match_value: matched_content_copy,
784 match_status,
785 }
786 }
787
788 fn sort_and_remove_overlapping_rules<E: Encoding>(
789 &self,
790 rule_matches: &mut Vec<InternalRuleMatch<E>>,
791 ) {
792 rule_matches.sort_unstable_by(|a, b| {
796 let ord = self.rules[a.rule_index]
798 .match_action
799 .is_mutating()
800 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
801 .reverse();
802
803 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
805
806 let ord = ord.then(a.len().cmp(&b.len()).reverse());
808
809 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
811
812 ord.reverse()
814 });
815
816 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
817
818 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
819 if self.rules[rule_match.rule_index].match_action.is_mutating() {
820 if let Some(last) = retained_rules.last()
822 && last.utf8_end > rule_match.utf8_start
823 {
824 continue;
825 }
826 } else {
827 for retained_rule in &retained_rules {
830 if retained_rule.utf8_start < rule_match.utf8_end
831 && retained_rule.utf8_end > rule_match.utf8_start
832 {
833 continue 'rule_matches;
834 }
835 }
836 };
837 retained_rules.push(rule_match);
838 }
839
840 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
842
843 *rule_matches = retained_rules;
844 }
845}
846
847impl Drop for Scanner {
848 fn drop(&mut self) {
849 let stats = &*GLOBAL_STATS;
850 stats.scanner_deletions.increment(1);
851 stats.decrement_total_scanners();
852 }
853}
854
855#[derive(Default)]
856pub struct ScannerBuilder<'a> {
857 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
858 labels: Labels,
859 scanner_features: ScannerFeatures,
860 async_scan_timeout: Duration,
861}
862
863impl ScannerBuilder<'_> {
864 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
865 ScannerBuilder {
866 rules,
867 labels: Labels::empty(),
868 scanner_features: ScannerFeatures::default(),
869 async_scan_timeout: Duration::from_secs(60 * 5),
870 }
871 }
872
873 pub fn labels(mut self, labels: Labels) -> Self {
874 self.labels = labels;
875 self
876 }
877
878 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
879 self.async_scan_timeout = duration;
880 self
881 }
882
883 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
884 self.scanner_features.add_implicit_index_wildcards = value;
885 self
886 }
887
888 pub fn with_return_matches(mut self, value: bool) -> Self {
889 self.scanner_features.return_matches = value;
890 self
891 }
892
893 pub fn with_multipass_v0(mut self, value: bool) -> Self {
897 self.scanner_features.multipass_v0_enabled = value;
898 self
899 }
900
901 pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
902 self.scanner_features
903 .skip_rules_with_regex_matching_empty_string = value;
904 self
905 }
906
907 pub fn build(self) -> Result<Scanner, CreateScannerError> {
908 let mut match_validators_per_type = AHashMap::new();
909
910 for rule in self.rules.iter() {
911 if let Some(match_validation_type) = &rule.get_third_party_active_checker()
912 && match_validation_type.can_create_match_validator()
913 {
914 let internal_type = match_validation_type.get_internal_match_validation_type();
915 let match_validator = match_validation_type.into_match_validator();
916 if let Ok(match_validator) = match_validator {
917 if !match_validators_per_type.contains_key(&internal_type) {
918 match_validators_per_type.insert(internal_type, match_validator);
919 }
920 } else {
921 return Err(CreateScannerError::InvalidMatchValidator(
922 MatchValidatorCreationError::InternalError,
923 ));
924 }
925 }
926 }
927
928 let compiled_rules = self
929 .rules
930 .iter()
931 .enumerate()
932 .filter_map(|(rule_index, config)| {
933 let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
934 Ok(inner) => Ok(inner),
935 Err(err) => {
936 if self
937 .scanner_features
938 .skip_rules_with_regex_matching_empty_string
939 && err
940 == CreateScannerError::InvalidRegex(
941 RegexValidationError::MatchesEmptyString,
942 )
943 {
944 #[allow(clippy::print_stdout)]
946 {
947 println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
948 }
949 return None;
950 } else {
951 Err(err)
952 }
953 }
954 };
955 Some((config, inner))
956 })
957 .map(|(config, inner)| {
958 config.match_action.validate()?;
959 let compiled_suppressions = match &config.suppressions {
960 Some(s) => Some(s.clone().try_into()?),
961 None => None,
962 };
963 Ok(RootCompiledRule {
964 inner: inner?,
965 scope: config.scope.clone(),
966 match_action: config.match_action.clone(),
967 match_validation_type: config.get_third_party_active_checker().cloned(),
968 suppressions: compiled_suppressions,
969 })
970 })
971 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
972
973 let mut per_scanner_data = SharedData::new();
974
975 compiled_rules.iter().for_each(|rule| {
976 rule.init_per_scanner_data(&mut per_scanner_data);
977 });
978
979 let scoped_ruleset = ScopedRuleSet::new(
980 &compiled_rules
981 .iter()
982 .map(|rule| rule.scope.clone())
983 .collect::<Vec<_>>(),
984 )
985 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
986
987 {
988 let stats = &*GLOBAL_STATS;
989 stats.scanner_creations.increment(1);
990 stats.increment_total_scanners();
991 }
992
993 Ok(Scanner {
994 rules: compiled_rules,
995 scoped_ruleset,
996 scanner_features: self.scanner_features,
997 metrics: ScannerMetrics::new(&self.labels),
998 match_validators_per_type,
999 labels: self.labels,
1000 per_scanner_data,
1001 async_scan_timeout: self.async_scan_timeout,
1002 })
1003 }
1004}
1005
1006struct ScannerContentVisitor<'a, E: Encoding> {
1007 scanner: &'a Scanner,
1008 regex_caches: &'a mut RegexCaches,
1009 rule_matches: &'a mut InternalRuleMatchSet<E>,
1010 blocked_rules: &'a Vec<usize>,
1013 excluded_matches: &'a mut AHashSet<String>,
1014 per_event_data: SharedData,
1015 wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1016 async_jobs: &'a mut Vec<PendingRuleJob>,
1017}
1018
1019impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1020 fn visit_content<'b>(
1021 &'b mut self,
1022 path: &Path<'a>,
1023 content: &str,
1024 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1025 exclusion_check: ExclusionCheck<'b>,
1026 ) -> Result<bool, ScannerError> {
1027 let mut path_rules_matches = vec![];
1029
1030 let mut per_string_data = SharedData::new();
1032 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1033
1034 rule_visitor.visit_rule_indices(|rule_index| {
1035 if self.blocked_rules.contains(&rule_index) {
1036 return Ok(());
1037 }
1038 let rule = &self.scanner.rules[rule_index];
1039 {
1040 let mut emitter = |rule_match: StringMatch| {
1042 assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1045 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1046 };
1047
1048 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1049
1050 rule.init_per_event_data(&mut self.per_event_data);
1052
1053 let mut ctx = StringMatchesCtx {
1054 rule_index,
1055 regex_caches: self.regex_caches,
1056 exclusion_check: &exclusion_check,
1057 excluded_matches: self.excluded_matches,
1058 match_emitter: &mut emitter,
1059 wildcard_indices: wildcard_indices_per_path,
1060 per_string_data: &mut per_string_data,
1061 per_scanner_data: &self.scanner.per_scanner_data,
1062 per_event_data: &mut self.per_event_data,
1063 };
1064
1065 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1066
1067 match async_status {
1068 RuleStatus::Done => {
1069 }
1071 RuleStatus::Pending(fut) => {
1072 self.async_jobs.push(PendingRuleJob {
1073 fut,
1074 path: path.into_static(),
1075 });
1076 }
1077 }
1078 }
1079 Ok(())
1080 })?;
1081
1082 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1087
1088 self.rule_matches
1089 .push_sync_matches(path, path_rules_matches);
1090
1091 Ok(needs_to_access_content)
1092 }
1093}
1094
/// Returns the byte offset of the character that follows the first character of a match.
///
/// `regex_match` is a `(start, end)` pair of byte offsets into `content`; only the start is
/// used. The result is `None` when fewer than two characters remain from `start` onwards,
/// i.e. when the match starts on the last character of `content` or at its end.
///
/// # Panics
///
/// Panics if `regex_match.0` is past the end of `content` or not on a character boundary.
fn get_next_regex_start(content: &str, regex_match: (usize, usize)) -> Option<usize> {
    let (match_start, _) = regex_match;
    content[match_start..]
        .char_indices()
        // Index 0 is the first character of the match; index 1 is the one after it.
        .nth(1)
        .map(|(offset, _)| match_start + offset)
}
1105
1106fn is_false_positive_match(
1107 regex_match_range: (usize, usize),
1108 rule: &RegexCompiledRule,
1109 content: &str,
1110 check_excluded_keywords: bool,
1111) -> bool {
1112 if check_excluded_keywords
1113 && let Some(excluded_keywords) = &rule.excluded_keywords
1114 && excluded_keywords.is_false_positive_match(content, regex_match_range.0)
1115 {
1116 return true;
1117 }
1118
1119 if let Some(validator) = rule.validator.as_ref()
1120 && !validator.is_valid_match(&content[regex_match_range.0..regex_match_range.1])
1121 {
1122 return true;
1123 }
1124 false
1125}