1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::MatchValidatorCreationError;
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{RegexCaches, access_regex_caches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scanner::suppression::{CompiledSuppressions, Suppressions};
23use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
24pub use crate::secondary_validation::Validator;
25use crate::stats::GLOBAL_STATS;
26use crate::tokio::TOKIO_RUNTIME;
27use crate::{
28 CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
29};
30use ahash::{AHashMap, AHashSet};
31use futures::executor::block_on;
32use regex_automata::Match;
33use serde::{Deserialize, Serialize};
34use serde_with::serde_as;
35use std::ops::Deref;
36use std::pin::Pin;
37use std::sync::Arc;
38use std::time::{Duration, Instant};
39use tokio::task::JoinHandle;
40use tokio::time::timeout;
41
42pub mod config;
43pub mod error;
44pub mod metrics;
45pub mod regex_rule;
46pub mod scope;
47pub mod shared_data;
48pub mod shared_pool;
49pub mod suppression;
50
51mod internal_rule_match_set;
52#[cfg(test)]
53mod test;
54
/// Location of a single match inside a string.
///
/// The scanner's synchronous emitter asserts that `start != end`, i.e. empty
/// matches are rejected.
#[derive(Copy, Clone)]
pub struct StringMatch {
    /// Offset where the match begins (presumably a UTF-8 byte offset — TODO confirm).
    pub start: usize,
    /// Offset where the match ends (presumably exclusive — TODO confirm).
    pub end: usize,
}
60
61pub trait MatchEmitter<T = ()> {
62 fn emit(&mut self, string_match: StringMatch) -> T;
63}
64
65impl<F, T> MatchEmitter<T> for F
68where
69 F: FnMut(StringMatch) -> T,
70{
71 fn emit(&mut self, string_match: StringMatch) -> T {
72 (self)(string_match)
74 }
75}
76
77#[serde_as]
78#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
79pub struct RootRuleConfig<T> {
80 pub match_action: MatchAction,
81 #[serde(default)]
82 pub scope: Scope,
83 #[deprecated(note = "Use `third_party_active_checker` instead")]
84 match_validation_type: Option<MatchValidationType>,
85 third_party_active_checker: Option<MatchValidationType>,
86 suppressions: Option<Suppressions>,
87 #[serde(flatten)]
88 pub inner: T,
89}
90
91impl<T> RootRuleConfig<T>
92where
93 T: RuleConfig + 'static,
94{
95 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
96 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
97 }
98
99 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
100 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
101 }
102}
103
104impl<T> RootRuleConfig<T> {
105 pub fn new(inner: T) -> Self {
106 #[allow(deprecated)]
107 Self {
108 match_action: MatchAction::None,
109 scope: Scope::all(),
110 match_validation_type: None,
111 third_party_active_checker: None,
112 suppressions: None,
113 inner,
114 }
115 }
116
117 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
118 #[allow(deprecated)]
119 RootRuleConfig {
120 match_action: self.match_action,
121 scope: self.scope,
122 match_validation_type: self.match_validation_type,
123 third_party_active_checker: self.third_party_active_checker,
124 suppressions: self.suppressions,
125 inner: func(self.inner),
126 }
127 }
128
129 pub fn match_action(mut self, action: MatchAction) -> Self {
130 self.match_action = action;
131 self
132 }
133
134 pub fn scope(mut self, scope: Scope) -> Self {
135 self.scope = scope;
136 self
137 }
138
139 pub fn third_party_active_checker(
140 mut self,
141 match_validation_type: MatchValidationType,
142 ) -> Self {
143 self.third_party_active_checker = Some(match_validation_type);
144 self
145 }
146
147 pub fn suppressions(mut self, suppressions: Suppressions) -> Self {
148 self.suppressions = Some(suppressions);
149 self
150 }
151
152 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
153 #[allow(deprecated)]
154 self.third_party_active_checker
155 .as_ref()
156 .or(self.match_validation_type.as_ref())
157 }
158}
159
160impl<T> Deref for RootRuleConfig<T> {
161 type Target = T;
162
163 fn deref(&self) -> &Self::Target {
164 &self.inner
165 }
166}
167pub struct RootCompiledRule {
168 pub inner: Box<dyn CompiledRule>,
169 pub scope: Scope,
170 pub match_action: MatchAction,
171 pub match_validation_type: Option<MatchValidationType>,
172 pub suppressions: Option<CompiledSuppressions>,
173}
174
175impl RootCompiledRule {
176 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
177 self.match_validation_type
178 .as_ref()
179 .map(|x| x.get_internal_match_validation_type())
180 }
181}
182
183impl Deref for RootCompiledRule {
184 type Target = dyn CompiledRule;
185
186 fn deref(&self) -> &Self::Target {
187 self.inner.as_ref()
188 }
189}
190
191pub struct StringMatchesCtx<'a> {
192 rule_index: usize,
193 pub regex_caches: &'a mut RegexCaches,
194 pub exclusion_check: &'a ExclusionCheck<'a>,
195 pub excluded_matches: &'a mut AHashSet<String>,
196 pub match_emitter: &'a mut dyn MatchEmitter,
197 pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
198
199 pub per_string_data: &'a mut SharedData,
201 pub per_scanner_data: &'a SharedData,
202 pub per_event_data: &'a mut SharedData,
203}
204
205impl StringMatchesCtx<'_> {
206 pub fn process_async(
216 &self,
217 func: impl for<'a> FnOnce(
218 &'a mut AsyncStringMatchesCtx,
219 )
220 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
221 + Send
222 + 'static,
223 ) -> RuleResult {
224 let rule_index = self.rule_index;
225
226 let fut = TOKIO_RUNTIME.spawn(async move {
229 let mut ctx = AsyncStringMatchesCtx {
230 rule_matches: vec![],
231 };
232 (func)(&mut ctx).await?;
233
234 Ok(AsyncRuleInfo {
235 rule_index,
236 rule_matches: ctx.rule_matches,
237 })
238 });
239
240 Ok(RuleStatus::Pending(fut))
241 }
242}
243
244pub struct AsyncStringMatchesCtx {
245 rule_matches: Vec<StringMatch>,
246}
247
248impl AsyncStringMatchesCtx {
249 pub fn emit_match(&mut self, string_match: StringMatch) {
250 self.rule_matches.push(string_match);
251 }
252}
253
254#[must_use]
255pub enum RuleStatus {
256 Done,
257 Pending(PendingRuleResult),
258}
259
260pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
262
263pub struct PendingRuleJob {
264 fut: PendingRuleResult,
265 path: Path<'static>,
266}
267
268pub struct AsyncRuleInfo {
269 rule_index: usize,
270 rule_matches: Vec<StringMatch>,
271}
272
273pub type RuleResult = Result<RuleStatus, ScannerError>;
275
276pub trait CompiledRule: Send + Sync {
278 fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
279 }
281
282 fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
283 }
285
286 fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
287 }
289
290 fn get_string_matches(
291 &self,
292 content: &str,
293 path: &Path,
294 ctx: &mut StringMatchesCtx<'_>,
295 ) -> RuleResult;
296
297 fn should_exclude_multipass_v0(&self) -> bool {
300 false
302 }
303
304 fn on_excluded_match_multipass_v0(&self) {
305 }
307}
308
309impl<T> RuleConfig for Box<T>
310where
311 T: RuleConfig + ?Sized,
312{
313 fn convert_to_compiled_rule(
314 &self,
315 rule_index: usize,
316 labels: Labels,
317 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
318 self.as_ref().convert_to_compiled_rule(rule_index, labels)
319 }
320}
321
/// Feature switches fixed when a scanner is built.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Forwarded to `ScopedRuleSet::with_implicit_index_wildcards`.
    pub add_implicit_index_wildcards: bool,
    /// Enables dropping matches whose text is in the scan's excluded-matches set.
    pub multipass_v0_enabled: bool,
    /// When set, the matched text is copied into each returned rule match.
    pub return_matches: bool,
    /// When set, rules rejected because their regex matches the empty string are skipped
    /// instead of failing scanner creation.
    pub skip_rules_with_regex_matching_empty_string: bool,
}
331
332impl Default for ScannerFeatures {
333 fn default() -> Self {
334 Self {
335 add_implicit_index_wildcards: false,
336 multipass_v0_enabled: true,
337 return_matches: false,
338 skip_rules_with_regex_matching_empty_string: false,
339 }
340 }
341}
342
343pub struct ScanOptions {
344 pub blocked_rules_idx: Vec<usize>,
347 pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
349 pub validate_matches: bool,
352}
353
354impl Default for ScanOptions {
355 fn default() -> Self {
356 Self {
357 blocked_rules_idx: vec![],
358 wildcarded_indices: AHashMap::new(),
359 validate_matches: false,
360 }
361 }
362}
363
364pub struct ScanOptionBuilder {
365 blocked_rules_idx: Vec<usize>,
366 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
367 validate_matches: bool,
368}
369
370impl ScanOptionBuilder {
371 pub fn new() -> Self {
372 Self {
373 blocked_rules_idx: vec![],
374 wildcarded_indices: AHashMap::new(),
375 validate_matches: false,
376 }
377 }
378
379 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
380 self.blocked_rules_idx = blocked_rules_idx;
381 self
382 }
383
384 pub fn with_wildcarded_indices(
385 mut self,
386 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
387 ) -> Self {
388 self.wildcarded_indices = wildcarded_indices;
389 self
390 }
391
392 pub fn with_validate_matching(mut self, validate_matches: bool) -> Self {
393 self.validate_matches = validate_matches;
394 self
395 }
396
397 pub fn build(self) -> ScanOptions {
398 ScanOptions {
399 blocked_rules_idx: self.blocked_rules_idx,
400 wildcarded_indices: self.wildcarded_indices,
401 validate_matches: self.validate_matches,
402 }
403 }
404}
405
406pub struct Scanner {
407 rules: Vec<RootCompiledRule>,
408 scoped_ruleset: ScopedRuleSet,
409 scanner_features: ScannerFeatures,
410 metrics: ScannerMetrics,
411 labels: Labels,
412 match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
413 per_scanner_data: SharedData,
414 async_scan_timeout: Duration,
415}
416
417impl Scanner {
418 pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
419 ScannerBuilder::new(rules)
420 }
421
422 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
427 self.scan_with_options(event, ScanOptions::default())
428 }
429
430 pub fn scan_with_options<E: Event>(
435 &self,
436 event: &mut E,
437 options: ScanOptions,
438 ) -> Result<Vec<RuleMatch>, ScannerError> {
439 block_on(self.internal_scan_with_metrics(event, options))
440 }
441
442 pub async fn scan_async<E: Event>(
446 &self,
447 event: &mut E,
448 ) -> Result<Vec<RuleMatch>, ScannerError> {
449 self.scan_async_with_options(event, ScanOptions::default())
450 .await
451 }
452
453 pub async fn scan_async_with_options<E: Event>(
454 &self,
455 event: &mut E,
456 options: ScanOptions,
457 ) -> Result<Vec<RuleMatch>, ScannerError> {
458 let fut = self.internal_scan_with_metrics(event, options);
459
460 let timeout = {
463 let _tokio_guard = TOKIO_RUNTIME.enter();
464 timeout(self.async_scan_timeout, fut)
465 };
466
467 timeout.await.unwrap_or(Err(ScannerError::Transient(
468 "Async scan timeout".to_string(),
469 )))
470 }
471
472 fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
473 self.metrics
475 .duration_ns
476 .increment(start.elapsed().as_nanos() as u64);
477 self.metrics.num_scanned_events.increment(1);
479 self.metrics
481 .match_count
482 .increment(output_rule_matches.len() as u64);
483 }
484
485 async fn internal_scan_with_metrics<E: Event>(
486 &self,
487 event: &mut E,
488 options: ScanOptions,
489 ) -> Result<Vec<RuleMatch>, ScannerError> {
490 let start = Instant::now();
491 let result = self.internal_scan(event, options).await;
492 match &result {
493 Ok(rule_matches) => {
494 self.record_metrics(rule_matches, start);
495 }
496 Err(_) => {
497 self.record_metrics(&[], start);
498 }
499 }
500 result
501 }
502
503 async fn internal_scan<E: Event>(
504 &self,
505 event: &mut E,
506 options: ScanOptions,
507 ) -> Result<Vec<RuleMatch>, ScannerError> {
508 let need_match_content = self.scanner_features.return_matches || options.validate_matches;
511 let mut rule_matches = InternalRuleMatchSet::new();
513 let mut excluded_matches = AHashSet::new();
514 let mut async_jobs = vec![];
515
516 access_regex_caches(|regex_caches| {
517 self.scoped_ruleset.visit_string_rule_combinations(
518 event,
519 ScannerContentVisitor {
520 scanner: self,
521 regex_caches,
522 rule_matches: &mut rule_matches,
523 blocked_rules: &options.blocked_rules_idx,
524 excluded_matches: &mut excluded_matches,
525 per_event_data: SharedData::new(),
526 wildcarded_indexes: &options.wildcarded_indices,
527 async_jobs: &mut async_jobs,
528 },
529 )
530 })?;
531
532 for job in async_jobs {
535 let rule_info = job.fut.await.unwrap()?;
536 rule_matches.push_async_matches(
537 &job.path,
538 rule_info
539 .rule_matches
540 .into_iter()
541 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
542 );
543 }
544
545 let mut output_rule_matches = vec![];
546
547 for (path, mut rule_matches) in rule_matches.into_iter() {
548 event.visit_string_mut(&path, |content| {
550 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
552
553 <<E as Event>::Encoding>::calculate_indices(
554 content,
555 rule_matches.iter_mut().map(
556 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
557 utf8_start: rule_match.utf8_start,
558 utf8_end: rule_match.utf8_end,
559 custom_start: &mut rule_match.custom_start,
560 custom_end: &mut rule_match.custom_end,
561 },
562 ),
563 );
564
565 if self.scanner_features.multipass_v0_enabled {
566 rule_matches.retain(|rule_match| {
569 if self.rules[rule_match.rule_index]
570 .inner
571 .should_exclude_multipass_v0()
572 {
573 let is_false_positive = excluded_matches
574 .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
575 if is_false_positive && self.scanner_features.multipass_v0_enabled {
576 self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
577 }
578 !is_false_positive
579 } else {
580 true
581 }
582 });
583 }
584
585 self.suppress_matches::<E::Encoding>(&mut rule_matches, content);
586
587 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
588
589 let will_mutate = rule_matches
590 .iter()
591 .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
592
593 self.apply_match_actions(
594 content,
595 &path,
596 &mut rule_matches,
597 &mut output_rule_matches,
598 need_match_content,
599 );
600
601 will_mutate
602 });
603 }
604
605 if options.validate_matches {
606 self.validate_matches(&mut output_rule_matches);
607 }
608
609 Ok(output_rule_matches)
610 }
611
612 pub fn suppress_matches<E: Encoding>(
613 &self,
614 rule_matches: &mut Vec<InternalRuleMatch<E>>,
615 content: &str,
616 ) {
617 rule_matches.retain(|rule_match| {
618 if let Some(suppressions) = &self.rules[rule_match.rule_index].suppressions {
619 !suppressions.should_match_be_suppressed(content)
620 } else {
621 true
622 }
623 });
624 }
625
626 pub fn validate_matches(&self, rule_matches: &mut Vec<RuleMatch>) {
627 let mut match_validator_rule_match_per_type = AHashMap::new();
629
630 let mut validated_rule_matches = vec![];
631
632 for mut rule_match in rule_matches.drain(..) {
633 let rule = &self.rules[rule_match.rule_index];
634 if let Some(match_validation_type) = rule.internal_match_validation_type() {
635 match_validator_rule_match_per_type
636 .entry(match_validation_type)
637 .or_insert_with(Vec::new)
638 .push(rule_match)
639 } else {
640 rule_match.match_status.merge(MatchStatus::NotAvailable);
642 validated_rule_matches.push(rule_match);
643 }
644 }
645
646 RAYON_THREAD_POOL.install(|| {
647 use rayon::prelude::*;
648
649 match_validator_rule_match_per_type.par_iter_mut().for_each(
650 |(match_validation_type, matches_per_type)| {
651 let match_validator = self.match_validators_per_type.get(match_validation_type);
652 if let Some(match_validator) = match_validator {
653 match_validator
654 .as_ref()
655 .validate(matches_per_type, &self.rules)
656 }
657 },
658 );
659 });
660
661 for (_, mut matches) in match_validator_rule_match_per_type {
663 validated_rule_matches.append(&mut matches);
664 }
665
666 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
668 *rule_matches = validated_rule_matches;
669 }
670
671 fn apply_match_actions<E: Encoding>(
674 &self,
675 content: &mut String,
676 path: &Path<'static>,
677 rule_matches: &mut [InternalRuleMatch<E>],
678 output_rule_matches: &mut Vec<RuleMatch>,
679 need_match_content: bool,
680 ) {
681 let mut utf8_byte_delta: isize = 0;
682 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
683
684 for rule_match in rule_matches {
685 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
686 content,
687 path.clone(),
688 rule_match,
689 &mut utf8_byte_delta,
690 &mut custom_index_delta,
691 need_match_content,
692 ));
693 }
694 }
695
696 fn apply_match_actions_for_string<E: Encoding>(
698 &self,
699 content: &mut String,
700 path: Path<'static>,
701 rule_match: &InternalRuleMatch<E>,
702 utf8_byte_delta: &mut isize,
704
705 custom_index_delta: &mut <E>::IndexShift,
707 need_match_content: bool,
708 ) -> RuleMatch {
709 let rule = &self.rules[rule_match.rule_index];
710
711 let custom_start =
712 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
713 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
714
715 let mut matched_content_copy = None;
716
717 if need_match_content {
718 let mutated_utf8_match_start =
720 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
721 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
722
723 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
725 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
726
727 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
728 matched_content_copy = Some(matched_content.to_string());
729 }
730
731 if rule.match_action.is_mutating() {
732 let mutated_utf8_match_start =
733 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
734 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
735
736 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
738 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
739
740 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
741 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
742 let before_replacement = &matched_content[replacement.start..replacement.end];
743
744 <E>::adjust_shift(
746 custom_index_delta,
747 before_replacement,
748 &replacement.replacement,
749 );
750 *utf8_byte_delta +=
751 replacement.replacement.len() as isize - before_replacement.len() as isize;
752
753 let replacement_start = mutated_utf8_match_start + replacement.start;
754 let replacement_end = mutated_utf8_match_start + replacement.end;
755 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
756 }
757 }
758
759 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
760 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
761 + shift_offset) as usize;
762
763 let rule = &self.rules[rule_match.rule_index];
764
765 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
766 MatchStatus::NotChecked
767 } else {
768 MatchStatus::NotAvailable
769 };
770
771 RuleMatch {
772 rule_index: rule_match.rule_index,
773 path,
774 replacement_type: rule.match_action.replacement_type(),
775 start_index: custom_start,
776 end_index_exclusive: custom_end,
777 shift_offset,
778 match_value: matched_content_copy,
779 match_status,
780 }
781 }
782
783 fn sort_and_remove_overlapping_rules<E: Encoding>(
784 &self,
785 rule_matches: &mut Vec<InternalRuleMatch<E>>,
786 ) {
787 rule_matches.sort_unstable_by(|a, b| {
791 let ord = self.rules[a.rule_index]
793 .match_action
794 .is_mutating()
795 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
796 .reverse();
797
798 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
800
801 let ord = ord.then(a.len().cmp(&b.len()).reverse());
803
804 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
806
807 ord.reverse()
809 });
810
811 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
812
813 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
814 if self.rules[rule_match.rule_index].match_action.is_mutating() {
815 if let Some(last) = retained_rules.last()
817 && last.utf8_end > rule_match.utf8_start
818 {
819 continue;
820 }
821 } else {
822 for retained_rule in &retained_rules {
825 if retained_rule.utf8_start < rule_match.utf8_end
826 && retained_rule.utf8_end > rule_match.utf8_start
827 {
828 continue 'rule_matches;
829 }
830 }
831 };
832 retained_rules.push(rule_match);
833 }
834
835 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
837
838 *rule_matches = retained_rules;
839 }
840}
841
842impl Drop for Scanner {
843 fn drop(&mut self) {
844 let stats = &*GLOBAL_STATS;
845 stats.scanner_deletions.increment(1);
846 stats.decrement_total_scanners();
847 }
848}
849
850#[derive(Default)]
851pub struct ScannerBuilder<'a> {
852 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
853 labels: Labels,
854 scanner_features: ScannerFeatures,
855 async_scan_timeout: Duration,
856}
857
858impl ScannerBuilder<'_> {
859 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
860 ScannerBuilder {
861 rules,
862 labels: Labels::empty(),
863 scanner_features: ScannerFeatures::default(),
864 async_scan_timeout: Duration::from_secs(60 * 5),
865 }
866 }
867
868 pub fn labels(mut self, labels: Labels) -> Self {
869 self.labels = labels;
870 self
871 }
872
873 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
874 self.async_scan_timeout = duration;
875 self
876 }
877
878 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
879 self.scanner_features.add_implicit_index_wildcards = value;
880 self
881 }
882
883 pub fn with_return_matches(mut self, value: bool) -> Self {
884 self.scanner_features.return_matches = value;
885 self
886 }
887
888 pub fn with_multipass_v0(mut self, value: bool) -> Self {
892 self.scanner_features.multipass_v0_enabled = value;
893 self
894 }
895
896 pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
897 self.scanner_features
898 .skip_rules_with_regex_matching_empty_string = value;
899 self
900 }
901
902 pub fn build(self) -> Result<Scanner, CreateScannerError> {
903 let mut match_validators_per_type = AHashMap::new();
904
905 for rule in self.rules.iter() {
906 if let Some(match_validation_type) = &rule.get_third_party_active_checker()
907 && match_validation_type.can_create_match_validator()
908 {
909 let internal_type = match_validation_type.get_internal_match_validation_type();
910 let match_validator = match_validation_type.into_match_validator();
911 if let Ok(match_validator) = match_validator {
912 if !match_validators_per_type.contains_key(&internal_type) {
913 match_validators_per_type.insert(internal_type, match_validator);
914 }
915 } else {
916 return Err(CreateScannerError::InvalidMatchValidator(
917 MatchValidatorCreationError::InternalError,
918 ));
919 }
920 }
921 }
922
923 let compiled_rules = self
924 .rules
925 .iter()
926 .enumerate()
927 .filter_map(|(rule_index, config)| {
928 let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
929 Ok(inner) => Ok(inner),
930 Err(err) => {
931 if self
932 .scanner_features
933 .skip_rules_with_regex_matching_empty_string
934 && err
935 == CreateScannerError::InvalidRegex(
936 RegexValidationError::MatchesEmptyString,
937 )
938 {
939 #[allow(clippy::print_stdout)]
941 {
942 println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
943 }
944 return None;
945 } else {
946 Err(err)
947 }
948 }
949 };
950 Some((config, inner))
951 })
952 .map(|(config, inner)| {
953 config.match_action.validate()?;
954 Ok(RootCompiledRule {
955 inner: inner?,
956 scope: config.scope.clone(),
957 match_action: config.match_action.clone(),
958 match_validation_type: config.get_third_party_active_checker().cloned(),
959 suppressions: config.suppressions.clone().map(|config| config.into()),
960 })
961 })
962 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
963
964 let mut per_scanner_data = SharedData::new();
965
966 compiled_rules.iter().for_each(|rule| {
967 rule.init_per_scanner_data(&mut per_scanner_data);
968 });
969
970 let scoped_ruleset = ScopedRuleSet::new(
971 &compiled_rules
972 .iter()
973 .map(|rule| rule.scope.clone())
974 .collect::<Vec<_>>(),
975 )
976 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
977
978 {
979 let stats = &*GLOBAL_STATS;
980 stats.scanner_creations.increment(1);
981 stats.increment_total_scanners();
982 }
983
984 Ok(Scanner {
985 rules: compiled_rules,
986 scoped_ruleset,
987 scanner_features: self.scanner_features,
988 metrics: ScannerMetrics::new(&self.labels),
989 match_validators_per_type,
990 labels: self.labels,
991 per_scanner_data,
992 async_scan_timeout: self.async_scan_timeout,
993 })
994 }
995}
996
997struct ScannerContentVisitor<'a, E: Encoding> {
998 scanner: &'a Scanner,
999 regex_caches: &'a mut RegexCaches,
1000 rule_matches: &'a mut InternalRuleMatchSet<E>,
1001 blocked_rules: &'a Vec<usize>,
1004 excluded_matches: &'a mut AHashSet<String>,
1005 per_event_data: SharedData,
1006 wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
1007 async_jobs: &'a mut Vec<PendingRuleJob>,
1008}
1009
1010impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
1011 fn visit_content<'b>(
1012 &'b mut self,
1013 path: &Path<'a>,
1014 content: &str,
1015 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
1016 exclusion_check: ExclusionCheck<'b>,
1017 ) -> Result<bool, ScannerError> {
1018 let mut path_rules_matches = vec![];
1020
1021 let mut per_string_data = SharedData::new();
1023 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
1024
1025 rule_visitor.visit_rule_indices(|rule_index| {
1026 if self.blocked_rules.contains(&rule_index) {
1027 return Ok(());
1028 }
1029 let rule = &self.scanner.rules[rule_index];
1030 {
1031 let mut emitter = |rule_match: StringMatch| {
1033 assert_ne!(rule_match.start, rule_match.end, "empty match detected");
1036 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
1037 };
1038
1039 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
1040
1041 rule.init_per_event_data(&mut self.per_event_data);
1043
1044 let mut ctx = StringMatchesCtx {
1045 rule_index,
1046 regex_caches: self.regex_caches,
1047 exclusion_check: &exclusion_check,
1048 excluded_matches: self.excluded_matches,
1049 match_emitter: &mut emitter,
1050 wildcard_indices: wildcard_indices_per_path,
1051 per_string_data: &mut per_string_data,
1052 per_scanner_data: &self.scanner.per_scanner_data,
1053 per_event_data: &mut self.per_event_data,
1054 };
1055
1056 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1057
1058 match async_status {
1059 RuleStatus::Done => {
1060 }
1062 RuleStatus::Pending(fut) => {
1063 self.async_jobs.push(PendingRuleJob {
1064 fut,
1065 path: path.into_static(),
1066 });
1067 }
1068 }
1069 }
1070 Ok(())
1071 })?;
1072
1073 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1078
1079 self.rule_matches
1080 .push_sync_matches(path, path_rules_matches);
1081
1082 Ok(needs_to_access_content)
1083 }
1084}
1085
1086fn get_next_regex_start(content: &str, regex_match: &Match) -> Option<usize> {
1088 if let Some((i, _)) = content[regex_match.start()..].char_indices().nth(1) {
1090 Some(regex_match.start() + i)
1091 } else {
1092 None
1094 }
1095}
1096
1097fn is_false_positive_match(
1098 regex_match: &Match,
1099 rule: &RegexCompiledRule,
1100 content: &str,
1101 check_excluded_keywords: bool,
1102) -> bool {
1103 if check_excluded_keywords
1104 && let Some(excluded_keywords) = &rule.excluded_keywords
1105 && excluded_keywords.is_false_positive_match(content, regex_match.start())
1106 {
1107 return true;
1108 }
1109
1110 if let Some(validator) = rule.validator.as_ref()
1111 && !validator.is_valid_match(&content[regex_match.range()])
1112 {
1113 return true;
1114 }
1115 false
1116}