1use crate::encoding::Encoding;
2use crate::event::Event;
3use std::future::Future;
4
5use crate::match_validation::{
6 config::InternalMatchValidationType, config::MatchValidationType, match_status::MatchStatus,
7 match_validator::MatchValidator,
8};
9
10use error::{MatchValidationError, MatchValidatorCreationError};
11
12use self::metrics::ScannerMetrics;
13use crate::match_validation::match_validator::RAYON_THREAD_POOL;
14use crate::observability::labels::Labels;
15use crate::rule_match::{InternalRuleMatch, RuleMatch};
16use crate::scanner::config::RuleConfig;
17use crate::scanner::internal_rule_match_set::InternalRuleMatchSet;
18use crate::scanner::regex_rule::compiled::RegexCompiledRule;
19use crate::scanner::regex_rule::{access_regex_caches, RegexCaches};
20use crate::scanner::scope::Scope;
21pub use crate::scanner::shared_data::SharedData;
22use crate::scoped_ruleset::{ContentVisitor, ExclusionCheck, ScopedRuleSet};
23pub use crate::secondary_validation::Validator;
24use crate::stats::GLOBAL_STATS;
25use crate::tokio::TOKIO_RUNTIME;
26use crate::{
27 CreateScannerError, EncodeIndices, MatchAction, Path, RegexValidationError, ScannerError,
28};
29use ahash::{AHashMap, AHashSet};
30use futures::executor::block_on;
31use regex_automata::Match;
32use serde::{Deserialize, Serialize};
33use serde_with::serde_as;
34use std::ops::Deref;
35use std::pin::Pin;
36use std::sync::Arc;
37use std::time::{Duration, Instant};
38use tokio::task::JoinHandle;
39use tokio::time::timeout;
40
41pub mod config;
42pub mod error;
43pub mod metrics;
44pub mod regex_rule;
45pub mod scope;
46pub mod shared_data;
47pub mod shared_pool;
48
49mod internal_rule_match_set;
50#[cfg(test)]
51mod test;
52
/// A span inside a scanned string, reported by a rule through a [`MatchEmitter`].
///
/// NOTE(review): the offsets are presumably UTF-8 byte indices with `end` exclusive
/// (the scanner rejects matches where `start == end` as empty) — confirm against
/// `InternalRuleMatch::new`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct StringMatch {
    /// Offset at which the match begins.
    pub start: usize,
    /// Offset at which the match ends.
    pub end: usize,
}
58
59pub trait MatchEmitter<T = ()> {
60 fn emit(&mut self, string_match: StringMatch) -> T;
61}
62
63impl<F, T> MatchEmitter<T> for F
66where
67 F: FnMut(StringMatch) -> T,
68{
69 fn emit(&mut self, string_match: StringMatch) -> T {
70 (self)(string_match)
72 }
73}
74
75#[serde_as]
76#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
77pub struct RootRuleConfig<T> {
78 pub match_action: MatchAction,
79 #[serde(default)]
80 pub scope: Scope,
81 #[deprecated(note = "Use `third_party_active_checker` instead")]
82 match_validation_type: Option<MatchValidationType>,
83 third_party_active_checker: Option<MatchValidationType>,
84 #[serde(flatten)]
85 pub inner: T,
86}
87
88impl<T> RootRuleConfig<T>
89where
90 T: RuleConfig + 'static,
91{
92 pub fn new_dyn(inner: T) -> RootRuleConfig<Arc<dyn RuleConfig>> {
93 RootRuleConfig::new(Arc::new(inner) as Arc<dyn RuleConfig>)
94 }
95
96 pub fn into_dyn(self) -> RootRuleConfig<Arc<dyn RuleConfig>> {
97 self.map_inner(|x| Arc::new(x) as Arc<dyn RuleConfig>)
98 }
99}
100
101impl<T> RootRuleConfig<T> {
102 pub fn new(inner: T) -> Self {
103 #[allow(deprecated)]
104 Self {
105 match_action: MatchAction::None,
106 scope: Scope::all(),
107 match_validation_type: None,
108 third_party_active_checker: None,
109 inner,
110 }
111 }
112
113 pub fn map_inner<U>(self, func: impl FnOnce(T) -> U) -> RootRuleConfig<U> {
114 #[allow(deprecated)]
115 RootRuleConfig {
116 match_action: self.match_action,
117 scope: self.scope,
118 match_validation_type: self.match_validation_type,
119 third_party_active_checker: self.third_party_active_checker,
120 inner: func(self.inner),
121 }
122 }
123
124 pub fn match_action(mut self, action: MatchAction) -> Self {
125 self.match_action = action;
126 self
127 }
128
129 pub fn scope(mut self, scope: Scope) -> Self {
130 self.scope = scope;
131 self
132 }
133
134 pub fn third_party_active_checker(
135 mut self,
136 match_validation_type: MatchValidationType,
137 ) -> Self {
138 self.third_party_active_checker = Some(match_validation_type);
139 self
140 }
141
142 fn get_third_party_active_checker(&self) -> Option<&MatchValidationType> {
143 #[allow(deprecated)]
144 self.third_party_active_checker
145 .as_ref()
146 .or(self.match_validation_type.as_ref())
147 }
148}
149
150impl<T> Deref for RootRuleConfig<T> {
151 type Target = T;
152
153 fn deref(&self) -> &Self::Target {
154 &self.inner
155 }
156}
157pub struct RootCompiledRule {
158 pub inner: Box<dyn CompiledRule>,
159 pub scope: Scope,
160 pub match_action: MatchAction,
161 pub match_validation_type: Option<MatchValidationType>,
162}
163
164impl RootCompiledRule {
165 pub fn internal_match_validation_type(&self) -> Option<InternalMatchValidationType> {
166 self.match_validation_type
167 .as_ref()
168 .map(|x| x.get_internal_match_validation_type())
169 }
170}
171
172impl Deref for RootCompiledRule {
173 type Target = dyn CompiledRule;
174
175 fn deref(&self) -> &Self::Target {
176 self.inner.as_ref()
177 }
178}
179
180pub struct StringMatchesCtx<'a> {
181 rule_index: usize,
182 pub regex_caches: &'a mut RegexCaches,
183 pub exclusion_check: &'a ExclusionCheck<'a>,
184 pub excluded_matches: &'a mut AHashSet<String>,
185 pub match_emitter: &'a mut dyn MatchEmitter,
186 pub wildcard_indices: Option<&'a Vec<(usize, usize)>>,
187
188 pub per_string_data: &'a mut SharedData,
190 pub per_scanner_data: &'a SharedData,
191 pub per_event_data: &'a mut SharedData,
192}
193
194impl StringMatchesCtx<'_> {
195 pub fn process_async(
205 &self,
206 func: impl for<'a> FnOnce(
207 &'a mut AsyncStringMatchesCtx,
208 )
209 -> Pin<Box<dyn Future<Output = Result<(), ScannerError>> + Send + 'a>>
210 + Send
211 + 'static,
212 ) -> RuleResult {
213 let rule_index = self.rule_index;
214
215 let fut = TOKIO_RUNTIME.spawn(async move {
218 let mut ctx = AsyncStringMatchesCtx {
219 rule_matches: vec![],
220 };
221 (func)(&mut ctx).await?;
222
223 Ok(AsyncRuleInfo {
224 rule_index,
225 rule_matches: ctx.rule_matches,
226 })
227 });
228
229 Ok(RuleStatus::Pending(fut))
230 }
231}
232
233pub struct AsyncStringMatchesCtx {
234 rule_matches: Vec<StringMatch>,
235}
236
237impl AsyncStringMatchesCtx {
238 pub fn emit_match(&mut self, string_match: StringMatch) {
239 self.rule_matches.push(string_match);
240 }
241}
242
243#[must_use]
244pub enum RuleStatus {
245 Done,
246 Pending(PendingRuleResult),
247}
248
249pub type PendingRuleResult = JoinHandle<Result<AsyncRuleInfo, ScannerError>>;
251
252pub struct PendingRuleJob {
253 fut: PendingRuleResult,
254 path: Path<'static>,
255}
256
257pub struct AsyncRuleInfo {
258 rule_index: usize,
259 rule_matches: Vec<StringMatch>,
260}
261
262pub type RuleResult = Result<RuleStatus, ScannerError>;
264
265pub trait CompiledRule: Send + Sync {
267 fn init_per_scanner_data(&self, _per_scanner_data: &mut SharedData) {
268 }
270
271 fn init_per_string_data(&self, _labels: &Labels, _per_string_data: &mut SharedData) {
272 }
274
275 fn init_per_event_data(&self, _per_event_data: &mut SharedData) {
276 }
278
279 fn get_string_matches(
280 &self,
281 content: &str,
282 path: &Path,
283 ctx: &mut StringMatchesCtx<'_>,
284 ) -> RuleResult;
285
286 fn should_exclude_multipass_v0(&self) -> bool {
289 false
291 }
292
293 fn on_excluded_match_multipass_v0(&self) {
294 }
296}
297
298impl<T> RuleConfig for Box<T>
299where
300 T: RuleConfig + ?Sized,
301{
302 fn convert_to_compiled_rule(
303 &self,
304 rule_index: usize,
305 labels: Labels,
306 ) -> Result<Box<dyn CompiledRule>, CreateScannerError> {
307 self.as_ref().convert_to_compiled_rule(rule_index, labels)
308 }
309}
310
/// Feature switches chosen on the [`ScannerBuilder`] and fixed for the lifetime of a
/// scanner.
#[derive(Debug, PartialEq, Clone)]
struct ScannerFeatures {
    /// Forwarded to `ScopedRuleSet::with_implicit_index_wildcards` at build time.
    pub add_implicit_index_wildcards: bool,
    /// Enables dropping matches whose text is in the excluded-matches set.
    pub multipass_v0_enabled: bool,
    /// When set, the matched text is copied into each returned `RuleMatch`; also
    /// required by `Scanner::validate_matches`.
    pub return_matches: bool,
    /// When set, rules whose regex matches the empty string are skipped at build
    /// time instead of failing the build.
    pub skip_rules_with_regex_matching_empty_string: bool,
}
320
321impl Default for ScannerFeatures {
322 fn default() -> Self {
323 Self {
324 add_implicit_index_wildcards: false,
325 multipass_v0_enabled: true,
326 return_matches: false,
327 skip_rules_with_regex_matching_empty_string: false,
328 }
329 }
330}
331
332pub struct ScanOptions {
333 pub blocked_rules_idx: Vec<usize>,
336 pub wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
338}
339
340impl Default for ScanOptions {
341 fn default() -> Self {
342 Self {
343 blocked_rules_idx: vec![],
344 wildcarded_indices: AHashMap::new(),
345 }
346 }
347}
348
349pub struct ScanOptionBuilder {
350 blocked_rules_idx: Vec<usize>,
351 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
352}
353
354impl ScanOptionBuilder {
355 pub fn new() -> Self {
356 Self {
357 blocked_rules_idx: vec![],
358 wildcarded_indices: AHashMap::new(),
359 }
360 }
361
362 pub fn with_blocked_rules_idx(mut self, blocked_rules_idx: Vec<usize>) -> Self {
363 self.blocked_rules_idx = blocked_rules_idx;
364 self
365 }
366
367 pub fn with_wildcarded_indices(
368 mut self,
369 wildcarded_indices: AHashMap<Path<'static>, Vec<(usize, usize)>>,
370 ) -> Self {
371 self.wildcarded_indices = wildcarded_indices;
372 self
373 }
374
375 pub fn build(self) -> ScanOptions {
376 ScanOptions {
377 blocked_rules_idx: self.blocked_rules_idx,
378 wildcarded_indices: self.wildcarded_indices,
379 }
380 }
381}
382
383pub struct Scanner {
384 rules: Vec<RootCompiledRule>,
385 scoped_ruleset: ScopedRuleSet,
386 scanner_features: ScannerFeatures,
387 metrics: ScannerMetrics,
388 labels: Labels,
389 match_validators_per_type: AHashMap<InternalMatchValidationType, Box<dyn MatchValidator>>,
390 per_scanner_data: SharedData,
391 async_scan_timeout: Duration,
392}
393
394impl Scanner {
395 pub fn builder(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
396 ScannerBuilder::new(rules)
397 }
398
399 pub fn scan<E: Event>(&self, event: &mut E) -> Result<Vec<RuleMatch>, ScannerError> {
403 self.scan_with_options(event, ScanOptions::default())
404 }
405
406 pub async fn scan_async<E: Event>(
410 &self,
411 event: &mut E,
412 ) -> Result<Vec<RuleMatch>, ScannerError> {
413 self.scan_async_with_options(event, ScanOptions::default())
414 .await
415 }
416
417 pub fn scan_with_options<E: Event>(
418 &self,
419 event: &mut E,
420 options: ScanOptions,
421 ) -> Result<Vec<RuleMatch>, ScannerError> {
422 block_on(self.internal_scan_with_metrics(event, options))
423 }
424
425 pub async fn scan_async_with_options<E: Event>(
426 &self,
427 event: &mut E,
428 options: ScanOptions,
429 ) -> Result<Vec<RuleMatch>, ScannerError> {
430 let fut = self.internal_scan_with_metrics(event, options);
431
432 let timeout = {
435 let _tokio_guard = TOKIO_RUNTIME.enter();
436 timeout(self.async_scan_timeout, fut)
437 };
438
439 timeout.await.unwrap_or(Err(ScannerError::Transient(
440 "Async scan timeout".to_string(),
441 )))
442 }
443
444 fn record_metrics(&self, output_rule_matches: &[RuleMatch], start: Instant) {
445 self.metrics
447 .duration_ns
448 .increment(start.elapsed().as_nanos() as u64);
449 self.metrics.num_scanned_events.increment(1);
451 self.metrics
453 .match_count
454 .increment(output_rule_matches.len() as u64);
455 }
456
457 async fn internal_scan_with_metrics<E: Event>(
458 &self,
459 event: &mut E,
460 options: ScanOptions,
461 ) -> Result<Vec<RuleMatch>, ScannerError> {
462 let start = Instant::now();
463 let result = self.internal_scan(event, options).await;
464 match &result {
465 Ok(rule_matches) => {
466 self.record_metrics(rule_matches, start);
467 }
468 Err(_) => {
469 self.record_metrics(&[], start);
470 }
471 }
472 result
473 }
474
475 async fn internal_scan<E: Event>(
476 &self,
477 event: &mut E,
478 options: ScanOptions,
479 ) -> Result<Vec<RuleMatch>, ScannerError> {
480 let mut rule_matches = InternalRuleMatchSet::new();
482 let mut excluded_matches = AHashSet::new();
483 let mut async_jobs = vec![];
484
485 access_regex_caches(|regex_caches| {
486 self.scoped_ruleset.visit_string_rule_combinations(
487 event,
488 ScannerContentVisitor {
489 scanner: self,
490 regex_caches,
491 rule_matches: &mut rule_matches,
492 blocked_rules: &options.blocked_rules_idx,
493 excluded_matches: &mut excluded_matches,
494 per_event_data: SharedData::new(),
495 wildcarded_indexes: &options.wildcarded_indices,
496 async_jobs: &mut async_jobs,
497 },
498 )
499 })?;
500
501 for job in async_jobs {
504 let rule_info = job.fut.await.unwrap()?;
505 rule_matches.push_async_matches(
506 &job.path,
507 rule_info
508 .rule_matches
509 .into_iter()
510 .map(|x| InternalRuleMatch::new(rule_info.rule_index, x)),
511 );
512 }
513
514 let mut output_rule_matches = vec![];
515
516 for (path, mut rule_matches) in rule_matches.into_iter() {
517 event.visit_string_mut(&path, |content| {
519 rule_matches.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
521
522 <<E as Event>::Encoding>::calculate_indices(
523 content,
524 rule_matches.iter_mut().map(
525 |rule_match: &mut InternalRuleMatch<E::Encoding>| EncodeIndices {
526 utf8_start: rule_match.utf8_start,
527 utf8_end: rule_match.utf8_end,
528 custom_start: &mut rule_match.custom_start,
529 custom_end: &mut rule_match.custom_end,
530 },
531 ),
532 );
533
534 if self.scanner_features.multipass_v0_enabled {
535 rule_matches.retain(|rule_match| {
538 if self.rules[rule_match.rule_index]
539 .inner
540 .should_exclude_multipass_v0()
541 {
542 let is_false_positive = excluded_matches
543 .contains(&content[rule_match.utf8_start..rule_match.utf8_end]);
544 if is_false_positive && self.scanner_features.multipass_v0_enabled {
545 self.rules[rule_match.rule_index].on_excluded_match_multipass_v0();
546 }
547 !is_false_positive
548 } else {
549 true
550 }
551 });
552 }
553
554 self.sort_and_remove_overlapping_rules::<E::Encoding>(&mut rule_matches);
555
556 let will_mutate = rule_matches
557 .iter()
558 .any(|rule_match| self.rules[rule_match.rule_index].match_action.is_mutating());
559
560 self.apply_match_actions(
561 content,
562 &path,
563 &mut rule_matches,
564 &mut output_rule_matches,
565 );
566
567 will_mutate
568 });
569 }
570
571 Ok(output_rule_matches)
572 }
573
574 pub fn validate_matches(
575 &self,
576 rule_matches: &mut Vec<RuleMatch>,
577 ) -> Result<(), MatchValidationError> {
578 if !self.scanner_features.return_matches {
579 return Err(MatchValidationError::NoMatchValidationType);
580 }
581 let mut match_validator_rule_match_per_type = AHashMap::new();
583
584 let mut validated_rule_matches = vec![];
585
586 for mut rule_match in rule_matches.drain(..) {
587 let rule = &self.rules[rule_match.rule_index];
588 if let Some(match_validation_type) = rule.internal_match_validation_type() {
589 match_validator_rule_match_per_type
590 .entry(match_validation_type)
591 .or_insert_with(Vec::new)
592 .push(rule_match)
593 } else {
594 rule_match.match_status.merge(MatchStatus::NotAvailable);
596 validated_rule_matches.push(rule_match);
597 }
598 }
599
600 RAYON_THREAD_POOL.install(|| {
601 use rayon::prelude::*;
602
603 match_validator_rule_match_per_type.par_iter_mut().for_each(
604 |(match_validation_type, matches_per_type)| {
605 let match_validator = self.match_validators_per_type.get(match_validation_type);
606 if let Some(match_validator) = match_validator {
607 match_validator
608 .as_ref()
609 .validate(matches_per_type, &self.rules)
610 }
611 },
612 );
613 });
614
615 for (_, mut matches) in match_validator_rule_match_per_type {
617 validated_rule_matches.append(&mut matches);
618 }
619
620 validated_rule_matches.sort_by_key(|rule_match| rule_match.start_index);
622 *rule_matches = validated_rule_matches;
623 Ok(())
624 }
625
626 fn apply_match_actions<E: Encoding>(
629 &self,
630 content: &mut String,
631 path: &Path<'static>,
632 rule_matches: &mut [InternalRuleMatch<E>],
633 output_rule_matches: &mut Vec<RuleMatch>,
634 ) {
635 let mut utf8_byte_delta: isize = 0;
636 let mut custom_index_delta: <E>::IndexShift = <E>::zero_shift();
637
638 for rule_match in rule_matches {
639 output_rule_matches.push(self.apply_match_actions_for_string::<E>(
640 content,
641 path.clone(),
642 rule_match,
643 &mut utf8_byte_delta,
644 &mut custom_index_delta,
645 ));
646 }
647 }
648
649 fn apply_match_actions_for_string<E: Encoding>(
651 &self,
652 content: &mut String,
653 path: Path<'static>,
654 rule_match: &InternalRuleMatch<E>,
655 utf8_byte_delta: &mut isize,
657
658 custom_index_delta: &mut <E>::IndexShift,
660 ) -> RuleMatch {
661 let rule = &self.rules[rule_match.rule_index];
662
663 let custom_start =
664 (<E>::get_index(&rule_match.custom_start, rule_match.utf8_start) as isize
665 + <E>::get_shift(custom_index_delta, *utf8_byte_delta)) as usize;
666
667 let mut matched_content_copy = None;
668
669 if self.scanner_features.return_matches {
670 let mutated_utf8_match_start =
672 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
673 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
674
675 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
677 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
678
679 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
680 matched_content_copy = Some(matched_content.to_string());
681 }
682
683 if rule.match_action.is_mutating() {
684 let mutated_utf8_match_start =
685 (rule_match.utf8_start as isize + *utf8_byte_delta) as usize;
686 let mutated_utf8_match_end = (rule_match.utf8_end as isize + *utf8_byte_delta) as usize;
687
688 debug_assert!(content.is_char_boundary(mutated_utf8_match_start));
690 debug_assert!(content.is_char_boundary(mutated_utf8_match_end));
691
692 let matched_content = &content[mutated_utf8_match_start..mutated_utf8_match_end];
693 if let Some(replacement) = rule.match_action.get_replacement(matched_content) {
694 let before_replacement = &matched_content[replacement.start..replacement.end];
695
696 <E>::adjust_shift(
698 custom_index_delta,
699 before_replacement,
700 &replacement.replacement,
701 );
702 *utf8_byte_delta +=
703 replacement.replacement.len() as isize - before_replacement.len() as isize;
704
705 let replacement_start = mutated_utf8_match_start + replacement.start;
706 let replacement_end = mutated_utf8_match_start + replacement.end;
707 content.replace_range(replacement_start..replacement_end, &replacement.replacement);
708 }
709 }
710
711 let shift_offset = <E>::get_shift(custom_index_delta, *utf8_byte_delta);
712 let custom_end = (<E>::get_index(&rule_match.custom_end, rule_match.utf8_end) as isize
713 + shift_offset) as usize;
714
715 let rule = &self.rules[rule_match.rule_index];
716
717 let match_status: MatchStatus = if rule.match_validation_type.is_some() {
718 MatchStatus::NotChecked
719 } else {
720 MatchStatus::NotAvailable
721 };
722
723 RuleMatch {
724 rule_index: rule_match.rule_index,
725 path,
726 replacement_type: rule.match_action.replacement_type(),
727 start_index: custom_start,
728 end_index_exclusive: custom_end,
729 shift_offset,
730 match_value: matched_content_copy,
731 match_status,
732 }
733 }
734
735 fn sort_and_remove_overlapping_rules<E: Encoding>(
736 &self,
737 rule_matches: &mut Vec<InternalRuleMatch<E>>,
738 ) {
739 rule_matches.sort_unstable_by(|a, b| {
743 let ord = self.rules[a.rule_index]
745 .match_action
746 .is_mutating()
747 .cmp(&self.rules[b.rule_index].match_action.is_mutating())
748 .reverse();
749
750 let ord = ord.then(a.utf8_start.cmp(&b.utf8_start));
752
753 let ord = ord.then(a.len().cmp(&b.len()).reverse());
755
756 let ord = ord.then(a.rule_index.cmp(&b.rule_index));
758
759 ord.reverse()
761 });
762
763 let mut retained_rules: Vec<InternalRuleMatch<E>> = vec![];
764
765 'rule_matches: while let Some(rule_match) = rule_matches.pop() {
766 if self.rules[rule_match.rule_index].match_action.is_mutating() {
767 if let Some(last) = retained_rules.last() {
769 if last.utf8_end > rule_match.utf8_start {
770 continue;
771 }
772 }
773 } else {
774 for retained_rule in &retained_rules {
777 if retained_rule.utf8_start < rule_match.utf8_end
778 && retained_rule.utf8_end > rule_match.utf8_start
779 {
780 continue 'rule_matches;
781 }
782 }
783 };
784 retained_rules.push(rule_match);
785 }
786
787 retained_rules.sort_unstable_by_key(|rule_match| rule_match.utf8_start);
789
790 *rule_matches = retained_rules;
791 }
792}
793
794impl Drop for Scanner {
795 fn drop(&mut self) {
796 let stats = &*GLOBAL_STATS;
797 stats.scanner_deletions.increment(1);
798 stats.decrement_total_scanners();
799 }
800}
801
802#[derive(Default)]
803pub struct ScannerBuilder<'a> {
804 rules: &'a [RootRuleConfig<Arc<dyn RuleConfig>>],
805 labels: Labels,
806 scanner_features: ScannerFeatures,
807 async_scan_timeout: Duration,
808}
809
810impl ScannerBuilder<'_> {
811 pub fn new(rules: &[RootRuleConfig<Arc<dyn RuleConfig>>]) -> ScannerBuilder<'_> {
812 ScannerBuilder {
813 rules,
814 labels: Labels::empty(),
815 scanner_features: ScannerFeatures::default(),
816 async_scan_timeout: Duration::from_secs(60),
817 }
818 }
819
820 pub fn labels(mut self, labels: Labels) -> Self {
821 self.labels = labels;
822 self
823 }
824
825 pub fn with_async_scan_timeout(mut self, duration: Duration) -> Self {
826 self.async_scan_timeout = duration;
827 self
828 }
829
830 pub fn with_implicit_wildcard_indexes_for_scopes(mut self, value: bool) -> Self {
831 self.scanner_features.add_implicit_index_wildcards = value;
832 self
833 }
834
835 pub fn with_return_matches(mut self, value: bool) -> Self {
836 self.scanner_features.return_matches = value;
837 self
838 }
839
840 pub fn with_multipass_v0(mut self, value: bool) -> Self {
844 self.scanner_features.multipass_v0_enabled = value;
845 self
846 }
847
848 pub fn with_skip_rules_with_regex_matching_empty_string(mut self, value: bool) -> Self {
849 self.scanner_features
850 .skip_rules_with_regex_matching_empty_string = value;
851 self
852 }
853
854 pub fn build(self) -> Result<Scanner, CreateScannerError> {
855 let mut match_validators_per_type = AHashMap::new();
856
857 for rule in self.rules.iter() {
858 if let Some(match_validation_type) = &rule.get_third_party_active_checker() {
859 if match_validation_type.can_create_match_validator() {
860 let internal_type = match_validation_type.get_internal_match_validation_type();
861 let match_validator = match_validation_type.into_match_validator();
862 if let Ok(match_validator) = match_validator {
863 if !match_validators_per_type.contains_key(&internal_type) {
864 match_validators_per_type.insert(internal_type, match_validator);
865 }
866 } else {
867 return Err(CreateScannerError::InvalidMatchValidator(
868 MatchValidatorCreationError::InternalError,
869 ));
870 }
871 }
872 }
873 }
874
875 let compiled_rules = self
876 .rules
877 .iter()
878 .enumerate()
879 .filter_map(|(rule_index, config)| {
880 let inner = match config.convert_to_compiled_rule(rule_index, self.labels.clone()) {
881 Ok(inner) => Ok(inner),
882 Err(err) => {
883 if self
884 .scanner_features
885 .skip_rules_with_regex_matching_empty_string
886 && err
887 == CreateScannerError::InvalidRegex(
888 RegexValidationError::MatchesEmptyString,
889 )
890 {
891 #[allow(clippy::print_stdout)]
893 {
894 println!("skipping rule that matches empty string: rule_index={}, labels={:?}", rule_index, self.labels.clone());
895 }
896 return None;
897 } else {
898 Err(err)
899 }
900 }
901 };
902 Some((config, inner))
903 })
904 .map(|(config, inner)| {
905 config.match_action.validate()?;
906 Ok(RootCompiledRule {
907 inner: inner?,
908 scope: config.scope.clone(),
909 match_action: config.match_action.clone(),
910 match_validation_type: config.get_third_party_active_checker().cloned(),
911 })
912 })
913 .collect::<Result<Vec<RootCompiledRule>, CreateScannerError>>()?;
914
915 let mut per_scanner_data = SharedData::new();
916
917 compiled_rules.iter().for_each(|rule| {
918 rule.init_per_scanner_data(&mut per_scanner_data);
919 });
920
921 let scoped_ruleset = ScopedRuleSet::new(
922 &compiled_rules
923 .iter()
924 .map(|rule| rule.scope.clone())
925 .collect::<Vec<_>>(),
926 )
927 .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards);
928
929 {
930 let stats = &*GLOBAL_STATS;
931 stats.scanner_creations.increment(1);
932 stats.increment_total_scanners();
933 }
934
935 Ok(Scanner {
936 rules: compiled_rules,
937 scoped_ruleset,
938 scanner_features: self.scanner_features,
939 metrics: ScannerMetrics::new(&self.labels),
940 match_validators_per_type,
941 labels: self.labels,
942 per_scanner_data,
943 async_scan_timeout: self.async_scan_timeout,
944 })
945 }
946}
947
948struct ScannerContentVisitor<'a, E: Encoding> {
949 scanner: &'a Scanner,
950 regex_caches: &'a mut RegexCaches,
951 rule_matches: &'a mut InternalRuleMatchSet<E>,
952 blocked_rules: &'a Vec<usize>,
955 excluded_matches: &'a mut AHashSet<String>,
956 per_event_data: SharedData,
957 wildcarded_indexes: &'a AHashMap<Path<'static>, Vec<(usize, usize)>>,
958 async_jobs: &'a mut Vec<PendingRuleJob>,
959}
960
961impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> {
962 fn visit_content<'b>(
963 &'b mut self,
964 path: &Path<'a>,
965 content: &str,
966 mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor,
967 exclusion_check: ExclusionCheck<'b>,
968 ) -> Result<bool, ScannerError> {
969 let mut path_rules_matches = vec![];
971
972 let mut per_string_data = SharedData::new();
974 let wildcard_indices_per_path = self.wildcarded_indexes.get(path);
975
976 rule_visitor.visit_rule_indices(|rule_index| {
977 if self.blocked_rules.contains(&rule_index) {
978 return Ok(());
979 }
980 let rule = &self.scanner.rules[rule_index];
981 {
982 let mut emitter = |rule_match: StringMatch| {
984 assert_ne!(rule_match.start, rule_match.end, "empty match detected");
987 path_rules_matches.push(InternalRuleMatch::new(rule_index, rule_match));
988 };
989
990 rule.init_per_string_data(&self.scanner.labels, &mut per_string_data);
991
992 rule.init_per_event_data(&mut self.per_event_data);
994
995 let mut ctx = StringMatchesCtx {
996 rule_index,
997 regex_caches: self.regex_caches,
998 exclusion_check: &exclusion_check,
999 excluded_matches: self.excluded_matches,
1000 match_emitter: &mut emitter,
1001 wildcard_indices: wildcard_indices_per_path,
1002 per_string_data: &mut per_string_data,
1003 per_scanner_data: &self.scanner.per_scanner_data,
1004 per_event_data: &mut self.per_event_data,
1005 };
1006
1007 let async_status = rule.get_string_matches(content, path, &mut ctx)?;
1008
1009 match async_status {
1010 RuleStatus::Done => {
1011 }
1013 RuleStatus::Pending(fut) => {
1014 self.async_jobs.push(PendingRuleJob {
1015 fut,
1016 path: path.into_static(),
1017 });
1018 }
1019 }
1020 }
1021 Ok(())
1022 })?;
1023
1024 let needs_to_access_content = !path_rules_matches.is_empty() || !self.async_jobs.is_empty();
1029
1030 self.rule_matches
1031 .push_sync_matches(path, path_rules_matches);
1032
1033 Ok(needs_to_access_content)
1034 }
1035}
1036
1037fn get_next_regex_start(content: &str, regex_match: &Match) -> Option<usize> {
1039 if let Some((i, _)) = content[regex_match.start()..].char_indices().nth(1) {
1041 Some(regex_match.start() + i)
1042 } else {
1043 None
1045 }
1046}
1047
1048fn is_false_positive_match(
1049 regex_match: &Match,
1050 rule: &RegexCompiledRule,
1051 content: &str,
1052 check_excluded_keywords: bool,
1053) -> bool {
1054 if check_excluded_keywords {
1055 if let Some(excluded_keywords) = &rule.excluded_keywords {
1056 if excluded_keywords.is_false_positive_match(content, regex_match.start()) {
1057 return true;
1058 }
1059 }
1060 }
1061
1062 if let Some(validator) = rule.validator.as_ref() {
1063 if !validator.is_valid_match(&content[regex_match.range()]) {
1064 return true;
1065 };
1066 }
1067 false
1068}