1use crate::{
4 api, charclasses,
5 codepointset::{interval_contains, CodePointSet, Interval},
6 ir,
7 types::{
8 BracketContents, CaptureGroupID, CaptureGroupName, CharacterClassType, MAX_CAPTURE_GROUPS,
9 MAX_LOOPS,
10 },
11 unicode::{
12 self, unicode_property_from_str, unicode_property_name_from_str, PropertyEscapeKind,
13 },
14 unicodetables::{id_continue_ranges, id_start_ranges},
15 util::to_char_sat,
16};
17use core::{fmt, iter::Peekable};
18#[cfg(feature = "std")]
19use std::collections::HashMap;
20#[cfg(not(feature = "std"))]
21use {
22 alloc::{
23 boxed::Box,
24 string::{String, ToString},
25 vec::Vec,
26 },
27 hashbrown::HashMap,
28};
29
30#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
34pub struct Error {
35 pub text: String,
36}
37
38impl fmt::Display for Error {
39 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
40 f.write_str(&self.text)
41 }
42}
43
44#[cfg(feature = "std")]
45impl std::error::Error for Error {}
46
47enum ClassAtom {
48 CodePoint(u32),
49 CharacterClass {
50 class_type: CharacterClassType,
51 positive: bool,
52 },
53 Range {
54 iv: CodePointSet,
55 negate: bool,
56 },
57}
58
59#[derive(Debug, Clone)]
61struct ClassSet {
62 codepoints: CodePointSet,
63 alternatives: ClassSetAlternativeStrings,
64}
65
66impl ClassSet {
67 fn new() -> Self {
68 ClassSet {
69 codepoints: CodePointSet::new(),
70 alternatives: ClassSetAlternativeStrings::new(),
71 }
72 }
73
74 fn node(self, icase: bool, negate_set: bool) -> ir::Node {
75 let codepoints = if icase {
76 unicode::add_icase_code_points(self.codepoints)
77 } else {
78 self.codepoints
79 };
80 if codepoints.is_empty() && self.alternatives.0.is_empty() {
81 ir::Node::Bracket(BracketContents {
82 invert: negate_set,
83 cps: codepoints,
84 })
85 } else if codepoints.is_empty() {
86 make_alt(self.alternatives.to_nodes(icase))
87 } else if self.alternatives.0.is_empty() {
88 ir::Node::Bracket(BracketContents {
89 invert: negate_set,
90 cps: codepoints,
91 })
92 } else {
93 let mut nodes = self.alternatives.to_nodes(icase);
94 nodes.push(ir::Node::Bracket(BracketContents {
95 invert: negate_set,
96 cps: codepoints,
97 }));
98 make_alt(nodes)
99 }
100 }
101
102 fn union_operand(&mut self, operand: ClassSetOperand) {
103 match operand {
104 ClassSetOperand::ClassSetCharacter(c) => {
105 self.codepoints.add_one(c);
106 }
107 ClassSetOperand::CharacterClassEscape(cps) => {
108 self.codepoints.add_set(cps);
109 }
110 ClassSetOperand::Class(class) => {
111 self.codepoints.add_set(class.codepoints);
112 self.alternatives.extend(class.alternatives);
113 }
114 ClassSetOperand::ClassStringDisjunction(s) => {
115 self.alternatives.extend(s);
116 }
117 }
118 }
119
120 fn intersect_operand(&mut self, operand: ClassSetOperand) {
121 match operand {
122 ClassSetOperand::ClassSetCharacter(c) => {
123 if self.codepoints.contains(c) {
124 self.codepoints =
125 CodePointSet::from_sorted_disjoint_intervals(Vec::from([Interval {
126 first: c,
127 last: c,
128 }]));
129 } else {
130 self.codepoints.clear();
131 }
132 let mut found_alternative = false;
133 for alternative in &mut self.alternatives.0 {
134 if alternative.len() == 1 && alternative[0] == c {
135 found_alternative = true;
136 break;
137 }
138 }
139 self.alternatives.0.clear();
140 if found_alternative {
141 self.alternatives.0.push(Vec::from([c]));
142 }
143 }
144 ClassSetOperand::CharacterClassEscape(cps) => {
145 self.codepoints.intersect(cps.intervals());
146 let mut retained = Vec::new();
147 for alternative in &self.alternatives.0 {
148 if alternative.len() == 1 && cps.contains(alternative[0]) {
149 retained.push(alternative.clone());
150 }
151 }
152 self.alternatives.0 = retained;
153 }
154 ClassSetOperand::Class(class) => {
155 let mut retained_codepoints = CodePointSet::new();
156 for alternative in &class.alternatives.0 {
157 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
158 retained_codepoints.add_one(alternative[0]);
159 }
160 }
161 let mut retained_alternatives = ClassSetAlternativeStrings::new();
162 for alternative in &self.alternatives.0 {
163 if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
164 retained_alternatives.0.push(alternative.clone());
165 }
166 }
167 self.codepoints.intersect(class.codepoints.intervals());
168 self.codepoints.add_set(retained_codepoints);
169 self.alternatives.intersect(class.alternatives);
170 self.alternatives.extend(retained_alternatives);
171 }
172 ClassSetOperand::ClassStringDisjunction(s) => {
173 let mut retained = CodePointSet::new();
174 for alternative in &s.0 {
175 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
176 retained.add_one(alternative[0]);
177 }
178 }
179 self.codepoints = retained;
180 self.alternatives.intersect(s);
181 }
182 }
183 }
184
185 fn subtract_operand(&mut self, operand: ClassSetOperand) {
186 match operand {
187 ClassSetOperand::ClassSetCharacter(c) => {
188 self.codepoints.remove(&[Interval { first: c, last: c }]);
189 self.alternatives
190 .remove(ClassSetAlternativeStrings(Vec::from([Vec::from([c])])));
191 }
192 ClassSetOperand::CharacterClassEscape(cps) => {
193 self.codepoints.remove(cps.intervals());
194 let mut to_remove = ClassSetAlternativeStrings::new();
195 for alternative in &self.alternatives.0 {
196 if alternative.len() == 1 && cps.contains(alternative[0]) {
197 to_remove.0.push(alternative.clone());
198 }
199 }
200 self.alternatives.remove(to_remove);
201 }
202 ClassSetOperand::Class(class) => {
203 let mut codepoints_removed = CodePointSet::new();
204 for alternative in &class.alternatives.0 {
205 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
206 codepoints_removed.add_one(alternative[0]);
207 }
208 }
209 let mut alternatives_removed = ClassSetAlternativeStrings::new();
210 for alternative in &self.alternatives.0 {
211 if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
212 alternatives_removed.0.push(alternative.clone());
213 }
214 }
215 self.codepoints.remove(codepoints_removed.intervals());
216 self.codepoints.remove(class.codepoints.intervals());
217 self.alternatives.remove(alternatives_removed);
218 self.alternatives.remove(class.alternatives);
219 }
220 ClassSetOperand::ClassStringDisjunction(s) => {
221 let mut to_remove = CodePointSet::new();
222 for alternative in &s.0 {
223 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
224 to_remove.add_one(alternative[0]);
225 }
226 }
227 self.codepoints.remove(to_remove.intervals());
228 self.alternatives.remove(s);
229 }
230 }
231 }
232}
233
234#[derive(Debug, Clone)]
236enum ClassSetOperand {
237 ClassSetCharacter(u32),
238 CharacterClassEscape(CodePointSet),
239 Class(ClassSet),
240 ClassStringDisjunction(ClassSetAlternativeStrings),
241}
242
243#[derive(Debug, Clone)]
244struct ClassSetAlternativeStrings(Vec<Vec<u32>>);
245
246impl ClassSetAlternativeStrings {
247 fn new() -> Self {
248 ClassSetAlternativeStrings(Vec::new())
249 }
250
251 fn extend(&mut self, other: Self) {
252 self.0.extend(other.0);
253 }
254
255 fn intersect(&mut self, other: Self) {
256 let mut retained = Vec::new();
257 for string in &self.0 {
258 for other_string in &other.0 {
259 if string == other_string {
260 retained.push(string.clone());
261 break;
262 }
263 }
264 }
265 self.0 = retained;
266 }
267
268 fn remove(&mut self, other: Self) {
269 self.0.retain(|string| !other.0.contains(string));
270 }
271
272 fn to_nodes(&self, icase: bool) -> Vec<ir::Node> {
273 let mut nodes = Vec::new();
274 for string in &self.0 {
275 nodes.push(ir::Node::Cat(
276 string
277 .iter()
278 .map(|cp| ir::Node::Char { c: *cp, icase })
279 .collect::<Vec<_>>(),
280 ));
281 }
282 nodes
283 }
284}
285
286fn error<S, T>(text: S) -> Result<T, Error>
287where
288 S: ToString,
289{
290 Err(Error {
291 text: text.to_string(),
292 })
293}
294
295fn make_cat(nodes: ir::NodeList) -> ir::Node {
296 match nodes.len() {
297 0 => ir::Node::Empty,
298 1 => nodes.into_iter().next().unwrap(),
299 _ => ir::Node::Cat(nodes),
300 }
301}
302
303fn make_alt(nodes: ir::NodeList) -> ir::Node {
304 let mut mright = None;
305 for node in nodes.into_iter().rev() {
306 match mright {
307 None => mright = Some(node),
308 Some(right) => mright = Some(ir::Node::Alt(Box::new(node), Box::new(right))),
309 }
310 }
311 mright.unwrap_or(ir::Node::Empty)
312}
313
314fn codepoints_from_class(ct: CharacterClassType, positive: bool) -> CodePointSet {
317 let mut cps;
318 match ct {
319 CharacterClassType::Digits => {
320 cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::DIGITS.to_vec())
321 }
322 CharacterClassType::Words => {
323 cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WORD_CHARS.to_vec())
324 }
325 CharacterClassType::Spaces => {
326 cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WHITESPACE.to_vec());
327 for &iv in charclasses::LINE_TERMINATOR.iter() {
328 cps.add(iv)
329 }
330 }
331 };
332 if !positive {
333 cps = cps.inverted()
334 }
335 cps
336}
337
338fn make_bracket_class(ct: CharacterClassType, positive: bool) -> ir::Node {
340 ir::Node::Bracket(BracketContents {
341 invert: false,
342 cps: codepoints_from_class(ct, positive),
343 })
344}
345
346fn add_class_atom(bc: &mut BracketContents, atom: ClassAtom) {
347 match atom {
348 ClassAtom::CodePoint(c) => bc.cps.add_one(c),
349 ClassAtom::CharacterClass {
350 class_type,
351 positive,
352 } => {
353 bc.cps.add_set(codepoints_from_class(class_type, positive));
354 }
355 ClassAtom::Range { iv, negate } => {
356 if negate {
357 bc.cps.add_set(iv.inverted());
358 } else {
359 bc.cps.add_set(iv);
360 }
361 }
362 }
363}
364
365struct LookaroundParams {
366 negate: bool,
367 backwards: bool,
368}
369
370struct Parser<I>
372where
373 I: Iterator<Item = u32>,
374{
375 input: Peekable<I>,
377
378 flags: api::Flags,
380
381 loop_count: u32,
383
384 group_count: CaptureGroupID,
386
387 group_count_max: u32,
389
390 named_group_indices: HashMap<CaptureGroupName, u32>,
392
393 has_lookbehind: bool,
395}
396
397impl<I> Parser<I>
398where
399 I: Iterator<Item = u32> + Clone,
400{
401 fn consume<C: Into<u32>>(&mut self, c: C) -> u32 {
403 let nc = self.input.next();
404 core::debug_assert!(nc == Some(c.into()), "char was not next");
405 nc.unwrap()
406 }
407
408 fn try_consume<C: Into<u32>>(&mut self, c: C) -> bool {
411 self.input.next_if_eq(&c.into()).is_some()
412 }
413
414 fn try_consume_str(&mut self, s: &str) -> bool {
417 let mut cursor = self.input.clone();
418 for c1 in s.chars() {
419 if cursor.next() != Some(c1 as u32) {
420 return false;
421 }
422 }
423 self.input = cursor;
424 true
425 }
426
427 fn fold_if_icase(&self, c: u32) -> u32 {
429 if self.flags.icase {
430 unicode::fold_code_point(c, self.flags.unicode)
431 } else {
432 c
433 }
434 }
435
436 fn peek(&mut self) -> Option<u32> {
438 self.input.peek().copied()
439 }
440
441 fn next(&mut self) -> Option<u32> {
443 self.input.next()
444 }
445
446 fn try_parse(&mut self) -> Result<ir::Regex, Error> {
447 self.parse_capture_groups()?;
448
449 let body = self.consume_disjunction()?;
452 match self.input.peek().copied() {
453 Some(c) if c == ')' as u32 => error("Unbalanced parenthesis"),
454 Some(c) => error(format!(
455 "Unexpected char: {}",
456 char::from_u32(c)
457 .map(String::from)
458 .unwrap_or_else(|| format!("\\u{c:04X}"))
459 )),
460 None => self.finalize(ir::Regex {
461 node: make_cat(vec![body, ir::Node::Goal]),
462 flags: self.flags,
463 }),
464 }
465 }
466
467 fn consume_disjunction(&mut self) -> Result<ir::Node, Error> {
469 let mut terms = vec![self.consume_term()?];
470 while self.try_consume('|') {
471 terms.push(self.consume_term()?)
472 }
473 Ok(make_alt(terms))
474 }
475
476 fn consume_term(&mut self) -> Result<ir::Node, Error> {
478 let mut result: Vec<ir::Node> = Vec::new();
479 loop {
480 let start_group = self.group_count;
481 let mut start_offset = result.len();
482 let mut quantifier_allowed = true;
483
484 let nc = self.peek();
485 if nc.is_none() {
486 return Ok(make_cat(result));
487 }
488 let c = nc.unwrap();
489 match to_char_sat(c) {
490 ')' | '|' => break,
492 '^' => {
494 self.consume('^');
495 result.push(ir::Node::Anchor(ir::AnchorType::StartOfLine));
496 quantifier_allowed = false;
497 }
498 '$' => {
500 self.consume('$');
501 result.push(ir::Node::Anchor(ir::AnchorType::EndOfLine));
502 quantifier_allowed = false;
503 }
504
505 '\\' => {
506 self.consume('\\');
507 let Some(c) = self.peek() else {
508 return error("Incomplete escape");
509 };
510 match to_char_sat(c) {
511 'b' => {
513 self.consume('b');
514 result.push(ir::Node::WordBoundary { invert: false });
515 }
516 'B' => {
518 self.consume('B');
519 result.push(ir::Node::WordBoundary { invert: true });
520 }
521 'c' if !self.flags.unicode => {
524 self.consume('c');
525 if self
526 .peek()
527 .and_then(char::from_u32)
528 .map(|c| c.is_ascii_alphabetic())
529 == Some(true)
530 {
531 result.push(ir::Node::Char {
532 c: self.next().expect("char was not next") % 32,
533 icase: self.flags.icase,
534 });
535 } else {
536 start_offset += 1;
537 result.push(ir::Node::Char {
538 c: u32::from('\\'),
539 icase: self.flags.icase,
540 });
541 result.push(ir::Node::Char {
542 c: u32::from('c'),
543 icase: self.flags.icase,
544 });
545 }
546 }
547 _ => {
549 result.push(self.consume_atom_escape()?);
550 }
551 }
552 }
553
554 '.' => {
556 self.consume('.');
557 result.push(if self.flags.dot_all {
558 ir::Node::MatchAny
559 } else {
560 ir::Node::MatchAnyExceptLineTerminator
561 });
562 }
563
564 '(' => {
565 if self.try_consume_str("(?=") {
566 quantifier_allowed = !self.flags.unicode;
568 result.push(self.consume_lookaround_assertion(LookaroundParams {
569 negate: false,
570 backwards: false,
571 })?);
572 } else if self.try_consume_str("(?!") {
573 quantifier_allowed = !self.flags.unicode;
575 result.push(self.consume_lookaround_assertion(LookaroundParams {
576 negate: true,
577 backwards: false,
578 })?);
579 } else if self.try_consume_str("(?<=") {
580 quantifier_allowed = false;
582 self.has_lookbehind = true;
583 result.push(self.consume_lookaround_assertion(LookaroundParams {
584 negate: false,
585 backwards: true,
586 })?);
587 } else if self.try_consume_str("(?<!") {
588 quantifier_allowed = false;
590 self.has_lookbehind = true;
591 result.push(self.consume_lookaround_assertion(LookaroundParams {
592 negate: true,
593 backwards: true,
594 })?);
595 } else if self.try_consume_str("(?:") {
596 result.push(self.consume_disjunction()?);
598 } else {
599 self.consume('(');
601 let group = self.group_count;
602 if self.group_count as usize >= MAX_CAPTURE_GROUPS {
603 return error("Capture group count limit exceeded");
604 }
605 self.group_count += 1;
606
607 if self.try_consume_str("?") {
609 let group_name = if let Some(group_name) =
610 self.try_consume_named_capture_group_name()
611 {
612 group_name
613 } else {
614 return error("Invalid token at named capture group identifier");
615 };
616 let contents = self.consume_disjunction()?;
617 result.push(ir::Node::NamedCaptureGroup(
618 Box::new(contents),
619 group,
620 group_name,
621 ))
622 } else {
623 let contents = self.consume_disjunction()?;
624 result.push(ir::Node::CaptureGroup(Box::new(contents), group))
625 }
626 }
627 if !self.try_consume(')') {
628 return error("Unbalanced parenthesis");
629 }
630 }
631
632 '[' if self.flags.unicode_sets => {
634 self.consume('[');
635 let negate_set = self.try_consume('^');
636 result.push(
637 self.consume_class_set_expression(negate_set)?
638 .node(self.flags.icase, negate_set),
639 );
640 }
641
642 '[' => {
643 result.push(self.consume_bracket()?);
644 }
645
646 '{' if !self.flags.unicode => {
648 if self.try_consume_braced_quantifier().is_some() {
649 return error("Invalid braced quantifier");
650 }
651
652 result.push(ir::Node::Char {
654 c: self.consume(c),
655 icase: self.flags.icase,
656 })
657 }
658
659 '*' | '+' | '?' | ']' | '{' | '}' if self.flags.unicode => {
661 return error("Invalid atom character");
662 }
663
664 '*' | '+' | '?' => {
666 return error("Invalid atom character");
667 }
668
669 _ => {
672 self.consume(c);
673 result.push(ir::Node::Char {
674 c: self.fold_if_icase(c),
675 icase: self.flags.icase,
676 })
677 }
678 }
679
680 if let Some(quant) = self.try_consume_quantifier()? {
682 if !quantifier_allowed {
683 return error("Quantifier not allowed here");
684 }
685 if quant.min > quant.max {
690 return error("Invalid quantifier");
691 }
692 let quantifee = result.split_off(start_offset);
693 if self.loop_count as usize >= MAX_LOOPS {
694 return error("Loop count limit exceeded");
695 }
696 self.loop_count += 1;
697 result.push(ir::Node::Loop {
698 loopee: Box::new(make_cat(quantifee)),
699 quant,
700 enclosed_groups: start_group..self.group_count,
701 });
702 }
703 }
704 Ok(make_cat(result))
705 }
706
707 fn consume_bracket(&mut self) -> Result<ir::Node, Error> {
709 self.consume('[');
710 let invert = self.try_consume('^');
711 let mut result = BracketContents {
712 invert,
713 cps: CodePointSet::default(),
714 };
715
716 loop {
717 match self.peek().map(to_char_sat) {
718 None => {
719 return error("Unbalanced bracket");
720 }
721 Some(']') => {
722 self.consume(']');
723 if self.flags.icase {
724 result.cps = unicode::add_icase_code_points(result.cps);
725 }
726 return Ok(ir::Node::Bracket(result));
727 }
728 _ => {}
729 }
730
731 let Some(first) = self.try_consume_bracket_class_atom()? else {
733 continue;
734 };
735
736 if !self.try_consume('-') {
738 add_class_atom(&mut result, first);
739 continue;
740 }
741
742 let Some(second) = self.try_consume_bracket_class_atom()? else {
743 add_class_atom(&mut result, first);
745 add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
746 continue;
747 };
748
749 if let (ClassAtom::CodePoint(c1), ClassAtom::CodePoint(c2)) = (&first, &second) {
752 if c1 > c2 {
753 return error(
754 "Range values reversed, start char code is greater than end char code.",
755 );
756 }
757 result.cps.add(Interval {
758 first: *c1,
759 last: *c2,
760 });
761
762 continue;
763 }
764
765 if self.flags.unicode {
766 return error("Invalid character range");
767 }
768
769 add_class_atom(&mut result, first);
771 add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
772 add_class_atom(&mut result, second);
773 }
774 }
775
776 fn try_consume_bracket_class_atom(&mut self) -> Result<Option<ClassAtom>, Error> {
777 let c = self.peek();
778 if c.is_none() {
779 return Ok(None);
780 }
781 let c = c.unwrap();
782 match to_char_sat(c) {
783 ']' => Ok(None),
785
786 '\\' => {
788 self.consume('\\');
789 let ec = if let Some(ec) = self.peek() {
790 ec
791 } else {
792 return error("Unterminated escape");
793 };
794 match to_char_sat(ec) {
795 'b' => {
797 self.consume('b');
798 Ok(Some(ClassAtom::CodePoint(u32::from('\x08'))))
799 }
800 '-' if self.flags.unicode => {
802 self.consume('-');
803 Ok(Some(ClassAtom::CodePoint(u32::from('-'))))
804 }
805 'c' if !self.flags.unicode => {
806 let input = self.input.clone();
807 self.consume('c');
808 match self.peek().map(to_char_sat) {
809 Some('0'..='9' | '_') => {
811 let next = self.next().expect("char was not next");
812 Ok(Some(ClassAtom::CodePoint(next & 0x1F)))
813 }
814 Some('a'..='z' | 'A'..='Z') => {
816 let next = self.next().expect("char was not next");
817 Ok(Some(ClassAtom::CodePoint(next % 32)))
818 }
819 _ => {
821 self.input = input;
822 Ok(Some(ClassAtom::CodePoint(u32::from('\\'))))
823 }
824 }
825 }
826 'd' => {
828 self.consume('d');
829 Ok(Some(ClassAtom::CharacterClass {
830 class_type: CharacterClassType::Digits,
831 positive: true,
832 }))
833 }
834 'D' => {
836 self.consume('D');
837 Ok(Some(ClassAtom::CharacterClass {
838 class_type: CharacterClassType::Digits,
839 positive: false,
840 }))
841 }
842 's' => {
844 self.consume('s');
845 Ok(Some(ClassAtom::CharacterClass {
846 class_type: CharacterClassType::Spaces,
847 positive: true,
848 }))
849 }
850 'S' => {
852 self.consume('S');
853 Ok(Some(ClassAtom::CharacterClass {
854 class_type: CharacterClassType::Spaces,
855 positive: false,
856 }))
857 }
858 'w' => {
860 self.consume('w');
861 Ok(Some(ClassAtom::CharacterClass {
862 class_type: CharacterClassType::Words,
863 positive: true,
864 }))
865 }
866 'W' => {
868 self.consume('W');
869 Ok(Some(ClassAtom::CharacterClass {
870 class_type: CharacterClassType::Words,
871 positive: false,
872 }))
873 }
874 'p' | 'P' if self.flags.unicode => {
877 self.consume(ec);
878 let negate = ec == 'P' as u32;
879 match self.try_consume_unicode_property_escape()? {
880 PropertyEscapeKind::CharacterClass(s) => Ok(Some(ClassAtom::Range {
881 iv: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
882 negate,
883 })),
884 PropertyEscapeKind::StringSet(_) => error("Invalid property escape"),
885 }
886 }
887 _ => {
889 let cc = self.consume_character_escape()?;
890 Ok(Some(ClassAtom::CodePoint(cc)))
891 }
892 }
893 }
894
895 _ => Ok(Some(ClassAtom::CodePoint(self.consume(c)))),
896 }
897 }
898
899 fn consume_class_set_expression(&mut self, negate_set: bool) -> Result<ClassSet, Error> {
901 let mut result = ClassSet::new();
902
903 let first = match self.peek() {
904 Some(0x5D ) => {
905 self.consume(']');
906 return Ok(result);
907 }
908 Some(_) => self.consume_class_set_operand(negate_set)?,
909 None => {
910 return error("Unbalanced class set bracket");
911 }
912 };
913
914 enum ClassSetOperator {
915 Union,
916 Intersection,
917 Subtraction,
918 }
919
920 let op = match self.peek() {
921 Some(0x5D ) => {
922 self.consume(']');
923 result.union_operand(first);
924 return Ok(result);
925 }
926 Some(0x26 ) => {
927 self.consume('&');
928 if self.peek() == Some(0x26 ) {
929 self.consume('&');
930 result.union_operand(first.clone());
931 ClassSetOperator::Intersection
932 } else {
933 result.codepoints.add_one(0x26 );
934 ClassSetOperator::Union
935 }
936 }
937 Some(0x2D ) => {
938 self.consume('-');
939 if self.peek() == Some(0x2D ) {
940 self.consume('-');
941 result.union_operand(first.clone());
942 ClassSetOperator::Subtraction
943 } else {
944 match first {
945 ClassSetOperand::ClassSetCharacter(first) => {
946 let ClassSetOperand::ClassSetCharacter(last) =
947 self.consume_class_set_operand(negate_set)?
948 else {
949 return error("Invalid class set range");
950 };
951 if first > last {
952 return error("Invalid class set range");
953 }
954 result.codepoints.add(Interval { first, last });
955 }
956 _ => {
957 return error("Invalid class set range");
958 }
959 };
960 ClassSetOperator::Union
961 }
962 }
963 Some(_) => {
964 result.union_operand(first.clone());
965 ClassSetOperator::Union
966 }
967 None => {
968 return error("Unbalanced class set bracket");
969 }
970 };
971
972 match op {
973 ClassSetOperator::Union => {
974 loop {
975 let operand = match self.peek() {
976 Some(0x5D ) => {
977 self.consume(']');
978 return Ok(result);
979 }
980 Some(_) => self.consume_class_set_operand(negate_set)?,
981 None => return error("Unbalanced class set bracket"),
982 };
983 if self.peek() == Some(0x2D ) {
984 self.consume('-');
985 match operand {
986 ClassSetOperand::ClassSetCharacter(first) => {
987 let ClassSetOperand::ClassSetCharacter(last) =
988 self.consume_class_set_operand(negate_set)?
989 else {
990 return error("Invalid class set range");
991 };
992 if first > last {
993 return error("Invalid class set range");
994 }
995 result.codepoints.add(Interval { first, last });
996 }
997 _ => {
998 return error("Invalid class set range");
999 }
1000 };
1001 } else {
1002 result.union_operand(operand);
1003 }
1004 }
1005 }
1006 ClassSetOperator::Intersection => {
1008 loop {
1009 let operand = self.consume_class_set_operand(negate_set)?;
1010 result.intersect_operand(operand);
1011 match self.next() {
1012 Some(0x5D ) => return Ok(result),
1013 Some(0x26 ) => {}
1014 Some(_) => return error("Unexpected character in class set intersection"),
1015 _ => return error("Unbalanced class set bracket"),
1016 }
1017 if self.next() != Some(0x26 ) {
1018 return error("Unbalanced class set bracket");
1019 }
1020 }
1021 }
1022 ClassSetOperator::Subtraction => {
1024 loop {
1025 let operand = self.consume_class_set_operand(negate_set)?;
1026 result.subtract_operand(operand);
1027 match self.next() {
1028 Some(0x5D ) => return Ok(result),
1029 Some(0x2D ) => {}
1030 Some(_) => return error("Unexpected character in class set subtraction"),
1031 _ => return error("Unbalanced class set bracket"),
1032 }
1033 if self.next() != Some(0x2D ) {
1034 return error("Unbalanced class set bracket");
1035 }
1036 }
1037 }
1038 }
1039 }
1040
1041 fn consume_class_set_operand(&mut self, negate_set: bool) -> Result<ClassSetOperand, Error> {
1042 use ClassSetOperand::*;
1043 let Some(cp) = self.peek() else {
1044 return error("Empty class set operand");
1045 };
1046 match cp {
1047 0x5B => {
1050 self.consume('[');
1051 let negate_set = self.try_consume('^');
1052 let result = self.consume_class_set_expression(negate_set)?;
1053 if negate_set {
1054 result.codepoints.inverted();
1055 }
1056 Ok(Class(result))
1057 }
1058 0x5C => {
1063 self.consume('\\');
1064 let Some(cp) = self.peek() else {
1065 return error("Incomplete class set escape");
1066 };
1067 match cp {
1068 0x71 => {
1070 self.consume('q');
1071 if !self.try_consume('{') {
1072 return error("Invalid class set escape: expected {");
1073 }
1074 let mut alternatives = Vec::new();
1075 let mut alternative = Vec::new();
1076 loop {
1077 match self.peek() {
1078 Some(0x7D ) => {
1079 self.consume('}');
1080 if !alternative.is_empty() {
1081 alternatives.push(alternative.clone());
1082 alternative.clear();
1083 }
1084 break;
1085 }
1086 Some(0x7C ) => {
1087 self.consume('|');
1088 if !alternative.is_empty() {
1089 alternatives.push(alternative.clone());
1090 alternative.clear();
1091 }
1092 }
1093 Some(_) => {
1094 alternative.push(self.consume_class_set_character()?);
1095 }
1096 None => {
1097 return error("Unbalanced class set string disjunction");
1098 }
1099 }
1100 }
1101 Ok(ClassStringDisjunction(ClassSetAlternativeStrings(alternatives)))
1102 }
1103 0x64 => {
1105 self.consume('d');
1106 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, true)))
1107 }
1108 0x44 => {
1110 self.consume('D');
1111 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, false)))
1112 }
1113 0x73 => {
1115 self.consume('s');
1116 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, true)))
1117 }
1118 0x53 => {
1120 self.consume('S');
1121 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, false)))
1122 }
1123 0x77 => {
1125 self.consume('w');
1126 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, true)))
1127 }
1128 0x57 => {
1130 self.consume('W');
1131 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, false)))
1132 }
1133 0x70 => {
1135 self.consume('p');
1136 match self.try_consume_unicode_property_escape()? {
1137 PropertyEscapeKind::CharacterClass(intervals) => {
1138 Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1139 intervals.to_vec(),
1140 )))
1141 }
1142 PropertyEscapeKind::StringSet(_) if negate_set => error("Invalid character escape"),
1143 PropertyEscapeKind::StringSet(strings) => {
1144 Ok(ClassStringDisjunction(ClassSetAlternativeStrings(strings.iter().map(|s| s.to_vec()).collect())))
1145 }
1146 }
1147 }
1148 0x50 => {
1150 self.consume('P');
1151 match self.try_consume_unicode_property_escape()? {
1152 PropertyEscapeKind::CharacterClass(s) => {
1153 Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1154 s.to_vec(),
1155 ).inverted()))
1156 }
1157 PropertyEscapeKind::StringSet(_) => error("Invalid character escape"),
1158 }
1159 }
1160 0x62 => {
1162 Ok(ClassSetCharacter(self.consume(cp)))
1163 }
1164 _ if Self::is_class_set_reserved_punctuator(cp) => Ok(ClassSetCharacter(self.consume(cp))),
1166 _ => Ok(ClassSetCharacter(self.consume_character_escape()?))
1168 }
1169 }
1170 _ => Ok(ClassSetCharacter(self.consume_class_set_character()?)),
1173 }
1174 }
1175
1176 fn consume_class_set_character(&mut self) -> Result<u32, Error> {
1178 let Some(cp) = self.next() else {
1179 return error("Incomplete class set character");
1180 };
1181 match cp {
1182 0x5C => {
1183 let Some(cp) = self.peek() else {
1184 return error("Incomplete class set escape");
1185 };
1186 match cp {
1187 0x62 => {
1189 Ok(self.consume(cp))
1190 }
1191 _ if Self::is_class_set_reserved_punctuator(cp) => Ok(self.consume(cp)),
1193 _ => Ok(self.consume_character_escape()?)
1195 }
1196 }
1197 0x28 | 0x29 | 0x7B | 0x7D | 0x2F | 0x2D | 0x7C => error("Invalid class set character"),
1200 _ => {
1201 if Self::is_class_set_reserved_double_punctuator(cp) {
1202 if let Some(cp) = self.peek() {
1203 if Self::is_class_set_reserved_double_punctuator(cp) {
1204 return error("Invalid class set character");
1205 }
1206 }
1207 }
1208 Ok(cp)
1209 }
1210 }
1211 }
1212
1213 fn is_class_set_reserved_punctuator(cp: u32) -> bool {
1215 match cp {
1216 0x26 | 0x2D | 0x21 | 0x23 | 0x25 | 0x2C | 0x3A | 0x3B | 0x3C | 0x3D | 0x3E | 0x40 | 0x60 | 0x7E => true,
1219 _ => false,
1220 }
1221 }
1222
1223 fn is_class_set_reserved_double_punctuator(cp: u32) -> bool {
1224 match cp {
1225 0x26 | 0x21 | 0x23 | 0x24 | 0x25 | 0x2A | 0x2B | 0x2C | 0x2E | 0x3A | 0x3B | 0x3C | 0x3D | 0x3E | 0x3F | 0x40 | 0x5E | 0x60 | 0x7E => true,
1229 _ => false,
1230 }
1231 }
1232
1233 fn try_consume_quantifier(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1234 if let Some(mut quant) = self.try_consume_quantifier_prefix()? {
1235 quant.greedy = !self.try_consume('?');
1236 Ok(Some(quant))
1237 } else {
1238 Ok(None)
1239 }
1240 }
1241
1242 fn try_consume_quantifier_prefix(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1243 let nc = self.peek();
1244 if nc.is_none() {
1245 return Ok(None);
1246 }
1247 let c = nc.unwrap();
1248 match char::from_u32(c) {
1249 Some('+') => {
1250 self.consume('+');
1251 Ok(Some(ir::Quantifier {
1252 min: 1,
1253 max: usize::MAX,
1254 greedy: true,
1255 }))
1256 }
1257 Some('*') => {
1258 self.consume('*');
1259 Ok(Some(ir::Quantifier {
1260 min: 0,
1261 max: usize::MAX,
1262 greedy: true,
1263 }))
1264 }
1265 Some('?') => {
1266 self.consume('?');
1267 Ok(Some(ir::Quantifier {
1268 min: 0,
1269 max: 1,
1270 greedy: true,
1271 }))
1272 }
1273 Some('{') => {
1274 if let Some(quantifier) = self.try_consume_braced_quantifier() {
1275 Ok(Some(quantifier))
1276 } else if self.flags.unicode {
1277 error("Invalid quantifier")
1280 } else {
1281 Ok(None)
1282 }
1283 }
1284 _ => Ok(None),
1285 }
1286 }
1287
1288 fn try_consume_braced_quantifier(&mut self) -> Option<ir::Quantifier> {
1289 let pre_input = self.input.clone();
1291 self.consume('{');
1292 let optmin = self.try_consume_decimal_integer_literal();
1293 if optmin.is_none() {
1294 self.input = pre_input;
1296 return None;
1297 }
1298 let mut quant = ir::Quantifier {
1299 min: optmin.unwrap(),
1300 max: optmin.unwrap(),
1301 greedy: true,
1302 };
1303 if self.try_consume(',') {
1304 if let Some(max) = self.try_consume_decimal_integer_literal() {
1305 quant.max = max;
1307 } else {
1308 quant.max = usize::MAX;
1310 }
1311 } else {
1312 }
1314 if !self.try_consume('}') {
1315 self.input = pre_input;
1317 return None;
1318 }
1319 Some(quant)
1320 }
1321
1322 fn try_consume_decimal_integer_literal(&mut self) -> Option<usize> {
1326 let mut result: usize = 0;
1327 let mut char_count = 0;
1328 while let Some(c) = self.peek() {
1329 if let Some(digit) = char::from_u32(c).and_then(|c| char::to_digit(c, 10)) {
1330 self.consume(c);
1331 char_count += 1;
1332 result = result.saturating_mul(10);
1333 result = result.saturating_add(digit as usize);
1334 } else {
1335 break;
1336 }
1337 }
1338 if char_count > 0 {
1339 Some(result)
1340 } else {
1341 None
1342 }
1343 }
1344
1345 fn consume_lookaround_assertion(
1346 &mut self,
1347 params: LookaroundParams,
1348 ) -> Result<ir::Node, Error> {
1349 let start_group = self.group_count;
1350 let contents = self.consume_disjunction()?;
1351 let end_group = self.group_count;
1352 Ok(ir::Node::LookaroundAssertion {
1353 negate: params.negate,
1354 backwards: params.backwards,
1355 start_group,
1356 end_group,
1357 contents: Box::new(contents),
1358 })
1359 }
1360
1361 fn consume_character_escape(&mut self) -> Result<u32, Error> {
1362 let c = self.next().expect("Should have a character");
1363 let ch = to_char_sat(c);
1364 match ch {
1365 'f' => Ok(0xC),
1367 'n' => Ok(0xA),
1369 'r' => Ok(0xD),
1371 't' => Ok(0x9),
1373 'v' => Ok(0xB),
1375 'c' => {
1377 if let Some(nc) = self.next().and_then(char::from_u32) {
1378 if nc.is_ascii_lowercase() || nc.is_ascii_uppercase() {
1379 return Ok((nc as u32) % 32);
1380 }
1381 }
1382 error("Invalid character escape")
1383 }
1384 '0' if self
1386 .peek()
1387 .and_then(char::from_u32)
1388 .map(|c: char| c.is_ascii_digit())
1389 != Some(true) =>
1390 {
1391 Ok(0x0)
1392 }
1393 'x' => {
1395 let hex_to_digit = |c: char| c.to_digit(16);
1396 let x1 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1397 let x2 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1398 match (x1, x2) {
1399 (Some(x1), Some(x2)) => Ok(x1 * 16 + x2),
1400 _ if !self.flags.unicode => Ok(c),
1402 _ => error("Invalid character escape"),
1403 }
1404 }
1405 'u' => {
1407 if let Some(c) = self.try_escape_unicode_sequence() {
1408 Ok(c)
1409 } else if !self.flags.unicode {
1410 Ok(c)
1412 } else {
1413 error("Invalid unicode escape")
1414 }
1415 }
1416 '0'..='7' if !self.flags.unicode => {
1418 let Some(c1) = self.peek() else {
1419 return Ok(c - '0' as u32);
1420 };
1421 let ch1 = to_char_sat(c1);
1422
1423 match ch {
1424 '0' if ('8'..='9').contains(&ch1) => Ok(0x0),
1426 _ if !('0'..='7').contains(&ch1) => Ok(c - '0' as u32),
1428 '4'..='7' => {
1430 self.consume(c1);
1431 Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1432 }
1433 '0'..='3' => {
1436 self.consume(c1);
1437 if self.peek().map(|c2| ('0'..='7').contains(&to_char_sat(c2)))
1438 == Some(true)
1439 {
1440 let c2 = self.next().expect("char was not next");
1441 Ok((c - '0' as u32) * 64 + (c1 - '0' as u32) * 8 + c2 - '0' as u32)
1442 } else {
1443 Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1444 }
1445 }
1446 _ => unreachable!(),
1447 }
1448 }
1449 '^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
1452 | '/' => Ok(c),
1453 _ if !self.flags.unicode => Ok(c),
1455 _ => error("Invalid character escape"),
1456 }
1457 }
1458
1459 fn consume_atom_escape(&mut self) -> Result<ir::Node, Error> {
1461 let Some(c) = self.peek() else {
1462 return error("Incomplete escape");
1463 };
1464 match to_char_sat(c) {
1465 'd' | 'D' => {
1466 self.consume(c);
1467 Ok(make_bracket_class(
1468 CharacterClassType::Digits,
1469 c == 'd' as u32,
1470 ))
1471 }
1472
1473 's' | 'S' => {
1474 self.consume(c);
1475 Ok(make_bracket_class(
1476 CharacterClassType::Spaces,
1477 c == 's' as u32,
1478 ))
1479 }
1480
1481 'w' | 'W' => {
1482 self.consume(c);
1483 Ok(make_bracket_class(
1484 CharacterClassType::Words,
1485 c == 'w' as u32,
1486 ))
1487 }
1488
1489 'p' | 'P' if self.flags.unicode || self.flags.unicode_sets => {
1492 self.consume(c);
1493 let negate = c == 'P' as u32;
1494 let property_escape = self.try_consume_unicode_property_escape()?;
1495 match property_escape {
1496 PropertyEscapeKind::CharacterClass(s) => {
1497 Ok(ir::Node::Bracket(BracketContents {
1498 invert: negate,
1499 cps: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
1500 }))
1501 }
1502 PropertyEscapeKind::StringSet(_) if negate => error("Invalid character escape"),
1503 PropertyEscapeKind::StringSet(strings) => Ok(make_alt(
1504 strings
1505 .iter()
1506 .map(|s| {
1507 ir::Node::Cat(
1508 s.iter()
1509 .map(|c| ir::Node::Char {
1510 c: *c,
1511 icase: self.flags.icase,
1512 })
1513 .collect(),
1514 )
1515 })
1516 .collect(),
1517 )),
1518 }
1519 }
1520
1521 '1'..='9' if self.flags.unicode => {
1524 let group = self.try_consume_decimal_integer_literal().unwrap();
1525 if group <= self.group_count_max as usize {
1526 Ok(ir::Node::BackRef(group as u32))
1527 } else {
1528 error("Invalid character escape")
1529 }
1530 }
1531
1532 '1'..='9' => {
1536 let input = self.input.clone();
1537 let group = self.try_consume_decimal_integer_literal().unwrap();
1538
1539 if group <= self.group_count_max as usize {
1540 Ok(ir::Node::BackRef(group as u32))
1541 } else {
1542 self.input = input;
1543 let c = self.consume_character_escape()?;
1544 Ok(ir::Node::Char {
1545 c: self.fold_if_icase(c),
1546 icase: self.flags.icase,
1547 })
1548 }
1549 }
1550
1551 'k' if self.flags.unicode || !self.named_group_indices.is_empty() => {
1553 self.consume('k');
1554
1555 if let Some(group_name) = self.try_consume_named_capture_group_name() {
1557 if let Some(index) = self.named_group_indices.get(&group_name) {
1558 Ok(ir::Node::BackRef(*index + 1))
1559 } else {
1560 error(format!(
1561 "Backreference to invalid named capture group: {}",
1562 &group_name
1563 ))
1564 }
1565 } else {
1566 error("Unexpected end of named backreference")
1567 }
1568 }
1569
1570 'k' => {
1572 self.consume('k');
1573 Ok(ir::Node::Char {
1574 c: self.fold_if_icase(c),
1575 icase: self.flags.icase,
1576 })
1577 }
1578
1579 _ => {
1580 let c = self.consume_character_escape()?;
1581 Ok(ir::Node::Char {
1582 c: self.fold_if_icase(c),
1583 icase: self.flags.icase,
1584 })
1585 }
1586 }
1587 }
1588
1589 #[allow(clippy::branches_sharing_code)]
1590 fn try_escape_unicode_sequence(&mut self) -> Option<u32> {
1591 let mut orig_input = self.input.clone();
1592
1593 if self.try_consume('{') {
1595 let mut s = String::new();
1596 loop {
1597 match self.next().and_then(char::from_u32) {
1598 Some('}') => break,
1599 Some(c) => s.push(c),
1600 None => {
1601 self.input = orig_input;
1603 return None;
1604 }
1605 }
1606 }
1607
1608 match u32::from_str_radix(&s, 16) {
1609 Ok(u) => {
1610 if u > 0x10_FFFF {
1611 self.input = orig_input;
1612 None
1613 } else {
1614 Some(u)
1615 }
1616 }
1617 _ => {
1618 self.input = orig_input;
1619 None
1620 }
1621 }
1622 } else {
1623 let mut s = String::new();
1625 for _ in 0..4 {
1626 if let Some(c) = self.next().and_then(char::from_u32) {
1627 s.push(c);
1628 } else {
1629 self.input = orig_input;
1631 return None;
1632 }
1633 }
1634 match u16::from_str_radix(&s, 16) {
1635 Ok(u) => {
1636 if (0xD800..=0xDBFF).contains(&u) {
1637 if !self.try_consume_str("\\u") {
1641 return Some(u as u32);
1642 }
1643 orig_input = self.input.clone();
1644
1645 let result = (|| {
1650 let mut s = String::new();
1651 for _ in 0..4 {
1652 let c = self.next().and_then(char::from_u32)?;
1653 s.push(c);
1654 }
1655
1656 let uu = u16::from_str_radix(&s, 16).ok()?;
1657 let ch = char::decode_utf16([u, uu]).next()?.ok()?;
1658 Some(u32::from(ch))
1659 })();
1660
1661 result.or_else(|| {
1662 self.input = orig_input;
1663 Some(u as u32)
1664 })
1665 } else {
1666 Some(u as u32)
1669 }
1670 }
1671 _ => {
1672 self.input = orig_input;
1673 None
1674 }
1675 }
1676 }
1677 }
1678
1679 fn try_consume_named_capture_group_name(&mut self) -> Option<String> {
1680 if !self.try_consume('<') {
1681 return None;
1682 }
1683
1684 let orig_input = self.input.clone();
1685 let mut group_name = String::new();
1686
1687 if let Some(mut c) = self.next().and_then(char::from_u32) {
1688 if c == '\\' && self.try_consume('u') {
1689 if let Some(escaped) = self.try_escape_unicode_sequence().and_then(char::from_u32) {
1690 c = escaped;
1691 } else {
1692 self.input = orig_input;
1693 return None;
1694 }
1695 }
1696
1697 if interval_contains(id_start_ranges(), c.into()) || c == '$' || c == '_' {
1698 group_name.push(c);
1699 } else {
1700 self.input = orig_input;
1701 return None;
1702 }
1703 } else {
1704 self.input = orig_input;
1705 return None;
1706 }
1707
1708 loop {
1709 if let Some(mut c) = self.next().and_then(char::from_u32) {
1710 if c == '\\' && self.try_consume('u') {
1711 if let Some(escaped) =
1712 self.try_escape_unicode_sequence().and_then(char::from_u32)
1713 {
1714 c = escaped;
1715 } else {
1716 self.input = orig_input;
1717 return None;
1718 }
1719 }
1720
1721 if c == '>' {
1722 break;
1723 }
1724
1725 if interval_contains(id_continue_ranges(), c.into()) || c == '$' || c == '_' || c == '\u{200C}' || c == '\u{200D}'
1726 {
1728 group_name.push(c);
1729 } else {
1730 self.input = orig_input;
1731 return None;
1732 }
1733 } else {
1734 self.input = orig_input;
1735 return None;
1736 }
1737 }
1738
1739 Some(group_name)
1740 }
1741
1742 fn parse_capture_groups(&mut self) -> Result<(), Error> {
1744 let orig_input = self.input.clone();
1745
1746 loop {
1747 match self.next().map(to_char_sat) {
1748 Some('\\') => {
1749 self.next();
1750 continue;
1751 }
1752 Some('[') => loop {
1753 match self.next().map(to_char_sat) {
1754 Some('\\') => {
1755 self.next();
1756 continue;
1757 }
1758 Some(']') => break,
1759 Some(_) => continue,
1760 None => break,
1761 }
1762 },
1763 Some('(') => {
1764 if self.try_consume_str("?") {
1765 if let Some(name) = self.try_consume_named_capture_group_name() {
1766 if self
1767 .named_group_indices
1768 .insert(name, self.group_count_max)
1769 .is_some()
1770 {
1771 return error("Duplicate capture group name");
1772 }
1773 }
1774 }
1775 self.group_count_max = if self.group_count_max + 1 > MAX_CAPTURE_GROUPS as u32 {
1776 MAX_CAPTURE_GROUPS as u32
1777 } else {
1778 self.group_count_max + 1
1779 };
1780 }
1781 Some(_) => continue,
1782 None => break,
1783 }
1784 }
1785
1786 self.input = orig_input;
1787
1788 Ok(())
1789 }
1790
1791 fn try_consume_unicode_property_escape(&mut self) -> Result<PropertyEscapeKind, Error> {
1792 if !self.try_consume('{') {
1793 return error("Invalid character at property escape start");
1794 }
1795
1796 let mut buffer = String::new();
1797 let mut name = None;
1798
1799 while let Some(c) = self.peek().and_then(char::from_u32) {
1800 match c {
1801 '}' => {
1802 self.consume(c);
1803 let Some(value) =
1804 unicode_property_from_str(&buffer, name, self.flags.unicode_sets)
1805 else {
1806 break;
1807 };
1808 return Ok(value);
1809 }
1810 '=' if name.is_none() => {
1811 self.consume(c);
1812 let Some(n) = unicode_property_name_from_str(&buffer) else {
1813 break;
1814 };
1815 name = Some(n);
1816 buffer.clear();
1817 }
1818 c if c.is_ascii_alphanumeric() || c == '_' => {
1819 self.consume(c);
1820 buffer.push(c);
1821 }
1822 _ => break,
1823 }
1824 }
1825
1826 error("Invalid property name")
1827 }
1828
1829 fn finalize(&self, mut re: ir::Regex) -> Result<ir::Regex, Error> {
1830 debug_assert!(self.loop_count <= MAX_LOOPS as u32);
1831 debug_assert!(self.group_count as usize <= MAX_CAPTURE_GROUPS);
1832 if self.has_lookbehind {
1833 ir::walk_mut(
1834 false,
1835 re.flags.unicode,
1836 &mut re.node,
1837 &mut ir::Node::reverse_cats,
1838 );
1839 }
1840 Ok(re)
1841 }
1842}
1843
1844pub fn try_parse<I>(pattern: I, flags: api::Flags) -> Result<ir::Regex, Error>
1847where
1848 I: Iterator<Item = u32> + Clone,
1849{
1850 let mut p = Parser {
1851 input: pattern.peekable(),
1852 flags,
1853 loop_count: 0,
1854 group_count: 0,
1855 named_group_indices: HashMap::new(),
1856 group_count_max: 0,
1857 has_lookbehind: false,
1858 };
1859 p.try_parse()
1860}