regress/
parse.rs

1//! Parser from regex patterns to IR
2
3use crate::{
4    api, charclasses,
5    codepointset::{interval_contains, CodePointSet, Interval},
6    ir,
7    types::{
8        BracketContents, CaptureGroupID, CaptureGroupName, CharacterClassType, MAX_CAPTURE_GROUPS,
9        MAX_LOOPS,
10    },
11    unicode::{
12        self, unicode_property_from_str, unicode_property_name_from_str, PropertyEscapeKind,
13    },
14    unicodetables::{id_continue_ranges, id_start_ranges},
15    util::to_char_sat,
16};
17use core::{fmt, iter::Peekable};
18#[cfg(feature = "std")]
19use std::collections::HashMap;
20#[cfg(not(feature = "std"))]
21use {
22    alloc::{
23        boxed::Box,
24        string::{String, ToString},
25        vec::Vec,
26    },
27    hashbrown::HashMap,
28};
29
30/// Represents an error encountered during regex compilation.
31///
32/// The text contains a human-readable error message.
33#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
34pub struct Error {
35    pub text: String,
36}
37
38impl fmt::Display for Error {
39    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
40        f.write_str(&self.text)
41    }
42}
43
44#[cfg(feature = "std")]
45impl std::error::Error for Error {}
46
47enum ClassAtom {
48    CodePoint(u32),
49    CharacterClass {
50        class_type: CharacterClassType,
51        positive: bool,
52    },
53    Range {
54        iv: CodePointSet,
55        negate: bool,
56    },
57}
58
59/// Represents the result of a class set.
60#[derive(Debug, Clone)]
61struct ClassSet {
62    codepoints: CodePointSet,
63    alternatives: ClassSetAlternativeStrings,
64}
65
66impl ClassSet {
67    fn new() -> Self {
68        ClassSet {
69            codepoints: CodePointSet::new(),
70            alternatives: ClassSetAlternativeStrings::new(),
71        }
72    }
73
74    fn node(self, icase: bool, negate_set: bool) -> ir::Node {
75        let codepoints = if icase {
76            unicode::add_icase_code_points(self.codepoints)
77        } else {
78            self.codepoints
79        };
80        if codepoints.is_empty() && self.alternatives.0.is_empty() {
81            ir::Node::Bracket(BracketContents {
82                invert: negate_set,
83                cps: codepoints,
84            })
85        } else if codepoints.is_empty() {
86            make_alt(self.alternatives.to_nodes(icase))
87        } else if self.alternatives.0.is_empty() {
88            ir::Node::Bracket(BracketContents {
89                invert: negate_set,
90                cps: codepoints,
91            })
92        } else {
93            let mut nodes = self.alternatives.to_nodes(icase);
94            nodes.push(ir::Node::Bracket(BracketContents {
95                invert: negate_set,
96                cps: codepoints,
97            }));
98            make_alt(nodes)
99        }
100    }
101
102    fn union_operand(&mut self, operand: ClassSetOperand) {
103        match operand {
104            ClassSetOperand::ClassSetCharacter(c) => {
105                self.codepoints.add_one(c);
106            }
107            ClassSetOperand::CharacterClassEscape(cps) => {
108                self.codepoints.add_set(cps);
109            }
110            ClassSetOperand::Class(class) => {
111                self.codepoints.add_set(class.codepoints);
112                self.alternatives.extend(class.alternatives);
113            }
114            ClassSetOperand::ClassStringDisjunction(s) => {
115                self.alternatives.extend(s);
116            }
117        }
118    }
119
120    fn intersect_operand(&mut self, operand: ClassSetOperand) {
121        match operand {
122            ClassSetOperand::ClassSetCharacter(c) => {
123                if self.codepoints.contains(c) {
124                    self.codepoints =
125                        CodePointSet::from_sorted_disjoint_intervals(Vec::from([Interval {
126                            first: c,
127                            last: c,
128                        }]));
129                } else {
130                    self.codepoints.clear();
131                }
132                let mut found_alternative = false;
133                for alternative in &mut self.alternatives.0 {
134                    if alternative.len() == 1 && alternative[0] == c {
135                        found_alternative = true;
136                        break;
137                    }
138                }
139                self.alternatives.0.clear();
140                if found_alternative {
141                    self.alternatives.0.push(Vec::from([c]));
142                }
143            }
144            ClassSetOperand::CharacterClassEscape(cps) => {
145                self.codepoints.intersect(cps.intervals());
146                let mut retained = Vec::new();
147                for alternative in &self.alternatives.0 {
148                    if alternative.len() == 1 && cps.contains(alternative[0]) {
149                        retained.push(alternative.clone());
150                    }
151                }
152                self.alternatives.0 = retained;
153            }
154            ClassSetOperand::Class(class) => {
155                let mut retained_codepoints = CodePointSet::new();
156                for alternative in &class.alternatives.0 {
157                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
158                        retained_codepoints.add_one(alternative[0]);
159                    }
160                }
161                let mut retained_alternatives = ClassSetAlternativeStrings::new();
162                for alternative in &self.alternatives.0 {
163                    if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
164                        retained_alternatives.0.push(alternative.clone());
165                    }
166                }
167                self.codepoints.intersect(class.codepoints.intervals());
168                self.codepoints.add_set(retained_codepoints);
169                self.alternatives.intersect(class.alternatives);
170                self.alternatives.extend(retained_alternatives);
171            }
172            ClassSetOperand::ClassStringDisjunction(s) => {
173                let mut retained = CodePointSet::new();
174                for alternative in &s.0 {
175                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
176                        retained.add_one(alternative[0]);
177                    }
178                }
179                self.codepoints = retained;
180                self.alternatives.intersect(s);
181            }
182        }
183    }
184
185    fn subtract_operand(&mut self, operand: ClassSetOperand) {
186        match operand {
187            ClassSetOperand::ClassSetCharacter(c) => {
188                self.codepoints.remove(&[Interval { first: c, last: c }]);
189                self.alternatives
190                    .remove(ClassSetAlternativeStrings(Vec::from([Vec::from([c])])));
191            }
192            ClassSetOperand::CharacterClassEscape(cps) => {
193                self.codepoints.remove(cps.intervals());
194                let mut to_remove = ClassSetAlternativeStrings::new();
195                for alternative in &self.alternatives.0 {
196                    if alternative.len() == 1 && cps.contains(alternative[0]) {
197                        to_remove.0.push(alternative.clone());
198                    }
199                }
200                self.alternatives.remove(to_remove);
201            }
202            ClassSetOperand::Class(class) => {
203                let mut codepoints_removed = CodePointSet::new();
204                for alternative in &class.alternatives.0 {
205                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
206                        codepoints_removed.add_one(alternative[0]);
207                    }
208                }
209                let mut alternatives_removed = ClassSetAlternativeStrings::new();
210                for alternative in &self.alternatives.0 {
211                    if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
212                        alternatives_removed.0.push(alternative.clone());
213                    }
214                }
215                self.codepoints.remove(codepoints_removed.intervals());
216                self.codepoints.remove(class.codepoints.intervals());
217                self.alternatives.remove(alternatives_removed);
218                self.alternatives.remove(class.alternatives);
219            }
220            ClassSetOperand::ClassStringDisjunction(s) => {
221                let mut to_remove = CodePointSet::new();
222                for alternative in &s.0 {
223                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
224                        to_remove.add_one(alternative[0]);
225                    }
226                }
227                self.codepoints.remove(to_remove.intervals());
228                self.alternatives.remove(s);
229            }
230        }
231    }
232}
233
234/// Represents all different types of class set operands.
235#[derive(Debug, Clone)]
236enum ClassSetOperand {
237    ClassSetCharacter(u32),
238    CharacterClassEscape(CodePointSet),
239    Class(ClassSet),
240    ClassStringDisjunction(ClassSetAlternativeStrings),
241}
242
243#[derive(Debug, Clone)]
244struct ClassSetAlternativeStrings(Vec<Vec<u32>>);
245
246impl ClassSetAlternativeStrings {
247    fn new() -> Self {
248        ClassSetAlternativeStrings(Vec::new())
249    }
250
251    fn extend(&mut self, other: Self) {
252        self.0.extend(other.0);
253    }
254
255    fn intersect(&mut self, other: Self) {
256        let mut retained = Vec::new();
257        for string in &self.0 {
258            for other_string in &other.0 {
259                if string == other_string {
260                    retained.push(string.clone());
261                    break;
262                }
263            }
264        }
265        self.0 = retained;
266    }
267
268    fn remove(&mut self, other: Self) {
269        self.0.retain(|string| !other.0.contains(string));
270    }
271
272    fn to_nodes(&self, icase: bool) -> Vec<ir::Node> {
273        let mut nodes = Vec::new();
274        for string in &self.0 {
275            nodes.push(ir::Node::Cat(
276                string
277                    .iter()
278                    .map(|cp| ir::Node::Char { c: *cp, icase })
279                    .collect::<Vec<_>>(),
280            ));
281        }
282        nodes
283    }
284}
285
286fn error<S, T>(text: S) -> Result<T, Error>
287where
288    S: ToString,
289{
290    Err(Error {
291        text: text.to_string(),
292    })
293}
294
295fn make_cat(nodes: ir::NodeList) -> ir::Node {
296    match nodes.len() {
297        0 => ir::Node::Empty,
298        1 => nodes.into_iter().next().unwrap(),
299        _ => ir::Node::Cat(nodes),
300    }
301}
302
303fn make_alt(nodes: ir::NodeList) -> ir::Node {
304    let mut mright = None;
305    for node in nodes.into_iter().rev() {
306        match mright {
307            None => mright = Some(node),
308            Some(right) => mright = Some(ir::Node::Alt(Box::new(node), Box::new(right))),
309        }
310    }
311    mright.unwrap_or(ir::Node::Empty)
312}
313
314/// \return a CodePointSet for a given character escape (positive or negative).
315/// See ES9 21.2.2.12.
316fn codepoints_from_class(ct: CharacterClassType, positive: bool) -> CodePointSet {
317    let mut cps;
318    match ct {
319        CharacterClassType::Digits => {
320            cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::DIGITS.to_vec())
321        }
322        CharacterClassType::Words => {
323            cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WORD_CHARS.to_vec())
324        }
325        CharacterClassType::Spaces => {
326            cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WHITESPACE.to_vec());
327            for &iv in charclasses::LINE_TERMINATOR.iter() {
328                cps.add(iv)
329            }
330        }
331    };
332    if !positive {
333        cps = cps.inverted()
334    }
335    cps
336}
337
338/// \return a Bracket for a given character escape (positive or negative).
339fn make_bracket_class(ct: CharacterClassType, positive: bool) -> ir::Node {
340    ir::Node::Bracket(BracketContents {
341        invert: false,
342        cps: codepoints_from_class(ct, positive),
343    })
344}
345
346fn add_class_atom(bc: &mut BracketContents, atom: ClassAtom) {
347    match atom {
348        ClassAtom::CodePoint(c) => bc.cps.add_one(c),
349        ClassAtom::CharacterClass {
350            class_type,
351            positive,
352        } => {
353            bc.cps.add_set(codepoints_from_class(class_type, positive));
354        }
355        ClassAtom::Range { iv, negate } => {
356            if negate {
357                bc.cps.add_set(iv.inverted());
358            } else {
359                bc.cps.add_set(iv);
360            }
361        }
362    }
363}
364
365struct LookaroundParams {
366    negate: bool,
367    backwards: bool,
368}
369
370/// Represents the state used to parse a regex.
371struct Parser<I>
372where
373    I: Iterator<Item = u32>,
374{
375    /// The remaining input.
376    input: Peekable<I>,
377
378    /// Flags used.
379    flags: api::Flags,
380
381    /// Number of loops.
382    loop_count: u32,
383
384    /// Number of capturing groups.
385    group_count: CaptureGroupID,
386
387    /// Maximum number of capturing groups.
388    group_count_max: u32,
389
390    /// Named capture group references.
391    named_group_indices: HashMap<CaptureGroupName, u32>,
392
393    /// Whether a lookbehind was encountered.
394    has_lookbehind: bool,
395}
396
397impl<I> Parser<I>
398where
399    I: Iterator<Item = u32> + Clone,
400{
401    /// Consume a character, returning it.
402    fn consume<C: Into<u32>>(&mut self, c: C) -> u32 {
403        let nc = self.input.next();
404        core::debug_assert!(nc == Some(c.into()), "char was not next");
405        nc.unwrap()
406    }
407
408    /// If our contents begin with the char c, consume it from our contents
409    /// and return true. Otherwise return false.
410    fn try_consume<C: Into<u32>>(&mut self, c: C) -> bool {
411        self.input.next_if_eq(&c.into()).is_some()
412    }
413
414    /// If our contents begin with the string \p s, consume it from our contents
415    /// and return true. Otherwise return false.
416    fn try_consume_str(&mut self, s: &str) -> bool {
417        let mut cursor = self.input.clone();
418        for c1 in s.chars() {
419            if cursor.next() != Some(c1 as u32) {
420                return false;
421            }
422        }
423        self.input = cursor;
424        true
425    }
426
427    /// Fold a character if icase.
428    fn fold_if_icase(&self, c: u32) -> u32 {
429        if self.flags.icase {
430            unicode::fold_code_point(c, self.flags.unicode)
431        } else {
432            c
433        }
434    }
435
436    /// Peek at the next character.
437    fn peek(&mut self) -> Option<u32> {
438        self.input.peek().copied()
439    }
440
441    /// \return the next character.
442    fn next(&mut self) -> Option<u32> {
443        self.input.next()
444    }
445
446    fn try_parse(&mut self) -> Result<ir::Regex, Error> {
447        self.parse_capture_groups()?;
448
449        // Parse a catenation. If we consume everything, it's success. If there's
450        // something left, it's an error (for example, an excess closing paren).
451        let body = self.consume_disjunction()?;
452        match self.input.peek().copied() {
453            Some(c) if c == ')' as u32 => error("Unbalanced parenthesis"),
454            Some(c) => error(format!(
455                "Unexpected char: {}",
456                char::from_u32(c)
457                    .map(String::from)
458                    .unwrap_or_else(|| format!("\\u{c:04X}"))
459            )),
460            None => self.finalize(ir::Regex {
461                node: make_cat(vec![body, ir::Node::Goal]),
462                flags: self.flags,
463            }),
464        }
465    }
466
467    /// ES6 21.2.2.3 Disjunction.
468    fn consume_disjunction(&mut self) -> Result<ir::Node, Error> {
469        let mut terms = vec![self.consume_term()?];
470        while self.try_consume('|') {
471            terms.push(self.consume_term()?)
472        }
473        Ok(make_alt(terms))
474    }
475
476    /// ES6 21.2.2.5 Term.
477    fn consume_term(&mut self) -> Result<ir::Node, Error> {
478        let mut result: Vec<ir::Node> = Vec::new();
479        loop {
480            let start_group = self.group_count;
481            let mut start_offset = result.len();
482            let mut quantifier_allowed = true;
483
484            let nc = self.peek();
485            if nc.is_none() {
486                return Ok(make_cat(result));
487            }
488            let c = nc.unwrap();
489            match to_char_sat(c) {
490                // A concatenation is terminated by closing parens or vertical bar (alternations).
491                ')' | '|' => break,
492                // Term :: Assertion :: ^
493                '^' => {
494                    self.consume('^');
495                    result.push(ir::Node::Anchor(ir::AnchorType::StartOfLine));
496                    quantifier_allowed = false;
497                }
498                // Term :: Assertion :: $
499                '$' => {
500                    self.consume('$');
501                    result.push(ir::Node::Anchor(ir::AnchorType::EndOfLine));
502                    quantifier_allowed = false;
503                }
504
505                '\\' => {
506                    self.consume('\\');
507                    let Some(c) = self.peek() else {
508                        return error("Incomplete escape");
509                    };
510                    match to_char_sat(c) {
511                        // Term :: Assertion :: \b
512                        'b' => {
513                            self.consume('b');
514                            result.push(ir::Node::WordBoundary { invert: false });
515                        }
516                        // Term :: Assertion :: \B
517                        'B' => {
518                            self.consume('B');
519                            result.push(ir::Node::WordBoundary { invert: true });
520                        }
521                        // Term :: Atom :: \ AtomEscape :: CharacterEscape :: c AsciiLetter
522                        // Term :: ExtendedAtom :: \ [lookahead = c]
523                        'c' if !self.flags.unicode => {
524                            self.consume('c');
525                            if self
526                                .peek()
527                                .and_then(char::from_u32)
528                                .map(|c| c.is_ascii_alphabetic())
529                                == Some(true)
530                            {
531                                result.push(ir::Node::Char {
532                                    c: self.next().expect("char was not next") % 32,
533                                    icase: self.flags.icase,
534                                });
535                            } else {
536                                start_offset += 1;
537                                result.push(ir::Node::Char {
538                                    c: u32::from('\\'),
539                                    icase: self.flags.icase,
540                                });
541                                result.push(ir::Node::Char {
542                                    c: u32::from('c'),
543                                    icase: self.flags.icase,
544                                });
545                            }
546                        }
547                        // Term :: Atom :: \ AtomEscape
548                        _ => {
549                            result.push(self.consume_atom_escape()?);
550                        }
551                    }
552                }
553
554                // Term :: Atom :: .
555                '.' => {
556                    self.consume('.');
557                    result.push(if self.flags.dot_all {
558                        ir::Node::MatchAny
559                    } else {
560                        ir::Node::MatchAnyExceptLineTerminator
561                    });
562                }
563
564                '(' => {
565                    if self.try_consume_str("(?=") {
566                        // Positive lookahead.
567                        quantifier_allowed = !self.flags.unicode;
568                        result.push(self.consume_lookaround_assertion(LookaroundParams {
569                            negate: false,
570                            backwards: false,
571                        })?);
572                    } else if self.try_consume_str("(?!") {
573                        // Negative lookahead.
574                        quantifier_allowed = !self.flags.unicode;
575                        result.push(self.consume_lookaround_assertion(LookaroundParams {
576                            negate: true,
577                            backwards: false,
578                        })?);
579                    } else if self.try_consume_str("(?<=") {
580                        // Positive lookbehind.
581                        quantifier_allowed = false;
582                        self.has_lookbehind = true;
583                        result.push(self.consume_lookaround_assertion(LookaroundParams {
584                            negate: false,
585                            backwards: true,
586                        })?);
587                    } else if self.try_consume_str("(?<!") {
588                        // Negative lookbehind.
589                        quantifier_allowed = false;
590                        self.has_lookbehind = true;
591                        result.push(self.consume_lookaround_assertion(LookaroundParams {
592                            negate: true,
593                            backwards: true,
594                        })?);
595                    } else if self.try_consume_str("(?:") {
596                        // Non-capturing group.
597                        result.push(self.consume_disjunction()?);
598                    } else {
599                        // Capturing group.
600                        self.consume('(');
601                        let group = self.group_count;
602                        if self.group_count as usize >= MAX_CAPTURE_GROUPS {
603                            return error("Capture group count limit exceeded");
604                        }
605                        self.group_count += 1;
606
607                        // Parse capture group name.
608                        if self.try_consume_str("?") {
609                            let group_name = if let Some(group_name) =
610                                self.try_consume_named_capture_group_name()
611                            {
612                                group_name
613                            } else {
614                                return error("Invalid token at named capture group identifier");
615                            };
616                            let contents = self.consume_disjunction()?;
617                            result.push(ir::Node::NamedCaptureGroup(
618                                Box::new(contents),
619                                group,
620                                group_name,
621                            ))
622                        } else {
623                            let contents = self.consume_disjunction()?;
624                            result.push(ir::Node::CaptureGroup(Box::new(contents), group))
625                        }
626                    }
627                    if !self.try_consume(')') {
628                        return error("Unbalanced parenthesis");
629                    }
630                }
631
632                // CharacterClass :: ClassContents :: ClassSetExpression
633                '[' if self.flags.unicode_sets => {
634                    self.consume('[');
635                    let negate_set = self.try_consume('^');
636                    result.push(
637                        self.consume_class_set_expression(negate_set)?
638                            .node(self.flags.icase, negate_set),
639                    );
640                }
641
642                '[' => {
643                    result.push(self.consume_bracket()?);
644                }
645
646                // Term :: ExtendedAtom :: InvalidBracedQuantifier
647                '{' if !self.flags.unicode => {
648                    if self.try_consume_braced_quantifier().is_some() {
649                        return error("Invalid braced quantifier");
650                    }
651
652                    // Term :: ExtendedAtom :: ExtendedPatternCharacter
653                    result.push(ir::Node::Char {
654                        c: self.consume(c),
655                        icase: self.flags.icase,
656                    })
657                }
658
659                // Term :: Atom :: PatternCharacter :: SourceCharacter but not ^ $ \ . * + ? ( ) [ ] { } |
660                '*' | '+' | '?' | ']' | '{' | '}' if self.flags.unicode => {
661                    return error("Invalid atom character");
662                }
663
664                // Term :: ExtendedAtom :: SourceCharacter but not ^ $ \ . * + ? ( ) [ |
665                '*' | '+' | '?' => {
666                    return error("Invalid atom character");
667                }
668
669                // Term :: Atom :: PatternCharacter
670                // Term :: ExtendedAtom :: ExtendedPatternCharacter
671                _ => {
672                    self.consume(c);
673                    result.push(ir::Node::Char {
674                        c: self.fold_if_icase(c),
675                        icase: self.flags.icase,
676                    })
677                }
678            }
679
680            // We just parsed a term; try parsing a quantifier.
681            if let Some(quant) = self.try_consume_quantifier()? {
682                if !quantifier_allowed {
683                    return error("Quantifier not allowed here");
684                }
685                // Validate the quantifier.
686                // Note we don't want to do this as part of parsing the quantiifer in some cases
687                // an incomplete quantifier is not recognized as a quantifier, e.g. `/{3/` is
688                // valid.
689                if quant.min > quant.max {
690                    return error("Invalid quantifier");
691                }
692                let quantifee = result.split_off(start_offset);
693                if self.loop_count as usize >= MAX_LOOPS {
694                    return error("Loop count limit exceeded");
695                }
696                self.loop_count += 1;
697                result.push(ir::Node::Loop {
698                    loopee: Box::new(make_cat(quantifee)),
699                    quant,
700                    enclosed_groups: start_group..self.group_count,
701                });
702            }
703        }
704        Ok(make_cat(result))
705    }
706
707    /// ES6 21.2.2.13 CharacterClass.
708    fn consume_bracket(&mut self) -> Result<ir::Node, Error> {
709        self.consume('[');
710        let invert = self.try_consume('^');
711        let mut result = BracketContents {
712            invert,
713            cps: CodePointSet::default(),
714        };
715
716        loop {
717            match self.peek().map(to_char_sat) {
718                None => {
719                    return error("Unbalanced bracket");
720                }
721                Some(']') => {
722                    self.consume(']');
723                    if self.flags.icase {
724                        result.cps = unicode::add_icase_code_points(result.cps);
725                    }
726                    return Ok(ir::Node::Bracket(result));
727                }
728                _ => {}
729            }
730
731            // Parse a code point or character class.
732            let Some(first) = self.try_consume_bracket_class_atom()? else {
733                continue;
734            };
735
736            // Check for a dash; we may have a range.
737            if !self.try_consume('-') {
738                add_class_atom(&mut result, first);
739                continue;
740            }
741
742            let Some(second) = self.try_consume_bracket_class_atom()? else {
743                // No second atom. For example: [a-].
744                add_class_atom(&mut result, first);
745                add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
746                continue;
747            };
748
749            // Ranges must also be in order: z-a is invalid.
750            // ES6 21.2.2.15.1 "If i > j, throw a SyntaxError exception"
751            if let (ClassAtom::CodePoint(c1), ClassAtom::CodePoint(c2)) = (&first, &second) {
752                if c1 > c2 {
753                    return error(
754                        "Range values reversed, start char code is greater than end char code.",
755                    );
756                }
757                result.cps.add(Interval {
758                    first: *c1,
759                    last: *c2,
760                });
761
762                continue;
763            }
764
765            if self.flags.unicode {
766                return error("Invalid character range");
767            }
768
769            // If it does not match a range treat as any match single characters.
770            add_class_atom(&mut result, first);
771            add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
772            add_class_atom(&mut result, second);
773        }
774    }
775
776    fn try_consume_bracket_class_atom(&mut self) -> Result<Option<ClassAtom>, Error> {
777        let c = self.peek();
778        if c.is_none() {
779            return Ok(None);
780        }
781        let c = c.unwrap();
782        match to_char_sat(c) {
783            // End of bracket.
784            ']' => Ok(None),
785
786            // ClassEscape
787            '\\' => {
788                self.consume('\\');
789                let ec = if let Some(ec) = self.peek() {
790                    ec
791                } else {
792                    return error("Unterminated escape");
793                };
794                match to_char_sat(ec) {
795                    // ClassEscape :: b
796                    'b' => {
797                        self.consume('b');
798                        Ok(Some(ClassAtom::CodePoint(u32::from('\x08'))))
799                    }
800                    // ClassEscape :: [+UnicodeMode] -
801                    '-' if self.flags.unicode => {
802                        self.consume('-');
803                        Ok(Some(ClassAtom::CodePoint(u32::from('-'))))
804                    }
805                    'c' if !self.flags.unicode => {
806                        let input = self.input.clone();
807                        self.consume('c');
808                        match self.peek().map(to_char_sat) {
809                            // ClassEscape :: [~UnicodeMode] c ClassControlLetter
810                            Some('0'..='9' | '_') => {
811                                let next = self.next().expect("char was not next");
812                                Ok(Some(ClassAtom::CodePoint(next & 0x1F)))
813                            }
814                            // CharacterEscape :: c AsciiLetter
815                            Some('a'..='z' | 'A'..='Z') => {
816                                let next = self.next().expect("char was not next");
817                                Ok(Some(ClassAtom::CodePoint(next % 32)))
818                            }
819                            // ClassAtomNoDash :: \ [lookahead = c]
820                            _ => {
821                                self.input = input;
822                                Ok(Some(ClassAtom::CodePoint(u32::from('\\'))))
823                            }
824                        }
825                    }
826                    // ClassEscape :: CharacterClassEscape :: d
827                    'd' => {
828                        self.consume('d');
829                        Ok(Some(ClassAtom::CharacterClass {
830                            class_type: CharacterClassType::Digits,
831                            positive: true,
832                        }))
833                    }
834                    // ClassEscape :: CharacterClassEscape :: D
835                    'D' => {
836                        self.consume('D');
837                        Ok(Some(ClassAtom::CharacterClass {
838                            class_type: CharacterClassType::Digits,
839                            positive: false,
840                        }))
841                    }
842                    // ClassEscape :: CharacterClassEscape :: s
843                    's' => {
844                        self.consume('s');
845                        Ok(Some(ClassAtom::CharacterClass {
846                            class_type: CharacterClassType::Spaces,
847                            positive: true,
848                        }))
849                    }
850                    // ClassEscape :: CharacterClassEscape :: S
851                    'S' => {
852                        self.consume('S');
853                        Ok(Some(ClassAtom::CharacterClass {
854                            class_type: CharacterClassType::Spaces,
855                            positive: false,
856                        }))
857                    }
858                    // ClassEscape :: CharacterClassEscape :: w
859                    'w' => {
860                        self.consume('w');
861                        Ok(Some(ClassAtom::CharacterClass {
862                            class_type: CharacterClassType::Words,
863                            positive: true,
864                        }))
865                    }
866                    // ClassEscape :: CharacterClassEscape :: W
867                    'W' => {
868                        self.consume('W');
869                        Ok(Some(ClassAtom::CharacterClass {
870                            class_type: CharacterClassType::Words,
871                            positive: false,
872                        }))
873                    }
874                    // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] p{ UnicodePropertyValueExpression }
875                    // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] P{ UnicodePropertyValueExpression }
876                    'p' | 'P' if self.flags.unicode => {
877                        self.consume(ec);
878                        let negate = ec == 'P' as u32;
879                        match self.try_consume_unicode_property_escape()? {
880                            PropertyEscapeKind::CharacterClass(s) => Ok(Some(ClassAtom::Range {
881                                iv: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
882                                negate,
883                            })),
884                            PropertyEscapeKind::StringSet(_) => error("Invalid property escape"),
885                        }
886                    }
887                    // ClassEscape :: CharacterEscape
888                    _ => {
889                        let cc = self.consume_character_escape()?;
890                        Ok(Some(ClassAtom::CodePoint(cc)))
891                    }
892                }
893            }
894
895            _ => Ok(Some(ClassAtom::CodePoint(self.consume(c)))),
896        }
897    }
898
899    // CharacterClass :: ClassContents :: ClassSetExpression
900    fn consume_class_set_expression(&mut self, negate_set: bool) -> Result<ClassSet, Error> {
901        let mut result = ClassSet::new();
902
903        let first = match self.peek() {
904            Some(0x5D /* ] */) => {
905                self.consume(']');
906                return Ok(result);
907            }
908            Some(_) => self.consume_class_set_operand(negate_set)?,
909            None => {
910                return error("Unbalanced class set bracket");
911            }
912        };
913
914        enum ClassSetOperator {
915            Union,
916            Intersection,
917            Subtraction,
918        }
919
920        let op = match self.peek() {
921            Some(0x5D /* ] */) => {
922                self.consume(']');
923                result.union_operand(first);
924                return Ok(result);
925            }
926            Some(0x26 /* & */) => {
927                self.consume('&');
928                if self.peek() == Some(0x26 /* & */) {
929                    self.consume('&');
930                    result.union_operand(first.clone());
931                    ClassSetOperator::Intersection
932                } else {
933                    result.codepoints.add_one(0x26 /* & */);
934                    ClassSetOperator::Union
935                }
936            }
937            Some(0x2D /* - */) => {
938                self.consume('-');
939                if self.peek() == Some(0x2D /* - */) {
940                    self.consume('-');
941                    result.union_operand(first.clone());
942                    ClassSetOperator::Subtraction
943                } else {
944                    match first {
945                        ClassSetOperand::ClassSetCharacter(first) => {
946                            let ClassSetOperand::ClassSetCharacter(last) =
947                                self.consume_class_set_operand(negate_set)?
948                            else {
949                                return error("Invalid class set range");
950                            };
951                            if first > last {
952                                return error("Invalid class set range");
953                            }
954                            result.codepoints.add(Interval { first, last });
955                        }
956                        _ => {
957                            return error("Invalid class set range");
958                        }
959                    };
960                    ClassSetOperator::Union
961                }
962            }
963            Some(_) => {
964                result.union_operand(first.clone());
965                ClassSetOperator::Union
966            }
967            None => {
968                return error("Unbalanced class set bracket");
969            }
970        };
971
972        match op {
973            ClassSetOperator::Union => {
974                loop {
975                    let operand = match self.peek() {
976                        Some(0x5D /* ] */) => {
977                            self.consume(']');
978                            return Ok(result);
979                        }
980                        Some(_) => self.consume_class_set_operand(negate_set)?,
981                        None => return error("Unbalanced class set bracket"),
982                    };
983                    if self.peek() == Some(0x2D /* - */) {
984                        self.consume('-');
985                        match operand {
986                            ClassSetOperand::ClassSetCharacter(first) => {
987                                let ClassSetOperand::ClassSetCharacter(last) =
988                                    self.consume_class_set_operand(negate_set)?
989                                else {
990                                    return error("Invalid class set range");
991                                };
992                                if first > last {
993                                    return error("Invalid class set range");
994                                }
995                                result.codepoints.add(Interval { first, last });
996                            }
997                            _ => {
998                                return error("Invalid class set range");
999                            }
1000                        };
1001                    } else {
1002                        result.union_operand(operand);
1003                    }
1004                }
1005            }
1006            // ClassIntersection :: ClassSetOperand && [lookahead ≠ &]
1007            ClassSetOperator::Intersection => {
1008                loop {
1009                    let operand = self.consume_class_set_operand(negate_set)?;
1010                    result.intersect_operand(operand);
1011                    match self.next() {
1012                        Some(0x5D /* ] */) => return Ok(result),
1013                        Some(0x26 /* & */) => {}
1014                        Some(_) => return error("Unexpected character in class set intersection"),
1015                        _ => return error("Unbalanced class set bracket"),
1016                    }
1017                    if self.next() != Some(0x26 /* & */) {
1018                        return error("Unbalanced class set bracket");
1019                    }
1020                }
1021            }
1022            // ClassSubtraction :: ClassSubtraction -- ClassSetOperand
1023            ClassSetOperator::Subtraction => {
1024                loop {
1025                    let operand = self.consume_class_set_operand(negate_set)?;
1026                    result.subtract_operand(operand);
1027                    match self.next() {
1028                        Some(0x5D /* ] */) => return Ok(result),
1029                        Some(0x2D /* - */) => {}
1030                        Some(_) => return error("Unexpected character in class set subtraction"),
1031                        _ => return error("Unbalanced class set bracket"),
1032                    }
1033                    if self.next() != Some(0x2D /* - */) {
1034                        return error("Unbalanced class set bracket");
1035                    }
1036                }
1037            }
1038        }
1039    }
1040
1041    fn consume_class_set_operand(&mut self, negate_set: bool) -> Result<ClassSetOperand, Error> {
1042        use ClassSetOperand::*;
1043        let Some(cp) = self.peek() else {
1044            return error("Empty class set operand");
1045        };
1046        match cp {
1047            // ClassSetOperand :: NestedClass :: [ [lookahead ≠ ^] ClassContents[+UnicodeMode, +UnicodeSetsMode] ]
1048            // ClassSetOperand :: NestedClass :: [^ ClassContents[+UnicodeMode, +UnicodeSetsMode] ]
1049            0x5B /* [ */ => {
1050                self.consume('[');
1051                let negate_set = self.try_consume('^');
1052                let result = self.consume_class_set_expression(negate_set)?;
1053                if negate_set {
1054                    result.codepoints.inverted();
1055                }
1056                Ok(Class(result))
1057            }
1058            // ClassSetOperand :: NestedClass :: \ CharacterClassEscape
1059            // ClassSetOperand :: ClassStringDisjunction
1060            // ClassSetOperand :: ClassSetCharacter :: \...
1061            // ClassSetRange :: ClassSetCharacter :: \...
1062            0x5C /* \ */ => {
1063                self.consume('\\');
1064                let Some(cp) = self.peek() else {
1065                    return error("Incomplete class set escape");
1066                };
1067                match cp {
1068                    // ClassStringDisjunction  \q{ ClassStringDisjunctionContents }
1069                    0x71 /* q */ => {
1070                        self.consume('q');
1071                        if !self.try_consume('{') {
1072                            return error("Invalid class set escape: expected {");
1073                        }
1074                        let mut alternatives = Vec::new();
1075                        let mut alternative = Vec::new();
1076                        loop {
1077                            match self.peek() {
1078                                Some(0x7D /* } */) => {
1079                                    self.consume('}');
1080                                    if !alternative.is_empty() {
1081                                        alternatives.push(alternative.clone());
1082                                        alternative.clear();
1083                                    }
1084                                    break;
1085                                }
1086                                Some(0x7C /* | */) => {
1087                                    self.consume('|');
1088                                    if !alternative.is_empty() {
1089                                        alternatives.push(alternative.clone());
1090                                        alternative.clear();
1091                                    }
1092                                }
1093                                Some(_) => {
1094                                    alternative.push(self.consume_class_set_character()?);
1095                                }
1096                                None => {
1097                                    return error("Unbalanced class set string disjunction");
1098                                }
1099                            }
1100                        }
1101                        Ok(ClassStringDisjunction(ClassSetAlternativeStrings(alternatives)))
1102                    }
1103                    // CharacterClassEscape :: d
1104                    0x64 /* d */ => {
1105                        self.consume('d');
1106                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, true)))
1107                    }
1108                    // CharacterClassEscape :: D
1109                    0x44 /* D */ => {
1110                        self.consume('D');
1111                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, false)))
1112                    }
1113                    // CharacterClassEscape :: s
1114                    0x73 /* s */ => {
1115                        self.consume('s');
1116                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, true)))
1117                    }
1118                    // CharacterClassEscape :: S
1119                    0x53 /* S */ => {
1120                        self.consume('S');
1121                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, false)))
1122                    }
1123                    // CharacterClassEscape :: w
1124                    0x77 /* w */ => {
1125                        self.consume('w');
1126                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, true)))
1127                    }
1128                    // CharacterClassEscape :: W
1129                    0x57 /* W */ => {
1130                        self.consume('W');
1131                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, false)))
1132                    }
1133                    // CharacterClassEscape :: [+UnicodeMode] p{ UnicodePropertyValueExpression }
1134                    0x70 /* p */ => {
1135                        self.consume('p');
1136                        match self.try_consume_unicode_property_escape()? {
1137                            PropertyEscapeKind::CharacterClass(intervals) => {
1138                                Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1139                                    intervals.to_vec(),
1140                                )))
1141                            }
1142                            PropertyEscapeKind::StringSet(_) if negate_set => error("Invalid character escape"),
1143                            PropertyEscapeKind::StringSet(strings) => {
1144                                Ok(ClassStringDisjunction(ClassSetAlternativeStrings(strings.iter().map(|s| s.to_vec()).collect())))
1145                            }
1146                        }
1147                    }
1148                    // CharacterClassEscape :: [+UnicodeMode] P{ UnicodePropertyValueExpression }
1149                    0x50 /* P */ => {
1150                        self.consume('P');
1151                        match self.try_consume_unicode_property_escape()? {
1152                            PropertyEscapeKind::CharacterClass(s) => {
1153                                Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1154                                    s.to_vec(),
1155                                ).inverted()))
1156                            }
1157                            PropertyEscapeKind::StringSet(_) => error("Invalid character escape"),
1158                        }
1159                    }
1160                    // ClassSetCharacter:: \b
1161                    0x62 /* b */ => {
1162                        Ok(ClassSetCharacter(self.consume(cp)))
1163                    }
1164                    // ClassSetCharacter:: \ ClassSetReservedPunctuator
1165                    _ if Self::is_class_set_reserved_punctuator(cp) => Ok(ClassSetCharacter(self.consume(cp))),
1166                    // ClassSetCharacter:: \ CharacterEscape[+UnicodeMode]
1167                    _ => Ok(ClassSetCharacter(self.consume_character_escape()?))
1168                }
1169            }
1170            // ClassSetOperand :: ClassSetCharacter
1171            // ClassSetRange :: ClassSetCharacter
1172            _ => Ok(ClassSetCharacter(self.consume_class_set_character()?)),
1173        }
1174    }
1175
1176    // ClassSetCharacter
1177    fn consume_class_set_character(&mut self) -> Result<u32, Error> {
1178        let Some(cp) = self.next() else {
1179            return error("Incomplete class set character");
1180        };
1181        match cp {
1182            0x5C /* \ */ => {
1183                let Some(cp) = self.peek() else {
1184                    return error("Incomplete class set escape");
1185                };
1186                match cp {
1187                    // \b
1188                    0x62 /* b */ => {
1189                        Ok(self.consume(cp))
1190                    }
1191                    // \ ClassSetReservedPunctuator
1192                    _ if Self::is_class_set_reserved_punctuator(cp) => Ok(self.consume(cp)),
1193                    // \ CharacterEscape[+UnicodeMode]
1194                    _ => Ok(self.consume_character_escape()?)
1195                }
1196            }
1197            // [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter
1198            0x28 /* ( */ | 0x29 /* ) */ | 0x7B /* { */ | 0x7D /* } */ | 0x2F /* / */
1199            | 0x2D /* - */ | 0x7C /* | */ => error("Invalid class set character"),
1200            _ => {
1201                if Self::is_class_set_reserved_double_punctuator(cp) {
1202                    if let Some(cp) = self.peek() {
1203                        if Self::is_class_set_reserved_double_punctuator(cp) {
1204                            return error("Invalid class set character");
1205                        }
1206                    }
1207                }
1208                Ok(cp)
1209            }
1210        }
1211    }
1212
1213    // ClassSetReservedPunctuator
1214    fn is_class_set_reserved_punctuator(cp: u32) -> bool {
1215        match cp {
1216            0x26 /* & */ | 0x2D /* - */ | 0x21 /* ! */ | 0x23 /* # */ | 0x25 /* % */
1217            | 0x2C /* , */ | 0x3A /* : */ | 0x3B /* ; */ | 0x3C /* < */ | 0x3D /* = */
1218            | 0x3E /* > */ | 0x40 /* @ */ | 0x60 /* ` */ | 0x7E /* ~ */ => true,
1219            _ => false,
1220        }
1221    }
1222
1223    fn is_class_set_reserved_double_punctuator(cp: u32) -> bool {
1224        match cp {
1225            0x26 /* & */ | 0x21 /* ! */ | 0x23 /* # */ | 0x24 /* $ */ | 0x25 /* % */
1226            | 0x2A /* * */ | 0x2B /* + */ | 0x2C /* , */ | 0x2E /* . */ | 0x3A /* : */
1227            | 0x3B /* ; */ | 0x3C /* < */ | 0x3D /* = */ | 0x3E /* > */ | 0x3F /* ? */
1228            | 0x40 /* @ */ | 0x5E /* ^ */ | 0x60 /* ` */ | 0x7E /* ~ */ => true,
1229            _ => false,
1230        }
1231    }
1232
1233    fn try_consume_quantifier(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1234        if let Some(mut quant) = self.try_consume_quantifier_prefix()? {
1235            quant.greedy = !self.try_consume('?');
1236            Ok(Some(quant))
1237        } else {
1238            Ok(None)
1239        }
1240    }
1241
1242    fn try_consume_quantifier_prefix(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1243        let nc = self.peek();
1244        if nc.is_none() {
1245            return Ok(None);
1246        }
1247        let c = nc.unwrap();
1248        match char::from_u32(c) {
1249            Some('+') => {
1250                self.consume('+');
1251                Ok(Some(ir::Quantifier {
1252                    min: 1,
1253                    max: usize::MAX,
1254                    greedy: true,
1255                }))
1256            }
1257            Some('*') => {
1258                self.consume('*');
1259                Ok(Some(ir::Quantifier {
1260                    min: 0,
1261                    max: usize::MAX,
1262                    greedy: true,
1263                }))
1264            }
1265            Some('?') => {
1266                self.consume('?');
1267                Ok(Some(ir::Quantifier {
1268                    min: 0,
1269                    max: 1,
1270                    greedy: true,
1271                }))
1272            }
1273            Some('{') => {
1274                if let Some(quantifier) = self.try_consume_braced_quantifier() {
1275                    Ok(Some(quantifier))
1276                } else if self.flags.unicode {
1277                    // if there was a brace '{' that doesn't parse into a valid quantifier,
1278                    // it's not valid with the unicode flag
1279                    error("Invalid quantifier")
1280                } else {
1281                    Ok(None)
1282                }
1283            }
1284            _ => Ok(None),
1285        }
1286    }
1287
1288    fn try_consume_braced_quantifier(&mut self) -> Option<ir::Quantifier> {
1289        // if parsed input is actually invalid, keep the previous one for rollback
1290        let pre_input = self.input.clone();
1291        self.consume('{');
1292        let optmin = self.try_consume_decimal_integer_literal();
1293        if optmin.is_none() {
1294            // not a valid quantifier, rollback consumption
1295            self.input = pre_input;
1296            return None;
1297        }
1298        let mut quant = ir::Quantifier {
1299            min: optmin.unwrap(),
1300            max: optmin.unwrap(),
1301            greedy: true,
1302        };
1303        if self.try_consume(',') {
1304            if let Some(max) = self.try_consume_decimal_integer_literal() {
1305                // Like {3,4}
1306                quant.max = max;
1307            } else {
1308                // Like {3,}
1309                quant.max = usize::MAX;
1310            }
1311        } else {
1312            // Like {3}.
1313        }
1314        if !self.try_consume('}') {
1315            // not a valid quantifier, rollback consumption
1316            self.input = pre_input;
1317            return None;
1318        }
1319        Some(quant)
1320    }
1321
1322    /// ES6 11.8.3 DecimalIntegerLiteral.
1323    /// If the value would overflow, usize::MAX is returned.
1324    /// All decimal digits are consumed regardless.
1325    fn try_consume_decimal_integer_literal(&mut self) -> Option<usize> {
1326        let mut result: usize = 0;
1327        let mut char_count = 0;
1328        while let Some(c) = self.peek() {
1329            if let Some(digit) = char::from_u32(c).and_then(|c| char::to_digit(c, 10)) {
1330                self.consume(c);
1331                char_count += 1;
1332                result = result.saturating_mul(10);
1333                result = result.saturating_add(digit as usize);
1334            } else {
1335                break;
1336            }
1337        }
1338        if char_count > 0 {
1339            Some(result)
1340        } else {
1341            None
1342        }
1343    }
1344
1345    fn consume_lookaround_assertion(
1346        &mut self,
1347        params: LookaroundParams,
1348    ) -> Result<ir::Node, Error> {
1349        let start_group = self.group_count;
1350        let contents = self.consume_disjunction()?;
1351        let end_group = self.group_count;
1352        Ok(ir::Node::LookaroundAssertion {
1353            negate: params.negate,
1354            backwards: params.backwards,
1355            start_group,
1356            end_group,
1357            contents: Box::new(contents),
1358        })
1359    }
1360
1361    fn consume_character_escape(&mut self) -> Result<u32, Error> {
1362        let c = self.next().expect("Should have a character");
1363        let ch = to_char_sat(c);
1364        match ch {
1365            // CharacterEscape :: ControlEscape :: f
1366            'f' => Ok(0xC),
1367            // CharacterEscape :: ControlEscape :: n
1368            'n' => Ok(0xA),
1369            // CharacterEscape :: ControlEscape :: r
1370            'r' => Ok(0xD),
1371            // CharacterEscape :: ControlEscape :: t
1372            't' => Ok(0x9),
1373            // CharacterEscape :: ControlEscape :: v
1374            'v' => Ok(0xB),
1375            // CharacterEscape :: c AsciiLetter
1376            'c' => {
1377                if let Some(nc) = self.next().and_then(char::from_u32) {
1378                    if nc.is_ascii_lowercase() || nc.is_ascii_uppercase() {
1379                        return Ok((nc as u32) % 32);
1380                    }
1381                }
1382                error("Invalid character escape")
1383            }
1384            // CharacterEscape :: 0 [lookahead ∉ DecimalDigit]
1385            '0' if self
1386                .peek()
1387                .and_then(char::from_u32)
1388                .map(|c: char| c.is_ascii_digit())
1389                != Some(true) =>
1390            {
1391                Ok(0x0)
1392            }
1393            // CharacterEscape :: HexEscapeSequence :: x HexDigit HexDigit
1394            'x' => {
1395                let hex_to_digit = |c: char| c.to_digit(16);
1396                let x1 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1397                let x2 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1398                match (x1, x2) {
1399                    (Some(x1), Some(x2)) => Ok(x1 * 16 + x2),
1400                    // CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
1401                    _ if !self.flags.unicode => Ok(c),
1402                    _ => error("Invalid character escape"),
1403                }
1404            }
1405            // CharacterEscape :: RegExpUnicodeEscapeSequence
1406            'u' => {
1407                if let Some(c) = self.try_escape_unicode_sequence() {
1408                    Ok(c)
1409                } else if !self.flags.unicode {
1410                    // CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
1411                    Ok(c)
1412                } else {
1413                    error("Invalid unicode escape")
1414                }
1415            }
1416            // CharacterEscape :: [~UnicodeMode] LegacyOctalEscapeSequence
1417            '0'..='7' if !self.flags.unicode => {
1418                let Some(c1) = self.peek() else {
1419                    return Ok(c - '0' as u32);
1420                };
1421                let ch1 = to_char_sat(c1);
1422
1423                match ch {
1424                    // 0 [lookahead ∈ { 8, 9 }]
1425                    '0' if ('8'..='9').contains(&ch1) => Ok(0x0),
1426                    // NonZeroOctalDigit [lookahead ∉ OctalDigit]
1427                    _ if !('0'..='7').contains(&ch1) => Ok(c - '0' as u32),
1428                    // FourToSeven OctalDigit
1429                    '4'..='7' => {
1430                        self.consume(c1);
1431                        Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1432                    }
1433                    // ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
1434                    // ZeroToThree OctalDigit OctalDigit
1435                    '0'..='3' => {
1436                        self.consume(c1);
1437                        if self.peek().map(|c2| ('0'..='7').contains(&to_char_sat(c2)))
1438                            == Some(true)
1439                        {
1440                            let c2 = self.next().expect("char was not next");
1441                            Ok((c - '0' as u32) * 64 + (c1 - '0' as u32) * 8 + c2 - '0' as u32)
1442                        } else {
1443                            Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1444                        }
1445                    }
1446                    _ => unreachable!(),
1447                }
1448            }
1449            // CharacterEscape :: IdentityEscape :: [+UnicodeMode] SyntaxCharacter
1450            // CharacterEscape :: IdentityEscape :: [+UnicodeMode] /
1451            '^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
1452            | '/' => Ok(c),
1453            // CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
1454            _ if !self.flags.unicode => Ok(c),
1455            _ => error("Invalid character escape"),
1456        }
1457    }
1458
1459    // AtomEscape
1460    fn consume_atom_escape(&mut self) -> Result<ir::Node, Error> {
1461        let Some(c) = self.peek() else {
1462            return error("Incomplete escape");
1463        };
1464        match to_char_sat(c) {
1465            'd' | 'D' => {
1466                self.consume(c);
1467                Ok(make_bracket_class(
1468                    CharacterClassType::Digits,
1469                    c == 'd' as u32,
1470                ))
1471            }
1472
1473            's' | 'S' => {
1474                self.consume(c);
1475                Ok(make_bracket_class(
1476                    CharacterClassType::Spaces,
1477                    c == 's' as u32,
1478                ))
1479            }
1480
1481            'w' | 'W' => {
1482                self.consume(c);
1483                Ok(make_bracket_class(
1484                    CharacterClassType::Words,
1485                    c == 'w' as u32,
1486                ))
1487            }
1488
1489            // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] p{ UnicodePropertyValueExpression }
1490            // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] P{ UnicodePropertyValueExpression }
1491            'p' | 'P' if self.flags.unicode || self.flags.unicode_sets => {
1492                self.consume(c);
1493                let negate = c == 'P' as u32;
1494                let property_escape = self.try_consume_unicode_property_escape()?;
1495                match property_escape {
1496                    PropertyEscapeKind::CharacterClass(s) => {
1497                        Ok(ir::Node::Bracket(BracketContents {
1498                            invert: negate,
1499                            cps: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
1500                        }))
1501                    }
1502                    PropertyEscapeKind::StringSet(_) if negate => error("Invalid character escape"),
1503                    PropertyEscapeKind::StringSet(strings) => Ok(make_alt(
1504                        strings
1505                            .iter()
1506                            .map(|s| {
1507                                ir::Node::Cat(
1508                                    s.iter()
1509                                        .map(|c| ir::Node::Char {
1510                                            c: *c,
1511                                            icase: self.flags.icase,
1512                                        })
1513                                        .collect(),
1514                                )
1515                            })
1516                            .collect(),
1517                    )),
1518                }
1519            }
1520
1521            // [+UnicodeMode] DecimalEscape
1522            // Note: This is a backreference.
1523            '1'..='9' if self.flags.unicode => {
1524                let group = self.try_consume_decimal_integer_literal().unwrap();
1525                if group <= self.group_count_max as usize {
1526                    Ok(ir::Node::BackRef(group as u32))
1527                } else {
1528                    error("Invalid character escape")
1529                }
1530            }
1531
1532            // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber of DecimalEscape
1533            //    is ≤ CountLeftCapturingParensWithin(the Pattern containing DecimalEscape)
1534            // Note: This could be either a backreference, a legacy octal escape or an identity escape.
1535            '1'..='9' => {
1536                let input = self.input.clone();
1537                let group = self.try_consume_decimal_integer_literal().unwrap();
1538
1539                if group <= self.group_count_max as usize {
1540                    Ok(ir::Node::BackRef(group as u32))
1541                } else {
1542                    self.input = input;
1543                    let c = self.consume_character_escape()?;
1544                    Ok(ir::Node::Char {
1545                        c: self.fold_if_icase(c),
1546                        icase: self.flags.icase,
1547                    })
1548                }
1549            }
1550
1551            // [+NamedCaptureGroups] k GroupName
1552            'k' if self.flags.unicode || !self.named_group_indices.is_empty() => {
1553                self.consume('k');
1554
1555                // The sequence `\k` must be the start of a backreference to a named capture group.
1556                if let Some(group_name) = self.try_consume_named_capture_group_name() {
1557                    if let Some(index) = self.named_group_indices.get(&group_name) {
1558                        Ok(ir::Node::BackRef(*index + 1))
1559                    } else {
1560                        error(format!(
1561                            "Backreference to invalid named capture group: {}",
1562                            &group_name
1563                        ))
1564                    }
1565                } else {
1566                    error("Unexpected end of named backreference")
1567                }
1568            }
1569
1570            // [~NamedCaptureGroups] k GroupName
1571            'k' => {
1572                self.consume('k');
1573                Ok(ir::Node::Char {
1574                    c: self.fold_if_icase(c),
1575                    icase: self.flags.icase,
1576                })
1577            }
1578
1579            _ => {
1580                let c = self.consume_character_escape()?;
1581                Ok(ir::Node::Char {
1582                    c: self.fold_if_icase(c),
1583                    icase: self.flags.icase,
1584                })
1585            }
1586        }
1587    }
1588
1589    #[allow(clippy::branches_sharing_code)]
1590    fn try_escape_unicode_sequence(&mut self) -> Option<u32> {
1591        let mut orig_input = self.input.clone();
1592
1593        // Support \u{X..X} (Unicode CodePoint)
1594        if self.try_consume('{') {
1595            let mut s = String::new();
1596            loop {
1597                match self.next().and_then(char::from_u32) {
1598                    Some('}') => break,
1599                    Some(c) => s.push(c),
1600                    None => {
1601                        // Surrogates not supported in code point escapes.
1602                        self.input = orig_input;
1603                        return None;
1604                    }
1605                }
1606            }
1607
1608            match u32::from_str_radix(&s, 16) {
1609                Ok(u) => {
1610                    if u > 0x10_FFFF {
1611                        self.input = orig_input;
1612                        None
1613                    } else {
1614                        Some(u)
1615                    }
1616                }
1617                _ => {
1618                    self.input = orig_input;
1619                    None
1620                }
1621            }
1622        } else {
1623            // Hex4Digits
1624            let mut s = String::new();
1625            for _ in 0..4 {
1626                if let Some(c) = self.next().and_then(char::from_u32) {
1627                    s.push(c);
1628                } else {
1629                    // Surrogates are not hex digits.
1630                    self.input = orig_input;
1631                    return None;
1632                }
1633            }
1634            match u16::from_str_radix(&s, 16) {
1635                Ok(u) => {
1636                    if (0xD800..=0xDBFF).contains(&u) {
1637                        // Found a high surrogate. Try to parse a low surrogate next
1638                        // to see if we can rebuild the original `char`
1639
1640                        if !self.try_consume_str("\\u") {
1641                            return Some(u as u32);
1642                        }
1643                        orig_input = self.input.clone();
1644
1645                        // A poor man's try block to handle the backtracking
1646                        // in a single place instead of every time we want to return.
1647                        // This allows us to use `?` within the inner block without returning
1648                        // from the entire parent function.
1649                        let result = (|| {
1650                            let mut s = String::new();
1651                            for _ in 0..4 {
1652                                let c = self.next().and_then(char::from_u32)?;
1653                                s.push(c);
1654                            }
1655
1656                            let uu = u16::from_str_radix(&s, 16).ok()?;
1657                            let ch = char::decode_utf16([u, uu]).next()?.ok()?;
1658                            Some(u32::from(ch))
1659                        })();
1660
1661                        result.or_else(|| {
1662                            self.input = orig_input;
1663                            Some(u as u32)
1664                        })
1665                    } else {
1666                        // If `u` is not a surrogate or is a low surrogate we can directly return it,
1667                        // since all paired low surrogates should have been handled above.
1668                        Some(u as u32)
1669                    }
1670                }
1671                _ => {
1672                    self.input = orig_input;
1673                    None
1674                }
1675            }
1676        }
1677    }
1678
1679    fn try_consume_named_capture_group_name(&mut self) -> Option<String> {
1680        if !self.try_consume('<') {
1681            return None;
1682        }
1683
1684        let orig_input = self.input.clone();
1685        let mut group_name = String::new();
1686
1687        if let Some(mut c) = self.next().and_then(char::from_u32) {
1688            if c == '\\' && self.try_consume('u') {
1689                if let Some(escaped) = self.try_escape_unicode_sequence().and_then(char::from_u32) {
1690                    c = escaped;
1691                } else {
1692                    self.input = orig_input;
1693                    return None;
1694                }
1695            }
1696
1697            if interval_contains(id_start_ranges(), c.into()) || c == '$' || c == '_' {
1698                group_name.push(c);
1699            } else {
1700                self.input = orig_input;
1701                return None;
1702            }
1703        } else {
1704            self.input = orig_input;
1705            return None;
1706        }
1707
1708        loop {
1709            if let Some(mut c) = self.next().and_then(char::from_u32) {
1710                if c == '\\' && self.try_consume('u') {
1711                    if let Some(escaped) =
1712                        self.try_escape_unicode_sequence().and_then(char::from_u32)
1713                    {
1714                        c = escaped;
1715                    } else {
1716                        self.input = orig_input;
1717                        return None;
1718                    }
1719                }
1720
1721                if c == '>' {
1722                    break;
1723                }
1724
1725                if interval_contains(id_continue_ranges(), c.into()) || c == '$' || c == '_' || c == '\u{200C}' /* <ZWNJ> */ || c == '\u{200D}'
1726                /* <ZWJ> */
1727                {
1728                    group_name.push(c);
1729                } else {
1730                    self.input = orig_input;
1731                    return None;
1732                }
1733            } else {
1734                self.input = orig_input;
1735                return None;
1736            }
1737        }
1738
1739        Some(group_name)
1740    }
1741
1742    // Quickly parse all capture groups.
1743    fn parse_capture_groups(&mut self) -> Result<(), Error> {
1744        let orig_input = self.input.clone();
1745
1746        loop {
1747            match self.next().map(to_char_sat) {
1748                Some('\\') => {
1749                    self.next();
1750                    continue;
1751                }
1752                Some('[') => loop {
1753                    match self.next().map(to_char_sat) {
1754                        Some('\\') => {
1755                            self.next();
1756                            continue;
1757                        }
1758                        Some(']') => break,
1759                        Some(_) => continue,
1760                        None => break,
1761                    }
1762                },
1763                Some('(') => {
1764                    if self.try_consume_str("?") {
1765                        if let Some(name) = self.try_consume_named_capture_group_name() {
1766                            if self
1767                                .named_group_indices
1768                                .insert(name, self.group_count_max)
1769                                .is_some()
1770                            {
1771                                return error("Duplicate capture group name");
1772                            }
1773                        }
1774                    }
1775                    self.group_count_max = if self.group_count_max + 1 > MAX_CAPTURE_GROUPS as u32 {
1776                        MAX_CAPTURE_GROUPS as u32
1777                    } else {
1778                        self.group_count_max + 1
1779                    };
1780                }
1781                Some(_) => continue,
1782                None => break,
1783            }
1784        }
1785
1786        self.input = orig_input;
1787
1788        Ok(())
1789    }
1790
1791    fn try_consume_unicode_property_escape(&mut self) -> Result<PropertyEscapeKind, Error> {
1792        if !self.try_consume('{') {
1793            return error("Invalid character at property escape start");
1794        }
1795
1796        let mut buffer = String::new();
1797        let mut name = None;
1798
1799        while let Some(c) = self.peek().and_then(char::from_u32) {
1800            match c {
1801                '}' => {
1802                    self.consume(c);
1803                    let Some(value) =
1804                        unicode_property_from_str(&buffer, name, self.flags.unicode_sets)
1805                    else {
1806                        break;
1807                    };
1808                    return Ok(value);
1809                }
1810                '=' if name.is_none() => {
1811                    self.consume(c);
1812                    let Some(n) = unicode_property_name_from_str(&buffer) else {
1813                        break;
1814                    };
1815                    name = Some(n);
1816                    buffer.clear();
1817                }
1818                c if c.is_ascii_alphanumeric() || c == '_' => {
1819                    self.consume(c);
1820                    buffer.push(c);
1821                }
1822                _ => break,
1823            }
1824        }
1825
1826        error("Invalid property name")
1827    }
1828
1829    fn finalize(&self, mut re: ir::Regex) -> Result<ir::Regex, Error> {
1830        debug_assert!(self.loop_count <= MAX_LOOPS as u32);
1831        debug_assert!(self.group_count as usize <= MAX_CAPTURE_GROUPS);
1832        if self.has_lookbehind {
1833            ir::walk_mut(
1834                false,
1835                re.flags.unicode,
1836                &mut re.node,
1837                &mut ir::Node::reverse_cats,
1838            );
1839        }
1840        Ok(re)
1841    }
1842}
1843
1844/// Try parsing a given pattern.
1845/// Return the resulting IR regex, or an error.
1846pub fn try_parse<I>(pattern: I, flags: api::Flags) -> Result<ir::Regex, Error>
1847where
1848    I: Iterator<Item = u32> + Clone,
1849{
1850    let mut p = Parser {
1851        input: pattern.peekable(),
1852        flags,
1853        loop_count: 0,
1854        group_count: 0,
1855        named_group_indices: HashMap::new(),
1856        group_count_max: 0,
1857        has_lookbehind: false,
1858    };
1859    p.try_parse()
1860}