Skip to main content

substrait_explain/parser/
structural.rs

1//! Parser for the structural part of the Substrait file format.
2//!
3//! This is the overall parser for parsing the text format. It is responsible
4//! for tracking which section of the file we are currently parsing, and parsing
5//! each line separately.
6
7use std::fmt;
8
9use pest::iterators::Pair;
10use substrait::proto::extensions::AdvancedExtension;
11use substrait::proto::{
12    AggregateRel, FetchRel, FilterRel, JoinRel, Plan, PlanRel, ProjectRel, ReadRel, Rel, RelRoot,
13    SortRel, plan_rel,
14};
15
16use crate::extensions::any::Any;
17use crate::extensions::{AddendumKind, ExtensionRegistry, SimpleExtensions, simple};
18use crate::parser::common::{MessageParseError, ParsePair, ScopedParsePair};
19use crate::parser::errors::{ParseContext, ParseError, ParseResult};
20use crate::parser::expressions::Name;
21use crate::parser::extensions::{
22    AddendumInvocation, ExtensionInvocation, ExtensionParseError, ExtensionParser,
23};
24use crate::parser::relations::{ExtensionReadRel, RelationParsingContext, VirtualReadRel};
25use crate::parser::{ErrorKind, ExpressionParser, RelationParsePair, Rule, unwrap_single_pair};
26
27pub const PLAN_HEADER: &str = "=== Plan";
28
29/// Represents an input line, trimmed of leading two-space indents and final
30/// whitespace. Contains the number of indents and the trimmed line.
31#[derive(Debug, Clone, Copy, PartialEq, Eq)]
32pub struct IndentedLine<'a>(pub usize, pub &'a str);
33
34impl<'a> From<&'a str> for IndentedLine<'a> {
35    fn from(line: &'a str) -> Self {
36        let line = line.trim_end();
37        let mut spaces = 0;
38        for c in line.chars() {
39            if c == ' ' {
40                spaces += 1;
41            } else {
42                break;
43            }
44        }
45
46        let indents = spaces / 2;
47
48        let (_, trimmed) = line.split_at(indents * 2);
49
50        IndentedLine(indents, trimmed)
51    }
52}
53
54/// A `+`-prefixed addendum line attached to a relation node. The `pair` holds
55/// the grammar rule directly (already unwrapped from the outer `planNode`).
56#[derive(Debug, Clone)]
57pub struct Addendum<'a> {
58    pub pair: Pair<'a, Rule>, // Rule::addendum
59    pub line_no: i64,
60}
61
62/// A relation node in the plan tree, before conversion to a Substrait proto.
63#[derive(Debug, Clone)]
64pub struct RelationNode<'a> {
65    pub pair: Pair<'a, Rule>,
66    pub line_no: i64,
67    pub addenda: Vec<Addendum<'a>>,
68    pub children: Vec<RelationNode<'a>>,
69}
70
71impl<'a> RelationNode<'a> {
72    pub fn context(&self) -> ParseContext {
73        ParseContext {
74            line_no: self.line_no,
75            line: self.pair.as_str().to_string(),
76        }
77    }
78}
79
80/// A parsed plan line: either a relation or a `+`-prefixed addendum line.
81///
82/// Classification happens at construction time by inspecting the inner grammar
83/// rule, so downstream code can use standard Rust pattern matching rather than
84/// runtime rule inspection.
85#[derive(Debug, Clone)]
86pub enum LineNode<'a> {
87    Relation(RelationNode<'a>),
88    Addendum(Addendum<'a>),
89}
90
91impl<'a> LineNode<'a> {
92    pub fn parse(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
93        let mut pairs: pest::iterators::Pairs<'a, Rule> =
94            <ExpressionParser as pest::Parser<Rule>>::parse(Rule::planNode, line).map_err(|e| {
95                ParseError::Plan(
96                    ParseContext {
97                        line_no,
98                        line: line.to_string(),
99                    },
100                    MessageParseError::new("planNode", ErrorKind::InvalidValue, Box::new(e)),
101                )
102            })?;
103
104        let outer = pairs.next().unwrap();
105        assert!(pairs.next().is_none()); // Should be exactly one pair
106        let inner = unwrap_single_pair(outer);
107
108        Ok(match inner.as_rule() {
109            Rule::addendum => LineNode::Addendum(Addendum {
110                pair: inner,
111                line_no,
112            }),
113            _ => LineNode::Relation(RelationNode {
114                pair: inner,
115                line_no,
116                addenda: Vec::new(),
117                children: Vec::new(),
118            }),
119        })
120    }
121
122    /// Parse the line as a top-level relation at depth 0 (either root_relation or regular relation)
123    pub fn parse_root(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
124        let mut pairs: pest::iterators::Pairs<'a, Rule> = <ExpressionParser as pest::Parser<
125            Rule,
126        >>::parse(
127            Rule::top_level_relation, line
128        )
129        .map_err(|e| {
130            ParseError::Plan(
131                ParseContext::new(line_no, line.to_string()),
132                MessageParseError::new("top_level_relation", ErrorKind::Syntax, Box::new(e)),
133            )
134        })?;
135
136        let outer = pairs.next().unwrap();
137        assert!(pairs.next().is_none());
138
139        // top_level_relation is either root_relation or planNode.
140        // If planNode, unwrap one more level to obtain the specific relation rule.
141        let inner = unwrap_single_pair(outer);
142        let pair = if inner.as_rule() == Rule::planNode {
143            unwrap_single_pair(inner)
144        } else {
145            inner // root_relation
146        };
147
148        // planNode can include addenda (+Enh:, +Opt:, +Ext:); surface them so
149        // TreeBuilder::add_line can produce the appropriate depth-0 error.
150        if pair.as_rule() == Rule::addendum {
151            return Ok(LineNode::Addendum(Addendum { pair, line_no }));
152        }
153
154        Ok(LineNode::Relation(RelationNode {
155            pair,
156            line_no,
157            addenda: Vec::new(),
158            children: Vec::new(),
159        }))
160    }
161}
162
163#[derive(Copy, Clone, Debug)]
164pub enum State {
165    // The initial state, before we have parsed any lines.
166    Initial,
167    // The extensions section, after parsing the header and any other Extension lines.
168    Extensions,
169    // The plan section, after parsing the header and any other Plan lines.
170    Plan,
171}
172
173impl fmt::Display for State {
174    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
175        write!(f, "{self:?}")
176    }
177}
178
179// An in-progress tree builder, building the tree of relations.
180#[derive(Debug, Clone, Default)]
181pub struct TreeBuilder<'a> {
182    // Current tree of nodes being built. These have been successfully parsed
183    // into Pest pairs, but have not yet been converted to substrait plans.
184    current: Option<RelationNode<'a>>,
185    // Completed trees that have been built.
186    completed: Vec<RelationNode<'a>>,
187}
188
189impl<'a> TreeBuilder<'a> {
190    /// Traverse down the tree, always taking the last child at each level, until reaching the specified depth.
191    pub fn get_at_depth(&mut self, depth: usize) -> Option<&mut RelationNode<'a>> {
192        let mut node = self.current.as_mut()?;
193        for _ in 0..depth {
194            node = node.children.last_mut()?;
195        }
196        Some(node)
197    }
198
199    pub fn add_line(&mut self, depth: usize, node: LineNode<'a>) -> Result<(), ParseError> {
200        match node {
201            LineNode::Relation(rel_node) => {
202                if depth == 0 {
203                    if let Some(prev) = self.current.take() {
204                        self.completed.push(prev);
205                    }
206                    self.current = Some(rel_node);
207                    return Ok(());
208                }
209
210                let parent = match self.get_at_depth(depth - 1) {
211                    None => {
212                        return Err(ParseError::Plan(
213                            rel_node.context(),
214                            MessageParseError::invalid(
215                                "relation",
216                                rel_node.pair.as_span(),
217                                format!("No parent found for depth {depth}"),
218                            ),
219                        ));
220                    }
221                    Some(parent) => parent,
222                };
223
224                parent.children.push(rel_node);
225            }
226            LineNode::Addendum(addendum) => {
227                let context =
228                    ParseContext::new(addendum.line_no, addendum.pair.as_str().to_string());
229                if depth == 0 {
230                    return Err(ParseError::ValidationError(
231                        context,
232                        "addenda (+ Enh: / + Opt: / + Ext:) cannot appear at the top level"
233                            .to_string(),
234                    ));
235                }
236
237                let parent = match self.get_at_depth(depth - 1) {
238                    None => {
239                        return Err(ParseError::ValidationError(
240                            context,
241                            format!("no parent found for addendum at depth {depth}"),
242                        ));
243                    }
244                    Some(parent) => parent,
245                };
246
247                if !parent.children.is_empty() {
248                    return Err(ParseError::ValidationError(
249                        context,
250                        "addenda (+ Enh: / + Opt: / + Ext:) must appear before child relations, \
251                         not after"
252                            .to_string(),
253                    ));
254                }
255
256                parent.addenda.push(addendum);
257            }
258        }
259        Ok(())
260    }
261
262    /// End of input - move any remaining nodes from stack to completed and
263    /// return any trees in progress. Resets the builder to its initial state
264    /// (empty)
265    /// Move any remaining nodes from stack to completed
266    pub fn finish(&mut self) -> Vec<RelationNode<'a>> {
267        if let Some(node) = self.current.take() {
268            self.completed.push(node);
269        }
270        std::mem::take(&mut self.completed)
271    }
272}
273
274/// Intermediate state for relation parsing: the structural tree data
275/// (children, addenda) has been parsed, but the relation's own grammar pair
276/// hasn't been converted to a protobuf relation yet.
277struct RelationContext<'a> {
278    pair: Pair<'a, Rule>,
279    line_no: i64,
280    #[allow(clippy::vec_box)]
281    children: Vec<Box<Rel>>,
282    input_field_count: usize,
283    addenda: Addenda<'a>,
284}
285
286/// A parsed addendum line plus enough source location to build later errors.
287#[derive(Debug, Clone)]
288struct ParsedAddendum<'a> {
289    line_no: i64,
290    line: &'a str,
291    invocation: AddendumInvocation,
292}
293
294impl<'a> ParsedAddendum<'a> {
295    fn parse(extensions: &SimpleExtensions, addendum: Addendum<'a>) -> Result<Self, ParseError> {
296        let line_no = addendum.line_no;
297        let line = addendum.pair.as_str();
298        let invocation = AddendumInvocation::parse_pair(extensions, addendum.pair)
299            .map_err(|e| ParseError::Plan(ParseContext::new(line_no, line.to_string()), e))?;
300        Ok(Self {
301            line_no,
302            line,
303            invocation,
304        })
305    }
306
307    fn context(&self) -> ParseContext {
308        ParseContext::new(self.line_no, self.line.to_string())
309    }
310
311    fn relation_context<'b>(
312        &'b self,
313        extensions: &'b SimpleExtensions,
314        registry: &'b ExtensionRegistry,
315    ) -> RelationParsingContext<'b> {
316        RelationParsingContext {
317            extensions,
318            registry,
319            line_no: self.line_no,
320            line: self.line,
321        }
322    }
323    fn resolve_detail(
324        &self,
325        extensions: &SimpleExtensions,
326        registry: &ExtensionRegistry,
327    ) -> Result<Any, ParseError> {
328        self.relation_context(extensions, registry)
329            .resolve_addendum_detail(
330                self.invocation.kind,
331                &self.invocation.name,
332                &self.invocation.args,
333            )
334    }
335}
336
337/// Parsed `+` lines attached to a relation.
338#[derive(Debug, Clone, Default)]
339struct Addenda<'a> {
340    items: Vec<ParsedAddendum<'a>>,
341}
342
343impl<'a> Addenda<'a> {
344    fn parse(
345        extensions: &SimpleExtensions,
346        addenda: Vec<Addendum<'a>>,
347    ) -> Result<Self, ParseError> {
348        let items = addenda
349            .into_iter()
350            .map(|addendum| ParsedAddendum::parse(extensions, addendum))
351            .collect::<Result<Vec<_>, ParseError>>()?;
352        Ok(Self { items })
353    }
354
355    fn first(&self) -> Option<&ParsedAddendum<'a>> {
356        self.items.first()
357    }
358
359    fn reject_all(&self, message: &'static str) -> Result<(), ParseError> {
360        if let Some(addendum) = self.first() {
361            return Err(ParseError::ValidationError(
362                addendum.context(),
363                message.to_string(),
364            ));
365        }
366        Ok(())
367    }
368
369    fn into_standard_advanced_extension(
370        self,
371        extensions: &SimpleExtensions,
372        registry: &ExtensionRegistry,
373    ) -> Result<Option<AdvancedExtension>, ParseError> {
374        let mut enhancement = None;
375        let mut optimizations = Vec::new();
376
377        for addendum in self.items {
378            match addendum.invocation.kind {
379                AddendumKind::Enhancement => {
380                    if enhancement.is_some() {
381                        return Err(ParseError::ValidationError(
382                            addendum.context(),
383                            "at most one enhancement per relation is allowed".to_string(),
384                        ));
385                    }
386                    enhancement = Some(addendum.resolve_detail(extensions, registry)?.into());
387                }
388                AddendumKind::Optimization => {
389                    optimizations.push(addendum.resolve_detail(extensions, registry)?.into());
390                }
391                AddendumKind::ExtensionTable => {
392                    return Err(ParseError::ValidationError(
393                        addendum.context(),
394                        "+ Ext addenda can only be used with Read:Extension".to_string(),
395                    ));
396                }
397            }
398        }
399
400        if enhancement.is_none() && optimizations.is_empty() {
401            return Ok(None);
402        }
403
404        Ok(Some(AdvancedExtension {
405            enhancement,
406            optimization: optimizations,
407        }))
408    }
409
410    fn into_extension_read_parts(
411        self,
412        extensions: &SimpleExtensions,
413        registry: &ExtensionRegistry,
414        relation_context: ParseContext,
415    ) -> Result<(Any, Option<AdvancedExtension>), ParseError> {
416        let mut extension_table = None;
417        let mut advanced_addenda = Vec::new();
418
419        for addendum in self.items {
420            match addendum.invocation.kind {
421                AddendumKind::ExtensionTable => {
422                    if extension_table.is_some() {
423                        return Err(ParseError::ValidationError(
424                            addendum.context(),
425                            "Read:Extension allows exactly one + Ext addendum".to_string(),
426                        ));
427                    }
428                    extension_table = Some(addendum);
429                }
430                AddendumKind::Enhancement | AddendumKind::Optimization => {
431                    advanced_addenda.push(addendum);
432                }
433            }
434        }
435
436        let extension_table = extension_table.ok_or_else(|| {
437            ParseError::ValidationError(
438                relation_context,
439                "Read:Extension requires exactly one + Ext addendum".to_string(),
440            )
441        })?;
442
443        let detail = extension_table.resolve_detail(extensions, registry)?;
444        let advanced_extension = Addenda {
445            items: advanced_addenda,
446        }
447        .into_standard_advanced_extension(extensions, registry)?;
448
449        Ok((detail, advanced_extension))
450    }
451}
452
453// Relation parsing component - handles converting LineNodes to Relations
454#[derive(Debug, Clone, Default)]
455pub struct RelationParser<'a> {
456    tree: TreeBuilder<'a>,
457}
458
459impl<'a> RelationParser<'a> {
460    pub fn parse_line(&mut self, line: IndentedLine<'a>, line_no: i64) -> Result<(), ParseError> {
461        let IndentedLine(depth, line) = line;
462
463        // Use parse_root for depth 0 (top-level relations), parse for other depths
464        let node = if depth == 0 {
465            LineNode::parse_root(line, line_no)?
466        } else {
467            LineNode::parse(line, line_no)?
468        };
469
470        self.tree.add_line(depth, node)
471    }
472
473    /// Dispatch by grammar rule after validating addenda constraints.
474    /// Standard relations go through [`parse_rel`](Self::parse_rel);
475    /// extension relations go through
476    /// [`parse_extension_relation`](Self::parse_extension_relation).
477    fn parse_relation(
478        &self,
479        extensions: &SimpleExtensions,
480        registry: &ExtensionRegistry,
481        ctx: RelationContext,
482    ) -> Result<(Rel, usize), ParseError> {
483        match ctx.pair.as_rule() {
484            Rule::extension_read_relation => {
485                self.parse_extension_read_relation(extensions, registry, ctx)
486            }
487            Rule::virtual_read_relation => {
488                self.parse_rel::<VirtualReadRel>(extensions, registry, ctx)
489            }
490            Rule::read_relation => self.parse_rel::<ReadRel>(extensions, registry, ctx),
491            Rule::filter_relation => self.parse_rel::<FilterRel>(extensions, registry, ctx),
492            Rule::project_relation => self.parse_rel::<ProjectRel>(extensions, registry, ctx),
493            Rule::aggregate_relation => self.parse_rel::<AggregateRel>(extensions, registry, ctx),
494            Rule::sort_relation => self.parse_rel::<SortRel>(extensions, registry, ctx),
495            Rule::fetch_relation => self.parse_rel::<FetchRel>(extensions, registry, ctx),
496            Rule::join_relation => self.parse_rel::<JoinRel>(extensions, registry, ctx),
497            Rule::extension_relation => self.parse_extension_relation(extensions, registry, ctx),
498            _ => unreachable!("unhandled relation rule: {:?}", ctx.pair.as_rule()),
499        }
500    }
501
502    /// Generic bridge between [`parse_relation`](Self::parse_relation) and
503    /// the [`RelationParsePair`] trait: wraps `MessageParseError` with line
504    /// context and calls [`into_rel`](RelationParsePair::into_rel) to apply
505    /// addenda and produce the final [`Rel`].
506    fn parse_rel<T: RelationParsePair>(
507        &self,
508        extensions: &SimpleExtensions,
509        registry: &ExtensionRegistry,
510        ctx: RelationContext,
511    ) -> Result<(Rel, usize), ParseError> {
512        let RelationContext {
513            pair,
514            line_no,
515            children,
516            input_field_count,
517            addenda,
518        } = ctx;
519        assert_eq!(pair.as_rule(), T::rule());
520        let line = pair.as_str();
521        let advanced_extension = addenda.into_standard_advanced_extension(extensions, registry)?;
522
523        match T::parse_pair_with_context(extensions, pair, children, input_field_count) {
524            Ok((parsed, count)) => Ok((parsed.into_rel(advanced_extension), count)),
525            Err(e) => Err(ParseError::Plan(
526                ParseContext::new(line_no, line.to_string()),
527                e,
528            )),
529        }
530    }
531
532    /// Handle extension relations separately from [`parse_rel`](Self::parse_rel)
533    /// because they need registry lookups that [`RelationParsePair`] doesn't
534    /// support.
535    fn parse_extension_relation(
536        &self,
537        extensions: &SimpleExtensions,
538        registry: &ExtensionRegistry,
539        ctx: RelationContext,
540    ) -> Result<(Rel, usize), ParseError> {
541        assert_eq!(ctx.pair.as_rule(), Rule::extension_relation);
542        let line_no = ctx.line_no;
543        let line = ctx.pair.as_str().to_string();
544        let pair_span = ctx.pair.as_span();
545
546        ctx.addenda
547            .reject_all("extension relations do not support addenda (+ Enh / + Opt / + Ext)")?;
548
549        let ExtensionInvocation {
550            relation_kind,
551            name,
552            args: extension_args,
553        } = ExtensionInvocation::parse_pair(extensions, ctx.pair.clone())
554            .map_err(|e| ParseError::Plan(ParseContext::new(line_no, line.clone()), e))?;
555
556        let child_count = ctx.children.len();
557        relation_kind
558            .validate_child_count(child_count)
559            .map_err(|e| {
560                ParseError::Plan(
561                    ParseContext::new(line_no, line.to_string()),
562                    MessageParseError::invalid("extension_relation", pair_span, e),
563                )
564            })?;
565
566        let context = RelationParsingContext {
567            extensions,
568            registry,
569            line_no,
570            line: &line,
571        };
572
573        let detail = context.resolve_extension_detail(&name, &extension_args)?;
574        let output_column_count = extension_args.output_columns.len();
575
576        let children = ctx.children.into_iter().map(|child| *child).collect();
577        let rel = relation_kind.create_rel(detail, children);
578
579        Ok((rel, output_column_count))
580    }
581
582    /// Parse `Read:Extension[...]`, whose table detail is supplied by exactly
583    /// one `+ Ext:Name[...]` addendum.
584    fn parse_extension_read_relation(
585        &self,
586        extensions: &SimpleExtensions,
587        registry: &ExtensionRegistry,
588        ctx: RelationContext,
589    ) -> Result<(Rel, usize), ParseError> {
590        assert_eq!(ctx.pair.as_rule(), Rule::extension_read_relation);
591        let context = ParseContext::new(ctx.line_no, ctx.pair.as_str().to_string());
592        let (detail, advanced_extension) =
593            ctx.addenda
594                .into_extension_read_parts(extensions, registry, context.clone())?;
595
596        ExtensionReadRel::parse_pair_with_detail(
597            extensions,
598            ctx.pair,
599            ctx.children,
600            ctx.input_field_count,
601            detail,
602            advanced_extension,
603        )
604        .map_err(|e| ParseError::Plan(context, e))
605    }
606
607    /// Walk the relation tree depth-first, converting structural types
608    /// (children, addenda) into proto types via [`RelationContext`].
609    /// Delegates grammar-rule-specific work to
610    /// [`parse_relation`](Self::parse_relation).
611    fn build_rel(
612        &self,
613        extensions: &SimpleExtensions,
614        registry: &ExtensionRegistry,
615        node: RelationNode,
616    ) -> Result<(Rel, usize), ParseError> {
617        let mut children: Vec<Box<Rel>> = Vec::new();
618        let mut input_field_count: usize = 0;
619        for child in node.children {
620            let (rel, count) = self.build_rel(extensions, registry, child)?;
621            input_field_count += count;
622            children.push(Box::new(rel));
623        }
624
625        let addenda = Addenda::parse(extensions, node.addenda)?;
626
627        self.parse_relation(
628            extensions,
629            registry,
630            RelationContext {
631                pair: node.pair,
632                line_no: node.line_no,
633                children,
634                input_field_count,
635                addenda,
636            },
637        )
638    }
639
640    /// Build a tree of relations.
641    fn build_plan_rel(
642        &self,
643        extensions: &SimpleExtensions,
644        registry: &ExtensionRegistry,
645        node: RelationNode,
646    ) -> Result<PlanRel, ParseError> {
647        // Plain relations are allowed as root relations; they just don't have names.
648        if node.pair.as_rule() != Rule::root_relation {
649            let (rel, _) = self.build_rel(extensions, registry, node)?;
650            return Ok(PlanRel {
651                rel_type: Some(plan_rel::RelType::Rel(rel)),
652            });
653        }
654
655        // Root relations don't support addenda — reject rather than silently discard.
656        if !node.addenda.is_empty() {
657            let first = &node.addenda[0];
658            let context = ParseContext::new(first.line_no, first.pair.as_str().to_string());
659            return Err(ParseError::ValidationError(
660                context,
661                "addenda (+ Enh: / + Opt: / + Ext:) are not supported on Root relations"
662                    .to_string(),
663            ));
664        }
665
666        // Named root relation.
667        let context = node.context();
668        let span = node.pair.as_span();
669
670        // Parse the column names
671        let column_names_pair = unwrap_single_pair(node.pair);
672        assert_eq!(column_names_pair.as_rule(), Rule::root_name_list);
673
674        let names: Vec<String> = column_names_pair
675            .into_inner()
676            .map(|name_pair| {
677                assert_eq!(name_pair.as_rule(), Rule::name);
678                Name::parse_pair(name_pair).0
679            })
680            .collect();
681
682        let mut children = node.children;
683        let child = match children.len() {
684            1 => {
685                let (rel, _) = self.build_rel(extensions, registry, children.pop().unwrap())?;
686                rel
687            }
688            n => {
689                return Err(ParseError::Plan(
690                    context,
691                    MessageParseError::invalid(
692                        "root_relation",
693                        span,
694                        format!("Root relation must have exactly one child, found {n}"),
695                    ),
696                ));
697            }
698        };
699
700        Ok(PlanRel {
701            rel_type: Some(plan_rel::RelType::Root(RelRoot {
702                names,
703                input: Some(child),
704            })),
705        })
706    }
707
708    /// Build all the trees.
709    fn build(
710        mut self,
711        extensions: &SimpleExtensions,
712        registry: &ExtensionRegistry,
713    ) -> Result<Vec<PlanRel>, ParseError> {
714        let nodes = self.tree.finish();
715        nodes
716            .into_iter()
717            .map(|n| self.build_plan_rel(extensions, registry, n))
718            .collect::<Result<Vec<PlanRel>, ParseError>>()
719    }
720}
721
722/// A parser for Substrait query plans in text format.
723///
724/// The `Parser` converts human-readable Substrait text format into Substrait
725/// protobuf plans. It handles both the extensions section (which defines
726/// functions, types, etc.) and the plan section (which defines the actual query
727/// structure).
728///
729/// ## Usage
730///
731/// The simplest entry point is the static `parse()` method:
732///
733/// ```rust
734/// use substrait_explain::parser::Parser;
735///
736/// let plan_text = r#"
737/// === Plan
738/// Root[c, d]
739///   Project[$1, 42]
740///     Read[schema.table => a:i64, b:string?]
741/// "#;
742///
743/// let plan = Parser::parse(plan_text).unwrap();
744/// ```
745///
746/// ## Input Format
747///
748/// The parser expects input in the following format:
749///
750/// ```text
751/// === Extensions
752/// URNs:
753///   @  1: https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml
754/// Functions:
755///   # 10 @  1: add
756/// === Plan
757/// Root[columns]
758///   Relation[arguments => columns]
759///     ChildRelation[arguments => columns]
760/// ```
761///
762/// - **Extensions section** (optional): Defines URNs and function/type declarations
763/// - **Plan section** (required): Defines the query structure with indented relations
764///
765/// ## Error Handling
766///
767/// The parser provides detailed error information including:
768/// - Line number where the error occurred
769/// - The actual line content that failed to parse
770/// - Specific error type and description
771///
772/// ```rust
773/// use substrait_explain::parser::Parser;
774///
775/// let invalid_plan = r#"
776/// === Plan
777/// InvalidRelation[invalid syntax]
778/// "#;
779///
780/// match Parser::parse(invalid_plan) {
781///     Ok(plan) => println!("Successfully parsed"),
782///     Err(e) => eprintln!("Parse error: {}", e),
783/// }
784/// ```
785///
786/// ## Supported Relations
787///
788/// The parser supports all standard Substrait relations:
789/// - `Read[table => columns]` - Read from a table
790/// - `Project[expressions]` - Project columns/expressions
791/// - `Filter[condition => columns]` - Filter rows
792/// - `Root[columns]` - Root relation with output columns
793/// - And more...
794///
795/// ## Extensions Support
796///
797/// The parser fully supports Substrait Simple Extensions, allowing you to:
798/// - Define custom functions with URNs and anchors
799/// - Reference functions by name in expressions
800/// - Use custom types and type variations
801///
802/// ```rust
803/// use substrait_explain::parser::Parser;
804///
805/// let plan_with_extensions = r#"
806/// === Extensions
807/// URNs:
808///   @  1: https://example.com/functions.yaml
809/// Functions:
810///   ## 10 @  1: my_custom_function
811/// === Plan
812/// Root[result]
813///   Project[my_custom_function($0, $1)]
814///     Read[table => col1:i32, col2:i32]
815/// "#;
816///
817/// let plan = Parser::parse(plan_with_extensions).unwrap();
818/// ```
819///
820/// ## Performance
821///
822/// The parser is designed for efficiency:
823/// - Single-pass parsing with minimal allocations
824/// - Early error detection and reporting
825/// - Memory-efficient tree building
826///
827/// ## Thread Safety
828///
829/// `Parser` instances are not thread-safe and should not be shared between threads.
830/// However, the static `parse()` method is safe to call from multiple threads.
831#[derive(Debug)]
832pub struct Parser<'a> {
833    line_no: i64,
834    state: State,
835    extension_parser: ExtensionParser,
836    extension_registry: ExtensionRegistry,
837    relation_parser: RelationParser<'a>,
838}
839impl<'a> Default for Parser<'a> {
840    fn default() -> Self {
841        Self::new()
842    }
843}
844
845impl<'a> Parser<'a> {
846    /// Parse a Substrait plan from text format.
847    ///
848    /// This is the main entry point for parsing.
849    ///
850    /// The input should be in the Substrait text format, which consists of:
851    /// - An optional extensions section starting with "=== Extensions"
852    /// - A plan section starting with "=== Plan"
853    /// - Indented relation definitions
854    ///
855    /// # Examples
856    ///
857    /// Simple parsing:
858    /// ```rust
859    /// use substrait_explain::parser::Parser;
860    ///
861    /// let plan_text = r#"
862    /// === Plan
863    /// Root[result]
864    ///   Read[table => col:i32]
865    /// "#;
866    ///
867    /// let plan = Parser::parse(plan_text).unwrap();
868    /// assert_eq!(plan.relations.len(), 1);
869    /// ```
870    ///
871    /// # Errors
872    ///
873    /// Returns a [`ParseError`] if the input cannot be parsed.
874    pub fn parse(input: &str) -> ParseResult {
875        Self::new().parse_plan(input)
876    }
877
878    /// Create a new parser with default configuration.
879    pub fn new() -> Self {
880        Self {
881            line_no: 1,
882            state: State::Initial,
883            extension_parser: ExtensionParser::default(),
884            extension_registry: ExtensionRegistry::new(),
885            relation_parser: RelationParser::default(),
886        }
887    }
888
889    /// Configure the parser to use the specified extension registry.
890    pub fn with_extension_registry(mut self, registry: ExtensionRegistry) -> Self {
891        self.extension_registry = registry;
892        self
893    }
894
895    /// Parse a Substrait plan with the current parser configuration.
896    pub fn parse_plan(mut self, input: &'a str) -> ParseResult {
897        for line in input.lines() {
898            if line.trim().is_empty() {
899                self.line_no += 1;
900                continue;
901            }
902
903            self.parse_line(line)?;
904            self.line_no += 1;
905        }
906
907        let plan = self.build_plan()?;
908        Ok(plan)
909    }
910
911    /// Parse a single line of input.
912    fn parse_line(&mut self, line: &'a str) -> Result<(), ParseError> {
913        let indented_line = IndentedLine::from(line);
914        let line_no = self.line_no;
915        let ctx = || ParseContext {
916            line_no,
917            line: line.to_string(),
918        };
919
920        match self.state {
921            State::Initial => self.parse_initial(indented_line),
922            State::Extensions => self
923                .parse_extensions(indented_line)
924                .map_err(|e| ParseError::Extension(ctx(), e)),
925            State::Plan => {
926                let IndentedLine(depth, line_str) = indented_line;
927
928                // Parse the line
929                let node = if depth == 0 {
930                    LineNode::parse_root(line_str, line_no)?
931                } else {
932                    LineNode::parse(line_str, line_no)?
933                };
934
935                self.relation_parser.tree.add_line(depth, node)
936            }
937        }
938    }
939
940    /// Parse the initial line(s) of the input, which is either a blank line or
941    /// the extensions or plan header.
942    fn parse_initial(&mut self, line: IndentedLine) -> Result<(), ParseError> {
943        match line {
944            IndentedLine(0, l) if l.trim().is_empty() => {}
945            IndentedLine(0, simple::EXTENSIONS_HEADER) => {
946                self.state = State::Extensions;
947            }
948            IndentedLine(0, PLAN_HEADER) => {
949                self.state = State::Plan;
950            }
951            IndentedLine(n, l) => {
952                return Err(ParseError::Initial(
953                    ParseContext::new(n as i64, l.to_string()),
954                    MessageParseError::invalid(
955                        "initial",
956                        pest::Span::new(l, 0, l.len()).expect("Invalid span?!"),
957                        format!("Unknown initial line: {l:?}"),
958                    ),
959                ));
960            }
961        }
962        Ok(())
963    }
964
965    /// Parse a single line from the extensions section of the input, updating
966    /// the parser state.
967    fn parse_extensions(&mut self, line: IndentedLine<'_>) -> Result<(), ExtensionParseError> {
968        if line == IndentedLine(0, PLAN_HEADER) {
969            self.state = State::Plan;
970            return Ok(());
971        }
972        self.extension_parser.parse_line(line)
973    }
974
975    /// Build the plan from the parser state with warning collection.
976    fn build_plan(self) -> Result<Plan, ParseError> {
977        let Parser {
978            relation_parser,
979            extension_parser,
980            extension_registry,
981            ..
982        } = self;
983
984        let extensions = extension_parser.extensions();
985
986        // Parse the tree into relations
987        let root_relations = relation_parser.build(extensions, &extension_registry)?;
988
989        // Build the final plan
990        Ok(Plan {
991            extension_urns: extensions.to_extension_urns(),
992            extensions: extensions.to_extension_declarations(),
993            relations: root_relations,
994            ..Default::default()
995        })
996    }
997}
998
999#[cfg(test)]
1000mod tests {
1001    use substrait::proto::extensions::simple_extension_declaration::MappingType;
1002    use substrait::proto::rel::RelType;
1003
1004    use super::*;
1005    use crate::extensions::simple::ExtensionKind;
1006    use crate::parser::extensions::ExtensionParserState;
1007
1008    #[test]
1009    fn test_parse_basic_block() {
1010        let mut expected_extensions = SimpleExtensions::new();
1011        expected_extensions
1012            .add_extension_urn("/urn/common".to_string(), 1)
1013            .unwrap();
1014        expected_extensions
1015            .add_extension_urn("/urn/specific_funcs".to_string(), 2)
1016            .unwrap();
1017        expected_extensions
1018            .add_extension(ExtensionKind::Function, 1, 10, "func_a".to_string())
1019            .unwrap();
1020        expected_extensions
1021            .add_extension(ExtensionKind::Function, 2, 11, "func_b_special".to_string())
1022            .unwrap();
1023        expected_extensions
1024            .add_extension(ExtensionKind::Type, 1, 20, "SomeType".to_string())
1025            .unwrap();
1026        expected_extensions
1027            .add_extension(ExtensionKind::TypeVariation, 2, 30, "VarX".to_string())
1028            .unwrap();
1029
1030        let mut parser = ExtensionParser::default();
1031        let input_block = r#"
1032URNs:
1033  @  1: /urn/common
1034  @  2: /urn/specific_funcs
1035Functions:
1036  # 10 @  1: func_a
1037  # 11 @  2: func_b_special
1038Types:
1039  # 20 @  1: SomeType
1040Type Variations:
1041  # 30 @  2: VarX
1042"#;
1043
1044        for line_str in input_block.trim().lines() {
1045            parser
1046                .parse_line(IndentedLine::from(line_str))
1047                .unwrap_or_else(|e| panic!("Failed to parse line \'{line_str}\': {e:?}"));
1048        }
1049
1050        assert_eq!(*parser.extensions(), expected_extensions);
1051
1052        let extensions_str = parser.extensions().to_string("  ");
1053        // The writer adds the header; the ExtensionParser does not parse the
1054        // header, so we add it here for comparison.
1055        let expected_str = format!(
1056            "{}\n{}",
1057            simple::EXTENSIONS_HEADER,
1058            input_block.trim_start()
1059        );
1060        assert_eq!(extensions_str.trim(), expected_str.trim());
1061        // Check final state after all lines are processed.
1062        // The last significant line in input_block is a TypeVariation declaration.
1063        assert_eq!(
1064            parser.state(),
1065            ExtensionParserState::ExtensionDeclarations(ExtensionKind::TypeVariation)
1066        );
1067
1068        // Check that a subsequent blank line correctly resets state to Extensions.
1069        parser.parse_line(IndentedLine(0, "")).unwrap();
1070        assert_eq!(parser.state(), ExtensionParserState::Extensions);
1071    }
1072
1073    /// Test that we can parse a larger extensions block and it matches the input.
1074    #[test]
1075    fn test_parse_complete_extension_block() {
1076        let mut parser = ExtensionParser::default();
1077        let input_block = r#"
1078URNs:
1079  @  1: /urn/common
1080  @  2: /urn/specific_funcs
1081  @  3: /urn/types_lib
1082  @  4: /urn/variations_lib
1083Functions:
1084  # 10 @  1: func_a
1085  # 11 @  2: func_b_special
1086  # 12 @  1: func_c_common
1087Types:
1088  # 20 @  1: CommonType
1089  # 21 @  3: LibraryType
1090  # 22 @  1: AnotherCommonType
1091Type Variations:
1092  # 30 @  4: VarX
1093  # 31 @  4: VarY
1094"#;
1095
1096        for line_str in input_block.trim().lines() {
1097            parser
1098                .parse_line(IndentedLine::from(line_str))
1099                .unwrap_or_else(|e| panic!("Failed to parse line \'{line_str}\': {e:?}"));
1100        }
1101
1102        let extensions_str = parser.extensions().to_string("  ");
1103        // The writer adds the header; the ExtensionParser does not parse the
1104        // header, so we add it here for comparison.
1105        let expected_str = format!(
1106            "{}\n{}",
1107            simple::EXTENSIONS_HEADER,
1108            input_block.trim_start()
1109        );
1110        assert_eq!(extensions_str.trim(), expected_str.trim());
1111    }
1112
1113    #[test]
1114    fn test_parse_relation_tree() {
1115        // Example plan with a Project, a Filter, and a Read, nested by indentation
1116        let plan = r#"=== Plan
1117Project[$0, $1, 42, 84]
1118  Filter[$2 => $0, $1]
1119    Read[my.table => a:i32, b:string?, c:boolean]
1120"#;
1121        let mut parser = Parser::default();
1122        for line in plan.lines() {
1123            parser.parse_line(line).unwrap();
1124        }
1125
1126        // Complete the current tree to convert it to relations
1127        let plan = parser.build_plan().unwrap();
1128
1129        let root_rel = &plan.relations[0].rel_type;
1130        let first_rel = match root_rel {
1131            Some(plan_rel::RelType::Rel(rel)) => rel,
1132            _ => panic!("Expected Rel type, got {root_rel:?}"),
1133        };
1134        // Root should be Project
1135        let project = match &first_rel.rel_type {
1136            Some(RelType::Project(p)) => p,
1137            other => panic!("Expected Project at root, got {other:?}"),
1138        };
1139
1140        // Check that Project has Filter as input
1141        assert!(project.input.is_some());
1142        let filter_input = project.input.as_ref().unwrap();
1143
1144        // Check that Filter has Read as input
1145        match &filter_input.rel_type {
1146            Some(RelType::Filter(_)) => {
1147                match &filter_input.rel_type {
1148                    Some(RelType::Filter(filter)) => {
1149                        assert!(filter.input.is_some());
1150                        let read_input = filter.input.as_ref().unwrap();
1151
1152                        // Check that Read has no input (it's a leaf)
1153                        match &read_input.rel_type {
1154                            Some(RelType::Read(_)) => {}
1155                            other => panic!("Expected Read relation, got {other:?}"),
1156                        }
1157                    }
1158                    other => panic!("Expected Filter relation, got {other:?}"),
1159                }
1160            }
1161            other => panic!("Expected Filter relation, got {other:?}"),
1162        }
1163    }
1164
1165    #[test]
1166    fn test_parse_root_relation() {
1167        // Test a plan with a Root relation
1168        let plan = r#"=== Plan
1169Root[result]
1170  Project[$0, $1]
1171    Read[my.table => a:i32, b:string?]
1172"#;
1173        let mut parser = Parser::default();
1174        for line in plan.lines() {
1175            parser.parse_line(line).unwrap();
1176        }
1177
1178        let plan = parser.build_plan().unwrap();
1179
1180        // Check that we have exactly one relation
1181        assert_eq!(plan.relations.len(), 1);
1182
1183        let root_rel = &plan.relations[0].rel_type;
1184        let rel_root = match root_rel {
1185            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
1186            other => panic!("Expected Root type, got {other:?}"),
1187        };
1188
1189        // Check that the root has the correct name
1190        assert_eq!(rel_root.names, vec!["result"]);
1191
1192        // Check that the root has a Project as input
1193        let project_input = match &rel_root.input {
1194            Some(rel) => rel,
1195            None => panic!("Root should have an input"),
1196        };
1197
1198        let project = match &project_input.rel_type {
1199            Some(RelType::Project(p)) => p,
1200            other => panic!("Expected Project as root input, got {other:?}"),
1201        };
1202
1203        // Check that Project has Read as input
1204        let read_input = match &project.input {
1205            Some(rel) => rel,
1206            None => panic!("Project should have an input"),
1207        };
1208
1209        match &read_input.rel_type {
1210            Some(RelType::Read(_)) => {}
1211            other => panic!("Expected Read relation, got {other:?}"),
1212        }
1213    }
1214
1215    #[test]
1216    fn test_parse_root_relation_no_names() {
1217        // Test a plan with a Root relation with no names
1218        let plan = r#"=== Plan
1219Root[]
1220  Project[$0, $1]
1221    Read[my.table => a:i32, b:string?]
1222"#;
1223        let mut parser = Parser::default();
1224        for line in plan.lines() {
1225            parser.parse_line(line).unwrap();
1226        }
1227
1228        let plan = parser.build_plan().unwrap();
1229
1230        let root_rel = &plan.relations[0].rel_type;
1231        let rel_root = match root_rel {
1232            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
1233            other => panic!("Expected Root type, got {other:?}"),
1234        };
1235
1236        // Check that the root has no names
1237        assert_eq!(rel_root.names, Vec::<String>::new());
1238    }
1239
1240    #[test]
1241    fn test_parse_full_plan() {
1242        // Test a complete Substrait plan with extensions and relations
1243        let input = r#"
1244=== Extensions
1245URNs:
1246  @  1: /urn/common
1247  @  2: /urn/specific_funcs
1248Functions:
1249  # 10 @  1: func_a
1250  # 11 @  2: func_b_special
1251Types:
1252  # 20 @  1: SomeType
1253Type Variations:
1254  # 30 @  2: VarX
1255
1256=== Plan
1257Project[$0, $1, 42, 84]
1258  Filter[$2 => $0, $1]
1259    Read[my.table => a:i32, b:string?, c:boolean]
1260"#;
1261
1262        let plan = Parser::parse(input).unwrap();
1263
1264        // Verify the plan structure
1265        assert_eq!(plan.extension_urns.len(), 2);
1266        assert_eq!(plan.extensions.len(), 4);
1267        assert_eq!(plan.relations.len(), 1);
1268
1269        // Verify extension URIs
1270        let urn1 = &plan.extension_urns[0];
1271        assert_eq!(urn1.extension_urn_anchor, 1);
1272        assert_eq!(urn1.urn, "/urn/common");
1273
1274        let urn2 = &plan.extension_urns[1];
1275        assert_eq!(urn2.extension_urn_anchor, 2);
1276        assert_eq!(urn2.urn, "/urn/specific_funcs");
1277
1278        // Verify extensions
1279        let func1 = &plan.extensions[0];
1280        match &func1.mapping_type {
1281            Some(MappingType::ExtensionFunction(f)) => {
1282                assert_eq!(f.function_anchor, 10);
1283                assert_eq!(f.extension_urn_reference, 1);
1284                assert_eq!(f.name, "func_a");
1285            }
1286            other => panic!("Expected ExtensionFunction, got {other:?}"),
1287        }
1288
1289        let func2 = &plan.extensions[1];
1290        match &func2.mapping_type {
1291            Some(MappingType::ExtensionFunction(f)) => {
1292                assert_eq!(f.function_anchor, 11);
1293                assert_eq!(f.extension_urn_reference, 2);
1294                assert_eq!(f.name, "func_b_special");
1295            }
1296            other => panic!("Expected ExtensionFunction, got {other:?}"),
1297        }
1298
1299        let type1 = &plan.extensions[2];
1300        match &type1.mapping_type {
1301            Some(MappingType::ExtensionType(t)) => {
1302                assert_eq!(t.type_anchor, 20);
1303                assert_eq!(t.extension_urn_reference, 1);
1304                assert_eq!(t.name, "SomeType");
1305            }
1306            other => panic!("Expected ExtensionType, got {other:?}"),
1307        }
1308
1309        let var1 = &plan.extensions[3];
1310        match &var1.mapping_type {
1311            Some(MappingType::ExtensionTypeVariation(v)) => {
1312                assert_eq!(v.type_variation_anchor, 30);
1313                assert_eq!(v.extension_urn_reference, 2);
1314                assert_eq!(v.name, "VarX");
1315            }
1316            other => panic!("Expected ExtensionTypeVariation, got {other:?}"),
1317        }
1318
1319        // Verify the relation tree structure
1320        let root_rel = &plan.relations[0];
1321        match &root_rel.rel_type {
1322            Some(plan_rel::RelType::Rel(rel)) => {
1323                match &rel.rel_type {
1324                    Some(RelType::Project(project)) => {
1325                        // Verify Project relation
1326                        assert_eq!(project.expressions.len(), 2); // 42 and 84
1327                        assert!(project.input.is_some()); // Should have Filter as input
1328
1329                        // Check the Filter input
1330                        let filter_input = project.input.as_ref().unwrap();
1331                        match &filter_input.rel_type {
1332                            Some(RelType::Filter(filter)) => {
1333                                assert!(filter.input.is_some()); // Should have Read as input
1334
1335                                // Check the Read input
1336                                let read_input = filter.input.as_ref().unwrap();
1337                                match &read_input.rel_type {
1338                                    Some(RelType::Read(read)) => {
1339                                        // Verify Read relation
1340                                        let schema = read.base_schema.as_ref().unwrap();
1341                                        assert_eq!(schema.names.len(), 3);
1342                                        assert_eq!(schema.names[0], "a");
1343                                        assert_eq!(schema.names[1], "b");
1344                                        assert_eq!(schema.names[2], "c");
1345
1346                                        let struct_ = schema.r#struct.as_ref().unwrap();
1347                                        assert_eq!(struct_.types.len(), 3);
1348                                    }
1349                                    other => panic!("Expected Read relation, got {other:?}"),
1350                                }
1351                            }
1352                            other => panic!("Expected Filter relation, got {other:?}"),
1353                        }
1354                    }
1355                    other => panic!("Expected Project relation, got {other:?}"),
1356                }
1357            }
1358            other => panic!("Expected Rel type, got {other:?}"),
1359        }
1360    }
1361}