Skip to main content

substrait_explain/parser/
structural.rs

1//! Parser for the structural part of the Substrait file format.
2//!
3//! This is the overall parser for parsing the text format. It is responsible
4//! for tracking which section of the file we are currently parsing, and parsing
5//! each line separately.
6
7use std::fmt;
8
9use pest::iterators::Pair;
10use substrait::proto::extensions::AdvancedExtension;
11use substrait::proto::{
12    AggregateRel, FetchRel, FilterRel, JoinRel, Plan, PlanRel, ProjectRel, ReadRel, Rel, RelRoot,
13    SortRel, plan_rel,
14};
15
16use crate::extensions::any::Any;
17use crate::extensions::{AddendumKind, ExtensionRegistry, SimpleExtensions, simple};
18use crate::parser::common::{MessageParseError, ParsePair, ScopedParsePair};
19use crate::parser::errors::{ParseContext, ParseError, ParseResult};
20use crate::parser::expressions::Name;
21use crate::parser::extensions::{
22    AddendumInvocation, ExtensionInvocation, ExtensionParseError, ExtensionParser,
23};
24use crate::parser::relations::{ExtensionReadRel, RelationParsingContext, VirtualReadRel};
25use crate::parser::{ErrorKind, ExpressionParser, RelationParsePair, Rule, unwrap_single_pair};
26
/// Header line that switches the parser into the plan section.
pub const PLAN_HEADER: &str = "=== Plan";
28
/// An input line split into its indentation depth and its remaining text.
///
/// Field `0` is the number of complete two-space indents at the start of the
/// line. Field `1` is the line with those indents and any trailing whitespace
/// removed. A leftover odd leading space does not count as an indent and so
/// stays at the front of the text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndentedLine<'a>(pub usize, pub &'a str);

impl<'a> From<&'a str> for IndentedLine<'a> {
    /// Trims trailing whitespace, counts the leading two-space indents, and
    /// returns the depth together with the de-indented text.
    fn from(line: &'a str) -> Self {
        let line = line.trim_end();
        let leading_spaces = line.bytes().take_while(|&b| b == b' ').count();
        let indents = leading_spaces / 2;

        // Only ASCII spaces are skipped, so the cut is always on a char boundary.
        IndentedLine(indents, &line[indents * 2..])
    }
}
53
54/// A `+`-prefixed addendum line attached to a relation node. The `pair` holds
55/// the grammar rule directly (already unwrapped from the outer `planNode`).
56#[derive(Debug, Clone)]
57pub struct Addendum<'a> {
58    pub pair: Pair<'a, Rule>, // Rule::addendum
59    pub line_no: i64,
60}
61
62/// A relation node in the plan tree, before conversion to a Substrait proto.
63#[derive(Debug, Clone)]
64pub struct RelationNode<'a> {
65    pub pair: Pair<'a, Rule>,
66    pub line_no: i64,
67    pub addenda: Vec<Addendum<'a>>,
68    pub children: Vec<RelationNode<'a>>,
69}
70
71impl<'a> RelationNode<'a> {
72    pub fn context(&self) -> ParseContext {
73        ParseContext {
74            line_no: self.line_no,
75            line: self.pair.as_str().to_string(),
76        }
77    }
78}
79
80/// A parsed plan line: either a relation or a `+`-prefixed addendum line.
81///
82/// Classification happens at construction time by inspecting the inner grammar
83/// rule, so downstream code can use standard Rust pattern matching rather than
84/// runtime rule inspection.
85#[derive(Debug, Clone)]
86pub enum LineNode<'a> {
87    Relation(RelationNode<'a>),
88    Addendum(Addendum<'a>),
89}
90
91impl<'a> LineNode<'a> {
92    pub fn parse(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
93        let mut pairs: pest::iterators::Pairs<'a, Rule> =
94            <ExpressionParser as pest::Parser<Rule>>::parse(Rule::planNode, line).map_err(|e| {
95                ParseError::Plan(
96                    ParseContext {
97                        line_no,
98                        line: line.to_string(),
99                    },
100                    MessageParseError::new("planNode", ErrorKind::InvalidValue, Box::new(e)),
101                )
102            })?;
103
104        let outer = pairs.next().unwrap();
105        assert!(pairs.next().is_none()); // Should be exactly one pair
106        let inner = unwrap_single_pair(outer);
107
108        Ok(match inner.as_rule() {
109            Rule::addendum => LineNode::Addendum(Addendum {
110                pair: inner,
111                line_no,
112            }),
113            _ => LineNode::Relation(RelationNode {
114                pair: inner,
115                line_no,
116                addenda: Vec::new(),
117                children: Vec::new(),
118            }),
119        })
120    }
121
122    /// Parse the line as a top-level relation at depth 0 (either root_relation or regular relation)
123    pub fn parse_root(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
124        let mut pairs: pest::iterators::Pairs<'a, Rule> = <ExpressionParser as pest::Parser<
125            Rule,
126        >>::parse(
127            Rule::top_level_relation, line
128        )
129        .map_err(|e| {
130            ParseError::Plan(
131                ParseContext::new(line_no, line.to_string()),
132                MessageParseError::new("top_level_relation", ErrorKind::Syntax, Box::new(e)),
133            )
134        })?;
135
136        let outer = pairs.next().unwrap();
137        assert!(pairs.next().is_none());
138
139        // top_level_relation is either root_relation or planNode.
140        // If planNode, unwrap one more level to obtain the specific relation rule.
141        let inner = unwrap_single_pair(outer);
142        let pair = if inner.as_rule() == Rule::planNode {
143            unwrap_single_pair(inner)
144        } else {
145            inner // root_relation
146        };
147
148        // planNode can include addenda (+Enh:, +Opt:, +Ext:); surface them so
149        // TreeBuilder::add_line can produce the appropriate depth-0 error.
150        if pair.as_rule() == Rule::addendum {
151            return Ok(LineNode::Addendum(Addendum { pair, line_no }));
152        }
153
154        Ok(LineNode::Relation(RelationNode {
155            pair,
156            line_no,
157            addenda: Vec::new(),
158            children: Vec::new(),
159        }))
160    }
161}
162
/// The section of the input the parser is currently in.
#[derive(Copy, Clone, Debug)]
pub enum State {
    /// The initial state, before any header line has been parsed.
    Initial,
    /// Inside the extensions section, after its header.
    Extensions,
    /// Inside the plan section, after its header.
    Plan,
}

impl fmt::Display for State {
    /// Displays the state as its variant name, the same text as `Debug`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(self, f)
    }
}
178
179// An in-progress tree builder, building the tree of relations.
180#[derive(Debug, Clone, Default)]
181pub struct TreeBuilder<'a> {
182    // Current tree of nodes being built. These have been successfully parsed
183    // into Pest pairs, but have not yet been converted to substrait plans.
184    current: Option<RelationNode<'a>>,
185    // Completed trees that have been built.
186    completed: Vec<RelationNode<'a>>,
187}
188
189impl<'a> TreeBuilder<'a> {
190    /// Traverse down the tree, always taking the last child at each level, until reaching the specified depth.
191    pub fn get_at_depth(&mut self, depth: usize) -> Option<&mut RelationNode<'a>> {
192        let mut node = self.current.as_mut()?;
193        for _ in 0..depth {
194            node = node.children.last_mut()?;
195        }
196        Some(node)
197    }
198
199    pub fn add_line(&mut self, depth: usize, node: LineNode<'a>) -> Result<(), ParseError> {
200        match node {
201            LineNode::Relation(rel_node) => {
202                if depth == 0 {
203                    if let Some(prev) = self.current.take() {
204                        self.completed.push(prev);
205                    }
206                    self.current = Some(rel_node);
207                    return Ok(());
208                }
209
210                let parent = match self.get_at_depth(depth - 1) {
211                    None => {
212                        return Err(ParseError::Plan(
213                            rel_node.context(),
214                            MessageParseError::invalid(
215                                "relation",
216                                rel_node.pair.as_span(),
217                                format!("No parent found for depth {depth}"),
218                            ),
219                        ));
220                    }
221                    Some(parent) => parent,
222                };
223
224                parent.children.push(rel_node);
225            }
226            LineNode::Addendum(addendum) => {
227                let context =
228                    ParseContext::new(addendum.line_no, addendum.pair.as_str().to_string());
229                if depth == 0 {
230                    return Err(ParseError::ValidationError(
231                        context,
232                        "addenda (+ Enh: / + Opt: / + Ext:) cannot appear at the top level"
233                            .to_string(),
234                    ));
235                }
236
237                let parent = match self.get_at_depth(depth - 1) {
238                    None => {
239                        return Err(ParseError::ValidationError(
240                            context,
241                            format!("no parent found for addendum at depth {depth}"),
242                        ));
243                    }
244                    Some(parent) => parent,
245                };
246
247                if !parent.children.is_empty() {
248                    return Err(ParseError::ValidationError(
249                        context,
250                        "addenda (+ Enh: / + Opt: / + Ext:) must appear before child relations, \
251                         not after"
252                            .to_string(),
253                    ));
254                }
255
256                parent.addenda.push(addendum);
257            }
258        }
259        Ok(())
260    }
261
262    /// End of input - move any remaining nodes from stack to completed and
263    /// return any trees in progress. Resets the builder to its initial state
264    /// (empty)
265    /// Move any remaining nodes from stack to completed
266    pub fn finish(&mut self) -> Vec<RelationNode<'a>> {
267        if let Some(node) = self.current.take() {
268            self.completed.push(node);
269        }
270        std::mem::take(&mut self.completed)
271    }
272}
273
274/// Intermediate state for relation parsing: the structural tree data
275/// (children, addenda) has been parsed, but the relation's own grammar pair
276/// hasn't been converted to a protobuf relation yet.
277struct RelationContext<'a> {
278    pair: Pair<'a, Rule>,
279    line_no: i64,
280    children: Vec<Rel>,
281    input_field_count: usize,
282    addenda: Addenda<'a>,
283}
284
285/// A parsed addendum line plus enough source location to build later errors.
286#[derive(Debug, Clone)]
287struct ParsedAddendum<'a> {
288    line_no: i64,
289    line: &'a str,
290    invocation: AddendumInvocation,
291}
292
293impl<'a> ParsedAddendum<'a> {
294    fn parse(extensions: &SimpleExtensions, addendum: Addendum<'a>) -> Result<Self, ParseError> {
295        let line_no = addendum.line_no;
296        let line = addendum.pair.as_str();
297        let invocation = AddendumInvocation::parse_pair(extensions, addendum.pair)
298            .map_err(|e| ParseError::Plan(ParseContext::new(line_no, line.to_string()), e))?;
299        Ok(Self {
300            line_no,
301            line,
302            invocation,
303        })
304    }
305
306    fn context(&self) -> ParseContext {
307        ParseContext::new(self.line_no, self.line.to_string())
308    }
309
310    fn relation_context<'b>(
311        &'b self,
312        registry: &'b ExtensionRegistry,
313    ) -> RelationParsingContext<'b> {
314        RelationParsingContext {
315            registry,
316            line_no: self.line_no,
317            line: self.line,
318        }
319    }
320    fn resolve_detail(&self, registry: &ExtensionRegistry) -> Result<Any, ParseError> {
321        self.relation_context(registry).resolve_addendum_detail(
322            self.invocation.kind,
323            &self.invocation.name,
324            &self.invocation.args,
325        )
326    }
327}
328
329/// Parsed `+` lines attached to a relation.
330#[derive(Debug, Clone, Default)]
331struct Addenda<'a> {
332    items: Vec<ParsedAddendum<'a>>,
333}
334
335impl<'a> Addenda<'a> {
336    fn parse(
337        extensions: &SimpleExtensions,
338        addenda: Vec<Addendum<'a>>,
339    ) -> Result<Self, ParseError> {
340        let items = addenda
341            .into_iter()
342            .map(|addendum| ParsedAddendum::parse(extensions, addendum))
343            .collect::<Result<Vec<_>, ParseError>>()?;
344        Ok(Self { items })
345    }
346
347    fn first(&self) -> Option<&ParsedAddendum<'a>> {
348        self.items.first()
349    }
350
351    fn reject_all(&self, message: &'static str) -> Result<(), ParseError> {
352        if let Some(addendum) = self.first() {
353            return Err(ParseError::ValidationError(
354                addendum.context(),
355                message.to_string(),
356            ));
357        }
358        Ok(())
359    }
360
361    fn into_standard_advanced_extension(
362        self,
363        registry: &ExtensionRegistry,
364    ) -> Result<Option<AdvancedExtension>, ParseError> {
365        let mut enhancement = None;
366        let mut optimizations = Vec::new();
367
368        for addendum in self.items {
369            match addendum.invocation.kind {
370                AddendumKind::Enhancement => {
371                    if enhancement.is_some() {
372                        return Err(ParseError::ValidationError(
373                            addendum.context(),
374                            "at most one enhancement per relation is allowed".to_string(),
375                        ));
376                    }
377                    enhancement = Some(addendum.resolve_detail(registry)?.into());
378                }
379                AddendumKind::Optimization => {
380                    optimizations.push(addendum.resolve_detail(registry)?.into());
381                }
382                AddendumKind::ExtensionTable => {
383                    return Err(ParseError::ValidationError(
384                        addendum.context(),
385                        "+ Ext addenda can only be used with Read:Extension".to_string(),
386                    ));
387                }
388            }
389        }
390
391        if enhancement.is_none() && optimizations.is_empty() {
392            return Ok(None);
393        }
394
395        Ok(Some(AdvancedExtension {
396            enhancement,
397            optimization: optimizations,
398        }))
399    }
400
401    fn into_extension_read_parts(
402        self,
403        registry: &ExtensionRegistry,
404        relation_context: ParseContext,
405    ) -> Result<(Any, Option<AdvancedExtension>), ParseError> {
406        let mut extension_table = None;
407        let mut advanced_addenda = Vec::new();
408
409        for addendum in self.items {
410            match addendum.invocation.kind {
411                AddendumKind::ExtensionTable => {
412                    if extension_table.is_some() {
413                        return Err(ParseError::ValidationError(
414                            addendum.context(),
415                            "Read:Extension allows exactly one + Ext addendum".to_string(),
416                        ));
417                    }
418                    extension_table = Some(addendum);
419                }
420                AddendumKind::Enhancement | AddendumKind::Optimization => {
421                    advanced_addenda.push(addendum);
422                }
423            }
424        }
425
426        let extension_table = extension_table.ok_or_else(|| {
427            ParseError::ValidationError(
428                relation_context,
429                "Read:Extension requires exactly one + Ext addendum".to_string(),
430            )
431        })?;
432
433        let detail = extension_table.resolve_detail(registry)?;
434        let advanced_extension = Addenda {
435            items: advanced_addenda,
436        }
437        .into_standard_advanced_extension(registry)?;
438
439        Ok((detail, advanced_extension))
440    }
441}
442
443// Relation parsing component - handles converting LineNodes to Relations
444#[derive(Debug, Clone, Default)]
445pub struct RelationParser<'a> {
446    tree: TreeBuilder<'a>,
447}
448
449impl<'a> RelationParser<'a> {
450    /// Dispatch by grammar rule after validating addenda constraints.
451    /// Standard relations go through [`parse_rel`](Self::parse_rel);
452    /// extension relations go through
453    /// [`parse_extension_relation`](Self::parse_extension_relation).
454    fn parse_relation(
455        &self,
456        extensions: &SimpleExtensions,
457        registry: &ExtensionRegistry,
458        ctx: RelationContext,
459    ) -> Result<(Rel, usize), ParseError> {
460        match ctx.pair.as_rule() {
461            Rule::extension_read_relation => {
462                self.parse_extension_read_relation(extensions, registry, ctx)
463            }
464            Rule::virtual_read_relation => {
465                self.parse_rel::<VirtualReadRel>(extensions, registry, ctx)
466            }
467            Rule::read_relation => self.parse_rel::<ReadRel>(extensions, registry, ctx),
468            Rule::filter_relation => self.parse_rel::<FilterRel>(extensions, registry, ctx),
469            Rule::project_relation => self.parse_rel::<ProjectRel>(extensions, registry, ctx),
470            Rule::aggregate_relation => self.parse_rel::<AggregateRel>(extensions, registry, ctx),
471            Rule::sort_relation => self.parse_rel::<SortRel>(extensions, registry, ctx),
472            Rule::fetch_relation => self.parse_rel::<FetchRel>(extensions, registry, ctx),
473            Rule::join_relation => self.parse_rel::<JoinRel>(extensions, registry, ctx),
474            Rule::extension_relation => self.parse_extension_relation(extensions, registry, ctx),
475            _ => unreachable!("unhandled relation rule: {:?}", ctx.pair.as_rule()),
476        }
477    }
478
479    /// Generic bridge between [`parse_relation`](Self::parse_relation) and
480    /// the [`RelationParsePair`] trait: wraps `MessageParseError` with line
481    /// context and calls [`into_rel`](RelationParsePair::into_rel) to apply
482    /// addenda and produce the final [`Rel`].
483    fn parse_rel<T: RelationParsePair>(
484        &self,
485        extensions: &SimpleExtensions,
486        registry: &ExtensionRegistry,
487        ctx: RelationContext,
488    ) -> Result<(Rel, usize), ParseError> {
489        let RelationContext {
490            pair,
491            line_no,
492            children,
493            input_field_count,
494            addenda,
495        } = ctx;
496        assert_eq!(pair.as_rule(), T::rule());
497        let line = pair.as_str();
498        let advanced_extension = addenda.into_standard_advanced_extension(registry)?;
499
500        match T::parse_pair_with_context(extensions, pair, children, input_field_count) {
501            Ok((parsed, count)) => Ok((parsed.into_rel(advanced_extension), count)),
502            Err(e) => Err(ParseError::Plan(
503                ParseContext::new(line_no, line.to_string()),
504                e,
505            )),
506        }
507    }
508
509    /// Handle extension relations separately from [`parse_rel`](Self::parse_rel)
510    /// because they need registry lookups that [`RelationParsePair`] doesn't
511    /// support.
512    fn parse_extension_relation(
513        &self,
514        extensions: &SimpleExtensions,
515        registry: &ExtensionRegistry,
516        ctx: RelationContext,
517    ) -> Result<(Rel, usize), ParseError> {
518        assert_eq!(ctx.pair.as_rule(), Rule::extension_relation);
519        let line_no = ctx.line_no;
520        let line = ctx.pair.as_str().to_string();
521        let pair_span = ctx.pair.as_span();
522
523        ctx.addenda
524            .reject_all("extension relations do not support addenda (+ Enh / + Opt / + Ext)")?;
525
526        let ExtensionInvocation {
527            relation_kind,
528            name,
529            args: extension_args,
530        } = ExtensionInvocation::parse_pair(extensions, ctx.pair.clone())
531            .map_err(|e| ParseError::Plan(ParseContext::new(line_no, line.clone()), e))?;
532
533        let child_count = ctx.children.len();
534        relation_kind
535            .validate_child_count(child_count)
536            .map_err(|e| {
537                ParseError::Plan(
538                    ParseContext::new(line_no, line.to_string()),
539                    MessageParseError::invalid("extension_relation", pair_span, e),
540                )
541            })?;
542
543        let context = RelationParsingContext {
544            registry,
545            line_no,
546            line: &line,
547        };
548
549        let detail = context.resolve_extension_detail(&name, &extension_args)?;
550        let output_column_count = extension_args.output_columns.len();
551
552        let rel = relation_kind.create_rel(detail, ctx.children);
553
554        Ok((rel, output_column_count))
555    }
556
557    /// Parse `Read:Extension[...]`, whose table detail is supplied by exactly
558    /// one `+ Ext:Name[...]` addendum.
559    fn parse_extension_read_relation(
560        &self,
561        extensions: &SimpleExtensions,
562        registry: &ExtensionRegistry,
563        ctx: RelationContext,
564    ) -> Result<(Rel, usize), ParseError> {
565        assert_eq!(ctx.pair.as_rule(), Rule::extension_read_relation);
566        let context = ParseContext::new(ctx.line_no, ctx.pair.as_str().to_string());
567        let (detail, advanced_extension) = ctx
568            .addenda
569            .into_extension_read_parts(registry, context.clone())?;
570
571        ExtensionReadRel::parse_pair_with_detail(
572            extensions,
573            ctx.pair,
574            ctx.children,
575            ctx.input_field_count,
576            detail,
577            advanced_extension,
578        )
579        .map_err(|e| ParseError::Plan(context, e))
580    }
581
582    /// Walk the relation tree depth-first, converting structural types
583    /// (children, addenda) into proto types via [`RelationContext`].
584    /// Delegates grammar-rule-specific work to
585    /// [`parse_relation`](Self::parse_relation).
586    fn build_rel(
587        &self,
588        extensions: &SimpleExtensions,
589        registry: &ExtensionRegistry,
590        node: RelationNode,
591    ) -> Result<(Rel, usize), ParseError> {
592        let mut children: Vec<Rel> = Vec::new();
593        let mut input_field_count: usize = 0;
594        for child in node.children {
595            let (rel, count) = self.build_rel(extensions, registry, child)?;
596            input_field_count += count;
597            children.push(rel);
598        }
599
600        let addenda = Addenda::parse(extensions, node.addenda)?;
601
602        self.parse_relation(
603            extensions,
604            registry,
605            RelationContext {
606                pair: node.pair,
607                line_no: node.line_no,
608                children,
609                input_field_count,
610                addenda,
611            },
612        )
613    }
614
615    /// Build a tree of relations.
616    fn build_plan_rel(
617        &self,
618        extensions: &SimpleExtensions,
619        registry: &ExtensionRegistry,
620        node: RelationNode,
621    ) -> Result<PlanRel, ParseError> {
622        // Plain relations are allowed as root relations; they just don't have names.
623        if node.pair.as_rule() != Rule::root_relation {
624            let (rel, _) = self.build_rel(extensions, registry, node)?;
625            return Ok(PlanRel {
626                rel_type: Some(plan_rel::RelType::Rel(rel)),
627            });
628        }
629
630        // Root relations don't support addenda — reject rather than silently discard.
631        if !node.addenda.is_empty() {
632            let first = &node.addenda[0];
633            let context = ParseContext::new(first.line_no, first.pair.as_str().to_string());
634            return Err(ParseError::ValidationError(
635                context,
636                "addenda (+ Enh: / + Opt: / + Ext:) are not supported on Root relations"
637                    .to_string(),
638            ));
639        }
640
641        // Named root relation.
642        let context = node.context();
643        let span = node.pair.as_span();
644
645        // Parse the column names
646        let column_names_pair = unwrap_single_pair(node.pair);
647        assert_eq!(column_names_pair.as_rule(), Rule::root_name_list);
648
649        let names: Vec<String> = column_names_pair
650            .into_inner()
651            .map(|name_pair| {
652                assert_eq!(name_pair.as_rule(), Rule::name);
653                Name::parse_pair(name_pair).0
654            })
655            .collect();
656
657        let mut children = node.children;
658        let child = match children.len() {
659            1 => {
660                let (rel, _) = self.build_rel(extensions, registry, children.pop().unwrap())?;
661                rel
662            }
663            n => {
664                return Err(ParseError::Plan(
665                    context,
666                    MessageParseError::invalid(
667                        "root_relation",
668                        span,
669                        format!("Root relation must have exactly one child, found {n}"),
670                    ),
671                ));
672            }
673        };
674
675        Ok(PlanRel {
676            rel_type: Some(plan_rel::RelType::Root(RelRoot {
677                names,
678                input: Some(child),
679            })),
680        })
681    }
682
683    /// Build all the trees.
684    fn build(
685        mut self,
686        extensions: &SimpleExtensions,
687        registry: &ExtensionRegistry,
688    ) -> Result<Vec<PlanRel>, ParseError> {
689        let nodes = self.tree.finish();
690        nodes
691            .into_iter()
692            .map(|n| self.build_plan_rel(extensions, registry, n))
693            .collect::<Result<Vec<PlanRel>, ParseError>>()
694    }
695}
696
697/// A parser for Substrait query plans in text format.
698///
699/// The `Parser` converts human-readable Substrait text format into Substrait
700/// protobuf plans. It handles both the extensions section (which defines
701/// functions, types, etc.) and the plan section (which defines the actual query
702/// structure).
703///
704/// ## Usage
705///
706/// The simplest entry point is the static `parse()` method:
707///
708/// ```rust
709/// use substrait_explain::Parser;
710///
711/// let plan_text = r#"
712/// === Plan
713/// Root[c, d]
714///   Project[$1, 42]
715///     Read[schema.table => a:i64, b:string?]
716/// "#;
717///
718/// let plan = Parser::parse(plan_text).unwrap();
719/// ```
720///
721/// ## Input Format
722///
723/// The parser expects input in the following format:
724///
725/// ```text
726/// === Extensions
727/// URNs:
728///   @  1: https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml
729/// Functions:
730///   # 10 @  1: add
731/// === Plan
732/// Root[columns]
733///   Relation[arguments => columns]
734///     ChildRelation[arguments => columns]
735/// ```
736///
737/// - **Extensions section** (optional): Defines URNs and function/type declarations
738/// - **Plan section** (required): Defines the query structure with indented relations
739///
740/// ## Error Handling
741///
742/// The parser provides detailed error information including:
743/// - Line number where the error occurred
744/// - The actual line content that failed to parse
745/// - Specific error type and description
746///
747/// ```rust
748/// use substrait_explain::Parser;
749///
750/// let invalid_plan = r#"
751/// === Plan
752/// InvalidRelation[invalid syntax]
753/// "#;
754///
755/// match Parser::parse(invalid_plan) {
756///     Ok(plan) => println!("Successfully parsed"),
757///     Err(e) => eprintln!("Parse error: {}", e),
758/// }
759/// ```
760///
761/// ## Supported Relations
762///
763/// The parser supports all standard Substrait relations:
764/// - `Read[table => columns]` - Read from a table
765/// - `Project[expressions]` - Project columns/expressions
766/// - `Filter[condition => columns]` - Filter rows
767/// - `Root[columns]` - Root relation with output columns
768/// - And more...
769///
770/// ## Extensions Support
771///
772/// The parser fully supports Substrait Simple Extensions, allowing you to:
773/// - Define custom functions with URNs and anchors
774/// - Reference functions by name in expressions
775/// - Use custom types and type variations
776///
777/// ```rust
778/// use substrait_explain::Parser;
779///
780/// let plan_with_extensions = r#"
781/// === Extensions
782/// URNs:
783///   @  1: https://example.com/functions.yaml
784/// Functions:
785///   ## 10 @  1: my_custom_function
786/// === Plan
787/// Root[result]
788///   Project[my_custom_function($0, $1):i32]
789///     Read[table => col1:i32, col2:i32]
790/// "#;
791///
792/// let plan = Parser::parse(plan_with_extensions).unwrap();
793/// ```
794///
795/// ## Performance
796///
797/// The parser is designed for efficiency:
798/// - Single-pass parsing with minimal allocations
799/// - Early error detection and reporting
800/// - Memory-efficient tree building
801///
802/// ## Thread Safety
803///
804/// `Parser` instances are not thread-safe and should not be shared between threads.
805/// However, the static `parse()` method is safe to call from multiple threads.
806#[derive(Debug)]
807pub struct Parser<'a> {
808    line_no: i64,
809    state: State,
810    extension_parser: ExtensionParser,
811    extension_registry: ExtensionRegistry,
812    relation_parser: RelationParser<'a>,
813}
814impl<'a> Default for Parser<'a> {
815    fn default() -> Self {
816        Self::new()
817    }
818}
819
820impl<'a> Parser<'a> {
821    /// Parse a Substrait plan from text format.
822    ///
823    /// This is the main entry point for parsing.
824    ///
825    /// The input should be in the Substrait text format, which consists of:
826    /// - An optional extensions section starting with "=== Extensions"
827    /// - A plan section starting with "=== Plan"
828    /// - Indented relation definitions
829    ///
830    /// # Examples
831    ///
832    /// Simple parsing:
833    /// ```rust
834    /// use substrait_explain::Parser;
835    ///
836    /// let plan_text = r#"
837    /// === Plan
838    /// Root[result]
839    ///   Read[table => col:i32]
840    /// "#;
841    ///
842    /// let plan = Parser::parse(plan_text).unwrap();
843    /// assert_eq!(plan.relations.len(), 1);
844    /// ```
845    ///
846    /// # Errors
847    ///
848    /// Returns a [`ParseError`] if the input cannot be parsed.
849    pub fn parse(input: &str) -> ParseResult {
850        Self::new().parse_plan(input)
851    }
852
853    /// Create a new parser with default configuration.
854    pub fn new() -> Self {
855        Self {
856            line_no: 1,
857            state: State::Initial,
858            extension_parser: ExtensionParser::default(),
859            extension_registry: ExtensionRegistry::new(),
860            relation_parser: RelationParser::default(),
861        }
862    }
863
864    /// Configure the parser to use the specified extension registry.
865    pub fn with_extension_registry(mut self, registry: ExtensionRegistry) -> Self {
866        self.extension_registry = registry;
867        self
868    }
869
870    /// Parse a Substrait plan with the current parser configuration.
871    pub fn parse_plan(mut self, input: &'a str) -> ParseResult {
872        for line in input.lines() {
873            if line.trim().is_empty() {
874                self.line_no += 1;
875                continue;
876            }
877
878            self.parse_line(line)?;
879            self.line_no += 1;
880        }
881
882        let plan = self.build_plan()?;
883        Ok(plan)
884    }
885
886    /// Parse a single line of input.
887    fn parse_line(&mut self, line: &'a str) -> Result<(), ParseError> {
888        let indented_line = IndentedLine::from(line);
889        let line_no = self.line_no;
890        let ctx = || ParseContext {
891            line_no,
892            line: line.to_string(),
893        };
894
895        match self.state {
896            State::Initial => self.parse_initial(indented_line),
897            State::Extensions => self
898                .parse_extensions(indented_line)
899                .map_err(|e| ParseError::Extension(ctx(), e)),
900            State::Plan => {
901                let IndentedLine(depth, line_str) = indented_line;
902
903                // Parse the line
904                let node = if depth == 0 {
905                    LineNode::parse_root(line_str, line_no)?
906                } else {
907                    LineNode::parse(line_str, line_no)?
908                };
909
910                self.relation_parser.tree.add_line(depth, node)
911            }
912        }
913    }
914
915    /// Parse the initial line(s) of the input, which is either a blank line or
916    /// the extensions or plan header.
917    fn parse_initial(&mut self, line: IndentedLine) -> Result<(), ParseError> {
918        match line {
919            IndentedLine(0, l) if l.trim().is_empty() => {}
920            IndentedLine(0, simple::EXTENSIONS_HEADER) => {
921                self.state = State::Extensions;
922            }
923            IndentedLine(0, PLAN_HEADER) => {
924                self.state = State::Plan;
925            }
926            IndentedLine(n, l) => {
927                return Err(ParseError::Initial(
928                    ParseContext::new(n as i64, l.to_string()),
929                    MessageParseError::invalid(
930                        "initial",
931                        pest::Span::new(l, 0, l.len()).expect("Invalid span?!"),
932                        format!("Unknown initial line: {l:?}"),
933                    ),
934                ));
935            }
936        }
937        Ok(())
938    }
939
940    /// Parse a single line from the extensions section of the input, updating
941    /// the parser state.
942    fn parse_extensions(&mut self, line: IndentedLine<'_>) -> Result<(), ExtensionParseError> {
943        if line == IndentedLine(0, PLAN_HEADER) {
944            self.state = State::Plan;
945            return Ok(());
946        }
947        self.extension_parser.parse_line(line)
948    }
949
950    /// Build the plan from the parser state with warning collection.
951    fn build_plan(self) -> Result<Plan, ParseError> {
952        let Parser {
953            relation_parser,
954            extension_parser,
955            extension_registry,
956            ..
957        } = self;
958
959        let extensions = extension_parser.extensions();
960
961        // Parse the tree into relations
962        let root_relations = relation_parser.build(extensions, &extension_registry)?;
963
964        // Build the final plan
965        Ok(Plan {
966            extension_urns: extensions.to_extension_urns(),
967            extensions: extensions.to_extension_declarations(),
968            relations: root_relations,
969            ..Default::default()
970        })
971    }
972}
973
#[cfg(test)]
mod tests {
    use substrait::proto::extensions::simple_extension_declaration::MappingType;
    use substrait::proto::rel::RelType;

    use super::*;
    use crate::extensions::simple::ExtensionKind;
    use crate::parser::extensions::ExpectedExtensionLine;

    /// Feed every line of `text` to a fresh [`Parser`] via `parse_line` and
    /// build the resulting plan, panicking on the first error.
    fn parse_lines(text: &str) -> Plan {
        let mut parser = Parser::default();
        for line in text.lines() {
            parser.parse_line(line).unwrap();
        }
        parser.build_plan().unwrap()
    }

    /// Feed every line of the trimmed `block` to a fresh [`ExtensionParser`],
    /// panicking with the offending line on the first error.
    fn parse_extension_block(block: &str) -> ExtensionParser {
        let mut parser = ExtensionParser::default();
        for line_str in block.trim().lines() {
            parser
                .parse_line(IndentedLine::from(line_str))
                .unwrap_or_else(|e| panic!("Failed to parse line '{line_str}': {e:?}"));
        }
        parser
    }

    /// A small extensions block parses into the expected extensions, formats
    /// back to the input text, and leaves the parser in the expected state.
    #[test]
    fn test_parse_basic_block() {
        let mut expected_extensions = SimpleExtensions::new();
        expected_extensions
            .add_extension_urn("/urn/common".to_string(), 1)
            .unwrap();
        expected_extensions
            .add_extension_urn("/urn/specific_funcs".to_string(), 2)
            .unwrap();
        expected_extensions
            .add_extension(ExtensionKind::Function, 1, 10, "func_a".to_string())
            .unwrap();
        expected_extensions
            .add_extension(ExtensionKind::Function, 2, 11, "func_b_special".to_string())
            .unwrap();
        expected_extensions
            .add_extension(ExtensionKind::Type, 1, 20, "SomeType".to_string())
            .unwrap();
        expected_extensions
            .add_extension(ExtensionKind::TypeVariation, 2, 30, "VarX".to_string())
            .unwrap();

        let input_block = r#"
URNs:
  @  1: /urn/common
  @  2: /urn/specific_funcs
Functions:
  # 10 @  1: func_a
  # 11 @  2: func_b_special
Types:
  # 20 @  1: SomeType
Type Variations:
  # 30 @  2: VarX
"#;

        let mut parser = parse_extension_block(input_block);

        assert_eq!(*parser.extensions(), expected_extensions);

        let extensions_str = parser.extensions().to_string("  ");
        // The writer adds the header; the ExtensionParser does not parse the
        // header, so we add it here for comparison.
        let expected_str = format!(
            "{}\n{}",
            simple::EXTENSIONS_HEADER,
            input_block.trim_start()
        );
        assert_eq!(extensions_str.trim(), expected_str.trim());
        // Check final state after all lines are processed.
        // The last significant line in input_block is a TypeVariation declaration.
        assert_eq!(
            parser.state(),
            ExpectedExtensionLine::ExtensionDeclarations(ExtensionKind::TypeVariation)
        );

        // Check that a subsequent blank line correctly resets state to Extensions.
        parser.parse_line(IndentedLine(0, "")).unwrap();
        assert_eq!(parser.state(), ExpectedExtensionLine::Extensions);
    }

    /// Test that we can parse a larger extensions block and it matches the input.
    #[test]
    fn test_parse_complete_extension_block() {
        let input_block = r#"
URNs:
  @  1: /urn/common
  @  2: /urn/specific_funcs
  @  3: /urn/types_lib
  @  4: /urn/variations_lib
Functions:
  # 10 @  1: func_a
  # 11 @  2: func_b_special
  # 12 @  1: func_c_common
Types:
  # 20 @  1: CommonType
  # 21 @  3: LibraryType
  # 22 @  1: AnotherCommonType
Type Variations:
  # 30 @  4: VarX
  # 31 @  4: VarY
"#;

        let parser = parse_extension_block(input_block);

        let extensions_str = parser.extensions().to_string("  ");
        // The writer adds the header; the ExtensionParser does not parse the
        // header, so we add it here for comparison.
        let expected_str = format!(
            "{}\n{}",
            simple::EXTENSIONS_HEADER,
            input_block.trim_start()
        );
        assert_eq!(extensions_str.trim(), expected_str.trim());
    }

    /// Indentation nests relations: Project -> Filter -> Read.
    #[test]
    fn test_parse_relation_tree() {
        // Example plan with a Project, a Filter, and a Read, nested by indentation
        let text = r#"=== Plan
Project[$0, $1, 42, 84]
  Filter[$2 => $0, $1]
    Read[my.table => a:i32, b:string?, c:boolean]
"#;
        // Parse line by line, then convert the tree to relations
        let plan = parse_lines(text);
        assert_eq!(plan.relations.len(), 1);

        let root_rel = &plan.relations[0].rel_type;
        let first_rel = match root_rel {
            Some(plan_rel::RelType::Rel(rel)) => rel,
            _ => panic!("Expected Rel type, got {root_rel:?}"),
        };
        // Root should be Project
        let project = match &first_rel.rel_type {
            Some(RelType::Project(p)) => p,
            other => panic!("Expected Project at root, got {other:?}"),
        };

        // Check that Project has Filter as input
        assert!(project.input.is_some());
        let filter_input = project.input.as_ref().unwrap();
        let filter = match &filter_input.rel_type {
            Some(RelType::Filter(filter)) => filter,
            other => panic!("Expected Filter relation, got {other:?}"),
        };

        // Check that Filter has Read as input
        assert!(filter.input.is_some());
        let read_input = filter.input.as_ref().unwrap();

        // The innermost relation should be the Read
        match &read_input.rel_type {
            Some(RelType::Read(_)) => {}
            other => panic!("Expected Read relation, got {other:?}"),
        }
    }

    /// A `Root[...]` line produces a root relation carrying the given names.
    #[test]
    fn test_parse_root_relation() {
        // Test a plan with a Root relation
        let text = r#"=== Plan
Root[result]
  Project[$0, $1]
    Read[my.table => a:i32, b:string?]
"#;
        let plan = parse_lines(text);

        // Check that we have exactly one relation
        assert_eq!(plan.relations.len(), 1);

        let root_rel = &plan.relations[0].rel_type;
        let rel_root = match root_rel {
            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
            other => panic!("Expected Root type, got {other:?}"),
        };

        // Check that the root has the correct name
        assert_eq!(rel_root.names, vec!["result"]);

        // Check that the root has a Project as input
        let project_input = match &rel_root.input {
            Some(rel) => rel,
            None => panic!("Root should have an input"),
        };

        let project = match &project_input.rel_type {
            Some(RelType::Project(p)) => p,
            other => panic!("Expected Project as root input, got {other:?}"),
        };

        // Check that Project has Read as input
        let read_input = match &project.input {
            Some(rel) => rel,
            None => panic!("Project should have an input"),
        };

        match &read_input.rel_type {
            Some(RelType::Read(_)) => {}
            other => panic!("Expected Read relation, got {other:?}"),
        }
    }

    /// A `Root[]` line produces a root relation with an empty name list.
    #[test]
    fn test_parse_root_relation_no_names() {
        // Test a plan with a Root relation with no names
        let text = r#"=== Plan
Root[]
  Project[$0, $1]
    Read[my.table => a:i32, b:string?]
"#;
        let plan = parse_lines(text);
        assert_eq!(plan.relations.len(), 1);

        let root_rel = &plan.relations[0].rel_type;
        let rel_root = match root_rel {
            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
            other => panic!("Expected Root type, got {other:?}"),
        };

        // Check that the root has no names
        assert_eq!(rel_root.names, Vec::<String>::new());
    }

    /// End-to-end: extensions section plus plan section through `Parser::parse`.
    #[test]
    fn test_parse_full_plan() {
        // Test a complete Substrait plan with extensions and relations
        let input = r#"
=== Extensions
URNs:
  @  1: /urn/common
  @  2: /urn/specific_funcs
Functions:
  # 10 @  1: func_a
  # 11 @  2: func_b_special
Types:
  # 20 @  1: SomeType
Type Variations:
  # 30 @  2: VarX

=== Plan
Project[$0, $1, 42, 84]
  Filter[$2 => $0, $1]
    Read[my.table => a:i32, b:string?, c:boolean]
"#;

        let plan = Parser::parse(input).unwrap();

        // Verify the plan structure
        assert_eq!(plan.extension_urns.len(), 2);
        assert_eq!(plan.extensions.len(), 4);
        assert_eq!(plan.relations.len(), 1);

        // Verify extension URNs
        let urn1 = &plan.extension_urns[0];
        assert_eq!(urn1.extension_urn_anchor, 1);
        assert_eq!(urn1.urn, "/urn/common");

        let urn2 = &plan.extension_urns[1];
        assert_eq!(urn2.extension_urn_anchor, 2);
        assert_eq!(urn2.urn, "/urn/specific_funcs");

        // Verify extensions
        let func1 = &plan.extensions[0];
        match &func1.mapping_type {
            Some(MappingType::ExtensionFunction(f)) => {
                assert_eq!(f.function_anchor, 10);
                assert_eq!(f.extension_urn_reference, 1);
                assert_eq!(f.name, "func_a");
            }
            other => panic!("Expected ExtensionFunction, got {other:?}"),
        }

        let func2 = &plan.extensions[1];
        match &func2.mapping_type {
            Some(MappingType::ExtensionFunction(f)) => {
                assert_eq!(f.function_anchor, 11);
                assert_eq!(f.extension_urn_reference, 2);
                assert_eq!(f.name, "func_b_special");
            }
            other => panic!("Expected ExtensionFunction, got {other:?}"),
        }

        let type1 = &plan.extensions[2];
        match &type1.mapping_type {
            Some(MappingType::ExtensionType(t)) => {
                assert_eq!(t.type_anchor, 20);
                assert_eq!(t.extension_urn_reference, 1);
                assert_eq!(t.name, "SomeType");
            }
            other => panic!("Expected ExtensionType, got {other:?}"),
        }

        let var1 = &plan.extensions[3];
        match &var1.mapping_type {
            Some(MappingType::ExtensionTypeVariation(v)) => {
                assert_eq!(v.type_variation_anchor, 30);
                assert_eq!(v.extension_urn_reference, 2);
                assert_eq!(v.name, "VarX");
            }
            other => panic!("Expected ExtensionTypeVariation, got {other:?}"),
        }

        // Verify the relation tree structure, one level at a time
        let root_rel = &plan.relations[0];
        let rel = match &root_rel.rel_type {
            Some(plan_rel::RelType::Rel(rel)) => rel,
            other => panic!("Expected Rel type, got {other:?}"),
        };

        // Verify Project relation
        let project = match &rel.rel_type {
            Some(RelType::Project(project)) => project,
            other => panic!("Expected Project relation, got {other:?}"),
        };
        assert_eq!(project.expressions.len(), 2); // 42 and 84
        assert!(project.input.is_some()); // Should have Filter as input

        // Check the Filter input
        let filter_input = project.input.as_ref().unwrap();
        let filter = match &filter_input.rel_type {
            Some(RelType::Filter(filter)) => filter,
            other => panic!("Expected Filter relation, got {other:?}"),
        };
        assert!(filter.input.is_some()); // Should have Read as input

        // Check the Read input
        let read_input = filter.input.as_ref().unwrap();
        let read = match &read_input.rel_type {
            Some(RelType::Read(read)) => read,
            other => panic!("Expected Read relation, got {other:?}"),
        };

        // Verify Read relation
        let schema = read.base_schema.as_ref().unwrap();
        assert_eq!(schema.names.len(), 3);
        assert_eq!(schema.names[0], "a");
        assert_eq!(schema.names[1], "b");
        assert_eq!(schema.names[2], "c");

        let struct_ = schema.r#struct.as_ref().unwrap();
        assert_eq!(struct_.types.len(), 3);
    }
}