substrait_explain/parser/
structural.rs

1//! Parser for the structural part of the Substrait file format.
2//!
3//! This is the overall parser for parsing the text format. It is responsible
4//! for tracking which section of the file we are currently parsing, and parsing
5//! each line separately.
6
7use std::fmt;
8
9use pest::iterators::Pair;
10use substrait::proto::extensions::AdvancedExtension;
11use substrait::proto::{
12    AggregateRel, FetchRel, FilterRel, JoinRel, Plan, PlanRel, ProjectRel, ReadRel, Rel, RelRoot,
13    SortRel, plan_rel,
14};
15
16use crate::extensions::{ExtensionRegistry, ExtensionType, SimpleExtensions, simple};
17use crate::parser::common::{MessageParseError, ParsePair};
18use crate::parser::errors::{ParseContext, ParseError, ParseResult};
19use crate::parser::expressions::Name;
20use crate::parser::extensions::{
21    AdvExtInvocation, ExtensionInvocation, ExtensionParseError, ExtensionParser,
22};
23use crate::parser::relations::{RelationParsingContext, VirtualReadRel};
24use crate::parser::{ErrorKind, ExpressionParser, RelationParsePair, Rule, unwrap_single_pair};
25
26pub const PLAN_HEADER: &str = "=== Plan";
27
/// A single input line split into its indentation depth and its content.
///
/// Field `0` is the number of complete two-space indents found at the start
/// of the line; field `1` is the line with those indents and any trailing
/// whitespace removed. A leftover odd leading space stays in the content.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndentedLine<'a>(pub usize, pub &'a str);

impl<'a> From<&'a str> for IndentedLine<'a> {
    fn from(line: &'a str) -> Self {
        let line = line.trim_end();

        // Only ASCII spaces count as indentation, so counting bytes is
        // equivalent to counting characters here.
        let leading_spaces = line.bytes().take_while(|&b| b == b' ').count();
        let indents = leading_spaces / 2;

        // `indents * 2 <= leading_spaces`, so the cut always lands inside the
        // run of ASCII spaces and therefore on a character boundary.
        IndentedLine(indents, &line[indents * 2..])
    }
}
52
/// An advanced-extension annotation (`+ Enh:Name[args]` or `+ Opt:Name[args]`)
/// that is attached to a relation node. The `pair` holds the `adv_extension`
/// grammar rule directly (already unwrapped from the outer `planNode`).
#[derive(Debug, Clone)]
pub struct AdvExt<'a> {
    /// The parsed pest pair for this addendum line.
    pub pair: Pair<'a, Rule>, // Rule::adv_extension
    /// Line number supplied when the line was parsed; used to build error
    /// contexts for this addendum.
    pub line_no: i64,
}
61
/// A relation node in the plan tree, before conversion to a Substrait proto.
#[derive(Debug, Clone)]
pub struct RelationNode<'a> {
    /// The parsed pest pair for the relation line itself.
    pub pair: Pair<'a, Rule>,
    /// Line number supplied when the line was parsed; used in error contexts.
    pub line_no: i64,
    /// Addenda (`+ Enh:` / `+ Opt:` lines) attached to this relation, in the
    /// order they were added.
    pub adv_extensions: Vec<AdvExt<'a>>,
    /// Child relations, in the order they were added.
    pub children: Vec<RelationNode<'a>>,
}
70
71impl<'a> RelationNode<'a> {
72    pub fn context(&self) -> ParseContext {
73        ParseContext {
74            line_no: self.line_no,
75            line: self.pair.as_str().to_string(),
76        }
77    }
78}
79
/// A parsed plan line: either a relation or an advanced extension (`+`-prefixed line).
///
/// Classification happens at construction time by inspecting the inner grammar
/// rule, so downstream code can use standard Rust pattern matching rather than
/// runtime rule inspection.
#[derive(Debug, Clone)]
pub enum LineNode<'a> {
    /// A relation line; created with no addenda and no children attached.
    Relation(RelationNode<'a>),
    /// An addendum line whose inner rule is `Rule::adv_extension`.
    AdvExt(AdvExt<'a>),
}
90
91impl<'a> LineNode<'a> {
92    pub fn parse(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
93        let mut pairs: pest::iterators::Pairs<'a, Rule> =
94            <ExpressionParser as pest::Parser<Rule>>::parse(Rule::planNode, line).map_err(|e| {
95                ParseError::Plan(
96                    ParseContext {
97                        line_no,
98                        line: line.to_string(),
99                    },
100                    MessageParseError::new("planNode", ErrorKind::InvalidValue, Box::new(e)),
101                )
102            })?;
103
104        let outer = pairs.next().unwrap();
105        assert!(pairs.next().is_none()); // Should be exactly one pair
106        let inner = unwrap_single_pair(outer);
107
108        Ok(match inner.as_rule() {
109            Rule::adv_extension => LineNode::AdvExt(AdvExt {
110                pair: inner,
111                line_no,
112            }),
113            _ => LineNode::Relation(RelationNode {
114                pair: inner,
115                line_no,
116                adv_extensions: Vec::new(),
117                children: Vec::new(),
118            }),
119        })
120    }
121
122    /// Parse the line as a top-level relation at depth 0 (either root_relation or regular relation)
123    pub fn parse_root(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
124        let mut pairs: pest::iterators::Pairs<'a, Rule> = <ExpressionParser as pest::Parser<
125            Rule,
126        >>::parse(
127            Rule::top_level_relation, line
128        )
129        .map_err(|e| {
130            ParseError::Plan(
131                ParseContext::new(line_no, line.to_string()),
132                MessageParseError::new("top_level_relation", ErrorKind::Syntax, Box::new(e)),
133            )
134        })?;
135
136        let outer = pairs.next().unwrap();
137        assert!(pairs.next().is_none());
138
139        // top_level_relation is either root_relation or planNode.
140        // If planNode, unwrap one more level to obtain the specific relation rule.
141        let inner = unwrap_single_pair(outer);
142        let pair = if inner.as_rule() == Rule::planNode {
143            unwrap_single_pair(inner)
144        } else {
145            inner // root_relation
146        };
147
148        // planNode can include addenda (+Enh:, +Opt:); surface them so
149        // TreeBuilder::add_line can produce the appropriate depth-0 error.
150        if pair.as_rule() == Rule::adv_extension {
151            return Ok(LineNode::AdvExt(AdvExt { pair, line_no }));
152        }
153
154        Ok(LineNode::Relation(RelationNode {
155            pair,
156            line_no,
157            adv_extensions: Vec::new(),
158            children: Vec::new(),
159        }))
160    }
161}
162
/// Which section of the input the parser is currently in.
#[derive(Copy, Clone, Debug)]
pub enum State {
    /// The initial state, before we have parsed any lines.
    Initial,
    /// The extensions section, after parsing the header and any other Extension lines.
    Extensions,
    /// The plan section, after parsing the header and any other Plan lines.
    Plan,
}

impl fmt::Display for State {
    /// Writes the variant name, the same text the derived `Debug` produces.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            State::Initial => "Initial",
            State::Extensions => "Extensions",
            State::Plan => "Plan",
        };
        f.write_str(name)
    }
}
178
/// An in-progress tree builder, building the tree of relations.
#[derive(Debug, Clone, Default)]
pub struct TreeBuilder<'a> {
    /// Current tree of nodes being built. These have been successfully parsed
    /// into Pest pairs, but have not yet been converted to substrait plans.
    current: Option<RelationNode<'a>>,
    /// Completed trees that have been built.
    completed: Vec<RelationNode<'a>>,
}
188
189impl<'a> TreeBuilder<'a> {
190    /// Traverse down the tree, always taking the last child at each level, until reaching the specified depth.
191    pub fn get_at_depth(&mut self, depth: usize) -> Option<&mut RelationNode<'a>> {
192        let mut node = self.current.as_mut()?;
193        for _ in 0..depth {
194            node = node.children.last_mut()?;
195        }
196        Some(node)
197    }
198
199    pub fn add_line(&mut self, depth: usize, node: LineNode<'a>) -> Result<(), ParseError> {
200        match node {
201            LineNode::Relation(rel_node) => {
202                if depth == 0 {
203                    if let Some(prev) = self.current.take() {
204                        self.completed.push(prev);
205                    }
206                    self.current = Some(rel_node);
207                    return Ok(());
208                }
209
210                let parent = match self.get_at_depth(depth - 1) {
211                    None => {
212                        return Err(ParseError::Plan(
213                            rel_node.context(),
214                            MessageParseError::invalid(
215                                "relation",
216                                rel_node.pair.as_span(),
217                                format!("No parent found for depth {depth}"),
218                            ),
219                        ));
220                    }
221                    Some(parent) => parent,
222                };
223
224                parent.children.push(rel_node);
225            }
226            LineNode::AdvExt(adv_ext) => {
227                let context = ParseContext::new(adv_ext.line_no, adv_ext.pair.as_str().to_string());
228                if depth == 0 {
229                    return Err(ParseError::ValidationError(
230                        context,
231                        "addenda (+ Enh: / + Opt:) cannot appear at the top level".to_string(),
232                    ));
233                }
234
235                let parent = match self.get_at_depth(depth - 1) {
236                    None => {
237                        return Err(ParseError::ValidationError(
238                            context,
239                            format!("no parent found for addendum at depth {depth}"),
240                        ));
241                    }
242                    Some(parent) => parent,
243                };
244
245                if !parent.children.is_empty() {
246                    return Err(ParseError::ValidationError(
247                        context,
248                        "addenda (+ Enh: / + Opt:) must appear before child relations, \
249                         not after"
250                            .to_string(),
251                    ));
252                }
253
254                parent.adv_extensions.push(adv_ext);
255            }
256        }
257        Ok(())
258    }
259
260    /// End of input - move any remaining nodes from stack to completed and
261    /// return any trees in progress. Resets the builder to its initial state
262    /// (empty)
263    /// Move any remaining nodes from stack to completed
264    pub fn finish(&mut self) -> Vec<RelationNode<'a>> {
265        if let Some(node) = self.current.take() {
266            self.completed.push(node);
267        }
268        std::mem::take(&mut self.completed)
269    }
270}
271
/// Intermediate state for relation parsing: the structural tree data
/// (children, addenda) has been processed into proto types, but the
/// relation's own grammar pair hasn't been parsed yet.
struct RelationContext<'a> {
    /// The relation's own grammar pair, still unparsed.
    pair: Pair<'a, Rule>,
    /// Line number of the relation, used for error contexts.
    line_no: i64,
    /// Child relations, already converted to protos.
    #[allow(clippy::vec_box)]
    children: Vec<Box<Rel>>,
    /// Sum of the counts returned while building each child.
    input_field_count: usize,
    /// Proto built from the node's addenda; `None` when it had none.
    advanced_extension: Option<AdvancedExtension>,
}
283
/// Relation parsing component - handles converting LineNodes to Relations
#[derive(Debug, Clone, Default)]
pub struct RelationParser<'a> {
    /// Accumulates parsed plan lines into relation trees until `build` runs.
    tree: TreeBuilder<'a>,
}
289
290impl<'a> RelationParser<'a> {
291    pub fn parse_line(&mut self, line: IndentedLine<'a>, line_no: i64) -> Result<(), ParseError> {
292        let IndentedLine(depth, line) = line;
293
294        // Use parse_root for depth 0 (top-level relations), parse for other depths
295        let node = if depth == 0 {
296            LineNode::parse_root(line, line_no)?
297        } else {
298            LineNode::parse(line, line_no)?
299        };
300
301        self.tree.add_line(depth, node)
302    }
303
304    /// Dispatch by grammar rule after validating addenda constraints.
305    /// Standard relations go through [`parse_rel`](Self::parse_rel);
306    /// extension relations go through
307    /// [`parse_extension_relation`](Self::parse_extension_relation).
308    fn parse_relation(
309        &self,
310        extensions: &SimpleExtensions,
311        registry: &ExtensionRegistry,
312        ctx: RelationContext,
313    ) -> Result<(Rel, usize), ParseError> {
314        match ctx.pair.as_rule() {
315            Rule::virtual_read_relation => self.parse_rel::<VirtualReadRel>(extensions, ctx),
316            Rule::read_relation => self.parse_rel::<ReadRel>(extensions, ctx),
317            Rule::filter_relation => self.parse_rel::<FilterRel>(extensions, ctx),
318            Rule::project_relation => self.parse_rel::<ProjectRel>(extensions, ctx),
319            Rule::aggregate_relation => self.parse_rel::<AggregateRel>(extensions, ctx),
320            Rule::sort_relation => self.parse_rel::<SortRel>(extensions, ctx),
321            Rule::fetch_relation => self.parse_rel::<FetchRel>(extensions, ctx),
322            Rule::join_relation => self.parse_rel::<JoinRel>(extensions, ctx),
323            Rule::extension_relation => self.parse_extension_relation(extensions, registry, ctx),
324            _ => unreachable!("unhandled relation rule: {:?}", ctx.pair.as_rule()),
325        }
326    }
327
328    /// Generic bridge between [`parse_relation`](Self::parse_relation) and
329    /// the [`RelationParsePair`] trait: wraps `MessageParseError` with line
330    /// context and calls [`into_rel`](RelationParsePair::into_rel) to apply
331    /// addenda and produce the final [`Rel`].
332    fn parse_rel<T: RelationParsePair>(
333        &self,
334        extensions: &SimpleExtensions,
335        ctx: RelationContext,
336    ) -> Result<(Rel, usize), ParseError> {
337        assert_eq!(ctx.pair.as_rule(), T::rule());
338        let line_no = ctx.line_no;
339        let line = ctx.pair.as_str();
340
341        match T::parse_pair_with_context(extensions, ctx.pair, ctx.children, ctx.input_field_count)
342        {
343            Ok((parsed, count)) => Ok((parsed.into_rel(ctx.advanced_extension), count)),
344            Err(e) => Err(ParseError::Plan(
345                ParseContext::new(line_no, line.to_string()),
346                e,
347            )),
348        }
349    }
350
351    /// Handle extension relations separately from [`parse_rel`](Self::parse_rel)
352    /// because they need registry lookups that [`RelationParsePair`] doesn't
353    /// support.
354    fn parse_extension_relation(
355        &self,
356        extensions: &SimpleExtensions,
357        registry: &ExtensionRegistry,
358        ctx: RelationContext,
359    ) -> Result<(Rel, usize), ParseError> {
360        assert_eq!(ctx.pair.as_rule(), Rule::extension_relation);
361        let line_no = ctx.line_no;
362        let line = ctx.pair.as_str();
363        let pair_span = ctx.pair.as_span();
364
365        let ExtensionInvocation {
366            name,
367            args: extension_args,
368        } = ExtensionInvocation::parse_pair(ctx.pair);
369
370        let child_count = ctx.children.len();
371        extension_args
372            .relation_type
373            .validate_child_count(child_count)
374            .map_err(|e| {
375                ParseError::Plan(
376                    ParseContext::new(line_no, line.to_string()),
377                    MessageParseError::invalid("extension_relation", pair_span, e),
378                )
379            })?;
380
381        let context = RelationParsingContext {
382            extensions,
383            registry,
384            line_no,
385            line,
386        };
387
388        let detail = context.resolve_extension_detail(&name, &extension_args)?;
389        let output_column_count = extension_args.output_columns.len();
390
391        let rel = extension_args
392            .relation_type
393            .create_rel(detail, ctx.children)
394            .map_err(|e| {
395                ParseError::Plan(
396                    ParseContext::new(line_no, line.to_string()),
397                    MessageParseError::invalid("extension_relation", pair_span, e),
398                )
399            })?;
400
401        if ctx.advanced_extension.is_some() {
402            return Err(ParseError::ValidationError(
403                ParseContext::new(line_no, line.to_string()),
404                "extension relations do not support advanced extensions (+ Enh / + Opt)"
405                    .to_string(),
406            ));
407        }
408        Ok((rel, output_column_count))
409    }
410
411    /// Walk the relation tree depth-first, converting structural types
412    /// (children, addenda) into proto types via [`RelationContext`].
413    /// Delegates grammar-rule-specific work to
414    /// [`parse_relation`](Self::parse_relation).
415    fn build_rel(
416        &self,
417        extensions: &SimpleExtensions,
418        registry: &ExtensionRegistry,
419        node: RelationNode,
420    ) -> Result<(Rel, usize), ParseError> {
421        let mut children: Vec<Box<Rel>> = Vec::new();
422        let mut input_field_count: usize = 0;
423        for child in node.children {
424            let (rel, count) = self.build_rel(extensions, registry, child)?;
425            input_field_count += count;
426            children.push(Box::new(rel));
427        }
428
429        let advanced_extension = if node.adv_extensions.is_empty() {
430            None
431        } else {
432            Some(self.build_advanced_extension(extensions, registry, node.adv_extensions)?)
433        };
434
435        self.parse_relation(
436            extensions,
437            registry,
438            RelationContext {
439                pair: node.pair,
440                line_no: node.line_no,
441                children,
442                input_field_count,
443                advanced_extension,
444            },
445        )
446    }
447
448    /// Parse a list of [`AdvExt`] nodes into an [`AdvancedExtension`] proto.
449    fn build_advanced_extension(
450        &self,
451        extensions: &SimpleExtensions,
452        registry: &ExtensionRegistry,
453        adv_exts: Vec<AdvExt>,
454    ) -> Result<AdvancedExtension, ParseError> {
455        let mut enhancement = None;
456        let mut optimizations = Vec::new();
457
458        for adv_ext in adv_exts {
459            let line_no = adv_ext.line_no;
460            let line = adv_ext.pair.as_str().to_string();
461            let invocation = AdvExtInvocation::parse_pair(adv_ext.pair);
462            let context = RelationParsingContext {
463                extensions,
464                registry,
465                line_no,
466                line: &line,
467            };
468
469            match invocation.ext_type {
470                ExtensionType::Enhancement => {
471                    let detail = context.resolve_adv_ext_detail(
472                        ExtensionType::Enhancement,
473                        &invocation.name,
474                        &invocation.args,
475                    )?;
476                    if enhancement.is_some() {
477                        return Err(ParseError::ValidationError(
478                            ParseContext::new(line_no, line.clone()),
479                            "at most one enhancement per relation is allowed".to_string(),
480                        ));
481                    }
482                    enhancement = Some(detail.into());
483                }
484                ExtensionType::Optimization => {
485                    let detail = context.resolve_adv_ext_detail(
486                        ExtensionType::Optimization,
487                        &invocation.name,
488                        &invocation.args,
489                    )?;
490                    optimizations.push(detail.into());
491                }
492                ExtensionType::Relation => {
493                    unreachable!("Grammar restricts adv_ext_type to 'Enh' or 'Opt'")
494                }
495            }
496        }
497
498        Ok(AdvancedExtension {
499            enhancement,
500            optimization: optimizations,
501        })
502    }
503
504    /// Build a tree of relations.
505    fn build_plan_rel(
506        &self,
507        extensions: &SimpleExtensions,
508        registry: &ExtensionRegistry,
509        node: RelationNode,
510    ) -> Result<PlanRel, ParseError> {
511        // Plain relations are allowed as root relations; they just don't have names.
512        if node.pair.as_rule() != Rule::root_relation {
513            let (rel, _) = self.build_rel(extensions, registry, node)?;
514            return Ok(PlanRel {
515                rel_type: Some(plan_rel::RelType::Rel(rel)),
516            });
517        }
518
519        // Root relations don't support addenda — reject rather than silently discard.
520        if !node.adv_extensions.is_empty() {
521            let first = &node.adv_extensions[0];
522            let context = ParseContext::new(first.line_no, first.pair.as_str().to_string());
523            return Err(ParseError::ValidationError(
524                context,
525                "addenda (+ Enh: / + Opt:) are not supported on Root relations".to_string(),
526            ));
527        }
528
529        // Named root relation.
530        let context = node.context();
531        let span = node.pair.as_span();
532
533        // Parse the column names
534        let column_names_pair = unwrap_single_pair(node.pair);
535        assert_eq!(column_names_pair.as_rule(), Rule::root_name_list);
536
537        let names: Vec<String> = column_names_pair
538            .into_inner()
539            .map(|name_pair| {
540                assert_eq!(name_pair.as_rule(), Rule::name);
541                Name::parse_pair(name_pair).0
542            })
543            .collect();
544
545        let mut children = node.children;
546        let child = match children.len() {
547            1 => {
548                let (rel, _) = self.build_rel(extensions, registry, children.pop().unwrap())?;
549                rel
550            }
551            n => {
552                return Err(ParseError::Plan(
553                    context,
554                    MessageParseError::invalid(
555                        "root_relation",
556                        span,
557                        format!("Root relation must have exactly one child, found {n}"),
558                    ),
559                ));
560            }
561        };
562
563        Ok(PlanRel {
564            rel_type: Some(plan_rel::RelType::Root(RelRoot {
565                names,
566                input: Some(child),
567            })),
568        })
569    }
570
571    /// Build all the trees.
572    fn build(
573        mut self,
574        extensions: &SimpleExtensions,
575        registry: &ExtensionRegistry,
576    ) -> Result<Vec<PlanRel>, ParseError> {
577        let nodes = self.tree.finish();
578        nodes
579            .into_iter()
580            .map(|n| self.build_plan_rel(extensions, registry, n))
581            .collect::<Result<Vec<PlanRel>, ParseError>>()
582    }
583}
584
585/// A parser for Substrait query plans in text format.
586///
587/// The `Parser` converts human-readable Substrait text format into Substrait
588/// protobuf plans. It handles both the extensions section (which defines
589/// functions, types, etc.) and the plan section (which defines the actual query
590/// structure).
591///
592/// ## Usage
593///
594/// The simplest entry point is the static `parse()` method:
595///
596/// ```rust
597/// use substrait_explain::parser::Parser;
598///
599/// let plan_text = r#"
600/// === Plan
601/// Root[c, d]
602///   Project[$1, 42]
603///     Read[schema.table => a:i64, b:string?]
604/// "#;
605///
606/// let plan = Parser::parse(plan_text).unwrap();
607/// ```
608///
609/// ## Input Format
610///
611/// The parser expects input in the following format:
612///
613/// ```text
614/// === Extensions
615/// URNs:
616///   @  1: https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml
617/// Functions:
618///   # 10 @  1: add
619/// === Plan
620/// Root[columns]
621///   Relation[arguments => columns]
622///     ChildRelation[arguments => columns]
623/// ```
624///
625/// - **Extensions section** (optional): Defines URNs and function/type declarations
626/// - **Plan section** (required): Defines the query structure with indented relations
627///
628/// ## Error Handling
629///
630/// The parser provides detailed error information including:
631/// - Line number where the error occurred
632/// - The actual line content that failed to parse
633/// - Specific error type and description
634///
635/// ```rust
636/// use substrait_explain::parser::Parser;
637///
638/// let invalid_plan = r#"
639/// === Plan
640/// InvalidRelation[invalid syntax]
641/// "#;
642///
643/// match Parser::parse(invalid_plan) {
644///     Ok(plan) => println!("Successfully parsed"),
645///     Err(e) => eprintln!("Parse error: {}", e),
646/// }
647/// ```
648///
649/// ## Supported Relations
650///
651/// The parser supports all standard Substrait relations:
652/// - `Read[table => columns]` - Read from a table
653/// - `Project[expressions]` - Project columns/expressions
654/// - `Filter[condition => columns]` - Filter rows
655/// - `Root[columns]` - Root relation with output columns
656/// - And more...
657///
658/// ## Extensions Support
659///
660/// The parser fully supports Substrait Simple Extensions, allowing you to:
661/// - Define custom functions with URNs and anchors
662/// - Reference functions by name in expressions
663/// - Use custom types and type variations
664///
665/// ```rust
666/// use substrait_explain::parser::Parser;
667///
668/// let plan_with_extensions = r#"
669/// === Extensions
670/// URNs:
671///   @  1: https://example.com/functions.yaml
672/// Functions:
673///   ## 10 @  1: my_custom_function
674/// === Plan
675/// Root[result]
676///   Project[my_custom_function($0, $1)]
677///     Read[table => col1:i32, col2:i32]
678/// "#;
679///
680/// let plan = Parser::parse(plan_with_extensions).unwrap();
681/// ```
682///
683/// ## Performance
684///
685/// The parser is designed for efficiency:
686/// - Single-pass parsing with minimal allocations
687/// - Early error detection and reporting
688/// - Memory-efficient tree building
689///
690/// ## Thread Safety
691///
692/// `Parser` instances are not thread-safe and should not be shared between threads.
693/// However, the static `parse()` method is safe to call from multiple threads.
#[derive(Debug)]
pub struct Parser<'a> {
    /// Number of the input line currently being processed; starts at 1 and
    /// advances once per input line, blank lines included.
    line_no: i64,
    /// Section of the input the parser is currently in.
    state: State,
    /// Receives every line of the extensions section.
    extension_parser: ExtensionParser,
    /// Registry handed to relation building when the plan is assembled.
    extension_registry: ExtensionRegistry,
    /// Receives the lines of the plan section.
    relation_parser: RelationParser<'a>,
}
impl<'a> Default for Parser<'a> {
    /// Equivalent to [`Parser::new`].
    fn default() -> Self {
        Self::new()
    }
}
707
708impl<'a> Parser<'a> {
709    /// Parse a Substrait plan from text format.
710    ///
711    /// This is the main entry point for parsing.
712    ///
713    /// The input should be in the Substrait text format, which consists of:
714    /// - An optional extensions section starting with "=== Extensions"
715    /// - A plan section starting with "=== Plan"
716    /// - Indented relation definitions
717    ///
718    /// # Examples
719    ///
720    /// Simple parsing:
721    /// ```rust
722    /// use substrait_explain::parser::Parser;
723    ///
724    /// let plan_text = r#"
725    /// === Plan
726    /// Root[result]
727    ///   Read[table => col:i32]
728    /// "#;
729    ///
730    /// let plan = Parser::parse(plan_text).unwrap();
731    /// assert_eq!(plan.relations.len(), 1);
732    /// ```
733    ///
734    /// # Errors
735    ///
736    /// Returns a [`ParseError`] if the input cannot be parsed.
737    pub fn parse(input: &str) -> ParseResult {
738        Self::new().parse_plan(input)
739    }
740
741    /// Create a new parser with default configuration.
742    pub fn new() -> Self {
743        Self {
744            line_no: 1,
745            state: State::Initial,
746            extension_parser: ExtensionParser::default(),
747            extension_registry: ExtensionRegistry::new(),
748            relation_parser: RelationParser::default(),
749        }
750    }
751
752    /// Configure the parser to use the specified extension registry.
753    pub fn with_extension_registry(mut self, registry: ExtensionRegistry) -> Self {
754        self.extension_registry = registry;
755        self
756    }
757
758    /// Parse a Substrait plan with the current parser configuration.
759    pub fn parse_plan(mut self, input: &'a str) -> ParseResult {
760        for line in input.lines() {
761            if line.trim().is_empty() {
762                self.line_no += 1;
763                continue;
764            }
765
766            self.parse_line(line)?;
767            self.line_no += 1;
768        }
769
770        let plan = self.build_plan()?;
771        Ok(plan)
772    }
773
774    /// Parse a single line of input.
775    fn parse_line(&mut self, line: &'a str) -> Result<(), ParseError> {
776        let indented_line = IndentedLine::from(line);
777        let line_no = self.line_no;
778        let ctx = || ParseContext {
779            line_no,
780            line: line.to_string(),
781        };
782
783        match self.state {
784            State::Initial => self.parse_initial(indented_line),
785            State::Extensions => self
786                .parse_extensions(indented_line)
787                .map_err(|e| ParseError::Extension(ctx(), e)),
788            State::Plan => {
789                let IndentedLine(depth, line_str) = indented_line;
790
791                // Parse the line
792                let node = if depth == 0 {
793                    LineNode::parse_root(line_str, line_no)?
794                } else {
795                    LineNode::parse(line_str, line_no)?
796                };
797
798                self.relation_parser.tree.add_line(depth, node)
799            }
800        }
801    }
802
803    /// Parse the initial line(s) of the input, which is either a blank line or
804    /// the extensions or plan header.
805    fn parse_initial(&mut self, line: IndentedLine) -> Result<(), ParseError> {
806        match line {
807            IndentedLine(0, l) if l.trim().is_empty() => {}
808            IndentedLine(0, simple::EXTENSIONS_HEADER) => {
809                self.state = State::Extensions;
810            }
811            IndentedLine(0, PLAN_HEADER) => {
812                self.state = State::Plan;
813            }
814            IndentedLine(n, l) => {
815                return Err(ParseError::Initial(
816                    ParseContext::new(n as i64, l.to_string()),
817                    MessageParseError::invalid(
818                        "initial",
819                        pest::Span::new(l, 0, l.len()).expect("Invalid span?!"),
820                        format!("Unknown initial line: {l:?}"),
821                    ),
822                ));
823            }
824        }
825        Ok(())
826    }
827
828    /// Parse a single line from the extensions section of the input, updating
829    /// the parser state.
830    fn parse_extensions(&mut self, line: IndentedLine<'_>) -> Result<(), ExtensionParseError> {
831        if line == IndentedLine(0, PLAN_HEADER) {
832            self.state = State::Plan;
833            return Ok(());
834        }
835        self.extension_parser.parse_line(line)
836    }
837
838    /// Build the plan from the parser state with warning collection.
839    fn build_plan(self) -> Result<Plan, ParseError> {
840        let Parser {
841            relation_parser,
842            extension_parser,
843            extension_registry,
844            ..
845        } = self;
846
847        let extensions = extension_parser.extensions();
848
849        // Parse the tree into relations
850        let root_relations = relation_parser.build(extensions, &extension_registry)?;
851
852        // Build the final plan
853        Ok(Plan {
854            extension_urns: extensions.to_extension_urns(),
855            extensions: extensions.to_extension_declarations(),
856            relations: root_relations,
857            ..Default::default()
858        })
859    }
860}
861
862#[cfg(test)]
863mod tests {
864    use substrait::proto::extensions::simple_extension_declaration::MappingType;
865    use substrait::proto::rel::RelType;
866
867    use super::*;
868    use crate::extensions::simple::ExtensionKind;
869    use crate::parser::extensions::ExtensionParserState;
870
871    #[test]
872    fn test_parse_basic_block() {
873        let mut expected_extensions = SimpleExtensions::new();
874        expected_extensions
875            .add_extension_urn("/urn/common".to_string(), 1)
876            .unwrap();
877        expected_extensions
878            .add_extension_urn("/urn/specific_funcs".to_string(), 2)
879            .unwrap();
880        expected_extensions
881            .add_extension(ExtensionKind::Function, 1, 10, "func_a".to_string())
882            .unwrap();
883        expected_extensions
884            .add_extension(ExtensionKind::Function, 2, 11, "func_b_special".to_string())
885            .unwrap();
886        expected_extensions
887            .add_extension(ExtensionKind::Type, 1, 20, "SomeType".to_string())
888            .unwrap();
889        expected_extensions
890            .add_extension(ExtensionKind::TypeVariation, 2, 30, "VarX".to_string())
891            .unwrap();
892
893        let mut parser = ExtensionParser::default();
894        let input_block = r#"
895URNs:
896  @  1: /urn/common
897  @  2: /urn/specific_funcs
898Functions:
899  # 10 @  1: func_a
900  # 11 @  2: func_b_special
901Types:
902  # 20 @  1: SomeType
903Type Variations:
904  # 30 @  2: VarX
905"#;
906
907        for line_str in input_block.trim().lines() {
908            parser
909                .parse_line(IndentedLine::from(line_str))
910                .unwrap_or_else(|e| panic!("Failed to parse line \'{line_str}\': {e:?}"));
911        }
912
913        assert_eq!(*parser.extensions(), expected_extensions);
914
915        let extensions_str = parser.extensions().to_string("  ");
916        // The writer adds the header; the ExtensionParser does not parse the
917        // header, so we add it here for comparison.
918        let expected_str = format!(
919            "{}\n{}",
920            simple::EXTENSIONS_HEADER,
921            input_block.trim_start()
922        );
923        assert_eq!(extensions_str.trim(), expected_str.trim());
924        // Check final state after all lines are processed.
925        // The last significant line in input_block is a TypeVariation declaration.
926        assert_eq!(
927            parser.state(),
928            ExtensionParserState::ExtensionDeclarations(ExtensionKind::TypeVariation)
929        );
930
931        // Check that a subsequent blank line correctly resets state to Extensions.
932        parser.parse_line(IndentedLine(0, "")).unwrap();
933        assert_eq!(parser.state(), ExtensionParserState::Extensions);
934    }
935
936    /// Test that we can parse a larger extensions block and it matches the input.
937    #[test]
938    fn test_parse_complete_extension_block() {
939        let mut parser = ExtensionParser::default();
940        let input_block = r#"
941URNs:
942  @  1: /urn/common
943  @  2: /urn/specific_funcs
944  @  3: /urn/types_lib
945  @  4: /urn/variations_lib
946Functions:
947  # 10 @  1: func_a
948  # 11 @  2: func_b_special
949  # 12 @  1: func_c_common
950Types:
951  # 20 @  1: CommonType
952  # 21 @  3: LibraryType
953  # 22 @  1: AnotherCommonType
954Type Variations:
955  # 30 @  4: VarX
956  # 31 @  4: VarY
957"#;
958
959        for line_str in input_block.trim().lines() {
960            parser
961                .parse_line(IndentedLine::from(line_str))
962                .unwrap_or_else(|e| panic!("Failed to parse line \'{line_str}\': {e:?}"));
963        }
964
965        let extensions_str = parser.extensions().to_string("  ");
966        // The writer adds the header; the ExtensionParser does not parse the
967        // header, so we add it here for comparison.
968        let expected_str = format!(
969            "{}\n{}",
970            simple::EXTENSIONS_HEADER,
971            input_block.trim_start()
972        );
973        assert_eq!(extensions_str.trim(), expected_str.trim());
974    }
975
976    #[test]
977    fn test_parse_relation_tree() {
978        // Example plan with a Project, a Filter, and a Read, nested by indentation
979        let plan = r#"=== Plan
980Project[$0, $1, 42, 84]
981  Filter[$2 => $0, $1]
982    Read[my.table => a:i32, b:string?, c:boolean]
983"#;
984        let mut parser = Parser::default();
985        for line in plan.lines() {
986            parser.parse_line(line).unwrap();
987        }
988
989        // Complete the current tree to convert it to relations
990        let plan = parser.build_plan().unwrap();
991
992        let root_rel = &plan.relations[0].rel_type;
993        let first_rel = match root_rel {
994            Some(plan_rel::RelType::Rel(rel)) => rel,
995            _ => panic!("Expected Rel type, got {root_rel:?}"),
996        };
997        // Root should be Project
998        let project = match &first_rel.rel_type {
999            Some(RelType::Project(p)) => p,
1000            other => panic!("Expected Project at root, got {other:?}"),
1001        };
1002
1003        // Check that Project has Filter as input
1004        assert!(project.input.is_some());
1005        let filter_input = project.input.as_ref().unwrap();
1006
1007        // Check that Filter has Read as input
1008        match &filter_input.rel_type {
1009            Some(RelType::Filter(_)) => {
1010                match &filter_input.rel_type {
1011                    Some(RelType::Filter(filter)) => {
1012                        assert!(filter.input.is_some());
1013                        let read_input = filter.input.as_ref().unwrap();
1014
1015                        // Check that Read has no input (it's a leaf)
1016                        match &read_input.rel_type {
1017                            Some(RelType::Read(_)) => {}
1018                            other => panic!("Expected Read relation, got {other:?}"),
1019                        }
1020                    }
1021                    other => panic!("Expected Filter relation, got {other:?}"),
1022                }
1023            }
1024            other => panic!("Expected Filter relation, got {other:?}"),
1025        }
1026    }
1027
1028    #[test]
1029    fn test_parse_root_relation() {
1030        // Test a plan with a Root relation
1031        let plan = r#"=== Plan
1032Root[result]
1033  Project[$0, $1]
1034    Read[my.table => a:i32, b:string?]
1035"#;
1036        let mut parser = Parser::default();
1037        for line in plan.lines() {
1038            parser.parse_line(line).unwrap();
1039        }
1040
1041        let plan = parser.build_plan().unwrap();
1042
1043        // Check that we have exactly one relation
1044        assert_eq!(plan.relations.len(), 1);
1045
1046        let root_rel = &plan.relations[0].rel_type;
1047        let rel_root = match root_rel {
1048            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
1049            other => panic!("Expected Root type, got {other:?}"),
1050        };
1051
1052        // Check that the root has the correct name
1053        assert_eq!(rel_root.names, vec!["result"]);
1054
1055        // Check that the root has a Project as input
1056        let project_input = match &rel_root.input {
1057            Some(rel) => rel,
1058            None => panic!("Root should have an input"),
1059        };
1060
1061        let project = match &project_input.rel_type {
1062            Some(RelType::Project(p)) => p,
1063            other => panic!("Expected Project as root input, got {other:?}"),
1064        };
1065
1066        // Check that Project has Read as input
1067        let read_input = match &project.input {
1068            Some(rel) => rel,
1069            None => panic!("Project should have an input"),
1070        };
1071
1072        match &read_input.rel_type {
1073            Some(RelType::Read(_)) => {}
1074            other => panic!("Expected Read relation, got {other:?}"),
1075        }
1076    }
1077
1078    #[test]
1079    fn test_parse_root_relation_no_names() {
1080        // Test a plan with a Root relation with no names
1081        let plan = r#"=== Plan
1082Root[]
1083  Project[$0, $1]
1084    Read[my.table => a:i32, b:string?]
1085"#;
1086        let mut parser = Parser::default();
1087        for line in plan.lines() {
1088            parser.parse_line(line).unwrap();
1089        }
1090
1091        let plan = parser.build_plan().unwrap();
1092
1093        let root_rel = &plan.relations[0].rel_type;
1094        let rel_root = match root_rel {
1095            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
1096            other => panic!("Expected Root type, got {other:?}"),
1097        };
1098
1099        // Check that the root has no names
1100        assert_eq!(rel_root.names, Vec::<String>::new());
1101    }
1102
1103    #[test]
1104    fn test_parse_full_plan() {
1105        // Test a complete Substrait plan with extensions and relations
1106        let input = r#"
1107=== Extensions
1108URNs:
1109  @  1: /urn/common
1110  @  2: /urn/specific_funcs
1111Functions:
1112  # 10 @  1: func_a
1113  # 11 @  2: func_b_special
1114Types:
1115  # 20 @  1: SomeType
1116Type Variations:
1117  # 30 @  2: VarX
1118
1119=== Plan
1120Project[$0, $1, 42, 84]
1121  Filter[$2 => $0, $1]
1122    Read[my.table => a:i32, b:string?, c:boolean]
1123"#;
1124
1125        let plan = Parser::parse(input).unwrap();
1126
1127        // Verify the plan structure
1128        assert_eq!(plan.extension_urns.len(), 2);
1129        assert_eq!(plan.extensions.len(), 4);
1130        assert_eq!(plan.relations.len(), 1);
1131
1132        // Verify extension URIs
1133        let urn1 = &plan.extension_urns[0];
1134        assert_eq!(urn1.extension_urn_anchor, 1);
1135        assert_eq!(urn1.urn, "/urn/common");
1136
1137        let urn2 = &plan.extension_urns[1];
1138        assert_eq!(urn2.extension_urn_anchor, 2);
1139        assert_eq!(urn2.urn, "/urn/specific_funcs");
1140
1141        // Verify extensions
1142        let func1 = &plan.extensions[0];
1143        match &func1.mapping_type {
1144            Some(MappingType::ExtensionFunction(f)) => {
1145                assert_eq!(f.function_anchor, 10);
1146                assert_eq!(f.extension_urn_reference, 1);
1147                assert_eq!(f.name, "func_a");
1148            }
1149            other => panic!("Expected ExtensionFunction, got {other:?}"),
1150        }
1151
1152        let func2 = &plan.extensions[1];
1153        match &func2.mapping_type {
1154            Some(MappingType::ExtensionFunction(f)) => {
1155                assert_eq!(f.function_anchor, 11);
1156                assert_eq!(f.extension_urn_reference, 2);
1157                assert_eq!(f.name, "func_b_special");
1158            }
1159            other => panic!("Expected ExtensionFunction, got {other:?}"),
1160        }
1161
1162        let type1 = &plan.extensions[2];
1163        match &type1.mapping_type {
1164            Some(MappingType::ExtensionType(t)) => {
1165                assert_eq!(t.type_anchor, 20);
1166                assert_eq!(t.extension_urn_reference, 1);
1167                assert_eq!(t.name, "SomeType");
1168            }
1169            other => panic!("Expected ExtensionType, got {other:?}"),
1170        }
1171
1172        let var1 = &plan.extensions[3];
1173        match &var1.mapping_type {
1174            Some(MappingType::ExtensionTypeVariation(v)) => {
1175                assert_eq!(v.type_variation_anchor, 30);
1176                assert_eq!(v.extension_urn_reference, 2);
1177                assert_eq!(v.name, "VarX");
1178            }
1179            other => panic!("Expected ExtensionTypeVariation, got {other:?}"),
1180        }
1181
1182        // Verify the relation tree structure
1183        let root_rel = &plan.relations[0];
1184        match &root_rel.rel_type {
1185            Some(plan_rel::RelType::Rel(rel)) => {
1186                match &rel.rel_type {
1187                    Some(RelType::Project(project)) => {
1188                        // Verify Project relation
1189                        assert_eq!(project.expressions.len(), 2); // 42 and 84
1190                        assert!(project.input.is_some()); // Should have Filter as input
1191
1192                        // Check the Filter input
1193                        let filter_input = project.input.as_ref().unwrap();
1194                        match &filter_input.rel_type {
1195                            Some(RelType::Filter(filter)) => {
1196                                assert!(filter.input.is_some()); // Should have Read as input
1197
1198                                // Check the Read input
1199                                let read_input = filter.input.as_ref().unwrap();
1200                                match &read_input.rel_type {
1201                                    Some(RelType::Read(read)) => {
1202                                        // Verify Read relation
1203                                        let schema = read.base_schema.as_ref().unwrap();
1204                                        assert_eq!(schema.names.len(), 3);
1205                                        assert_eq!(schema.names[0], "a");
1206                                        assert_eq!(schema.names[1], "b");
1207                                        assert_eq!(schema.names[2], "c");
1208
1209                                        let struct_ = schema.r#struct.as_ref().unwrap();
1210                                        assert_eq!(struct_.types.len(), 3);
1211                                    }
1212                                    other => panic!("Expected Read relation, got {other:?}"),
1213                                }
1214                            }
1215                            other => panic!("Expected Filter relation, got {other:?}"),
1216                        }
1217                    }
1218                    other => panic!("Expected Project relation, got {other:?}"),
1219                }
1220            }
1221            other => panic!("Expected Rel type, got {other:?}"),
1222        }
1223    }
1224}