substrait_explain/parser/
structural.rs

1//! Parser for the structural part of the Substrait file format.
2//!
3//! This is the overall parser for parsing the text format. It is responsible
4//! for tracking which section of the file we are currently parsing, and parsing
5//! each line separately.
6
7use std::fmt;
8
9use substrait::proto::extensions::AdvancedExtension;
10use substrait::proto::rel::RelType;
11use substrait::proto::rel_common::EmitKind;
12use substrait::proto::{
13    AggregateRel, FetchRel, FilterRel, JoinRel, Plan, PlanRel, ProjectRel, ReadRel, Rel, RelRoot,
14    SortRel, plan_rel,
15};
16
17use crate::extensions::{ExtensionRegistry, ExtensionType, SimpleExtensions, simple};
18use crate::parser::common::{MessageParseError, ParsePair};
19use crate::parser::errors::{ParseContext, ParseError, ParseResult};
20use crate::parser::expressions::Name;
21use crate::parser::extensions::{
22    AdvExtInvocation, ExtensionInvocation, ExtensionParseError, ExtensionParser,
23};
24use crate::parser::relations::RelationParsingContext;
25use crate::parser::{ErrorKind, ExpressionParser, RelationParsePair, Rule, unwrap_single_pair};
26
27pub const PLAN_HEADER: &str = "=== Plan";
28
29/// Represents an input line, trimmed of leading two-space indents and final
30/// whitespace. Contains the number of indents and the trimmed line.
31#[derive(Debug, Clone, Copy, PartialEq, Eq)]
32pub struct IndentedLine<'a>(pub usize, pub &'a str);
33
34impl<'a> From<&'a str> for IndentedLine<'a> {
35    fn from(line: &'a str) -> Self {
36        let line = line.trim_end();
37        let mut spaces = 0;
38        for c in line.chars() {
39            if c == ' ' {
40                spaces += 1;
41            } else {
42                break;
43            }
44        }
45
46        let indents = spaces / 2;
47
48        let (_, trimmed) = line.split_at(indents * 2);
49
50        IndentedLine(indents, trimmed)
51    }
52}
53
54/// An advanced-extension annotation (`+ Enh:Name[args]` or `+ Opt:Name[args]`)
55/// that is attached to a relation node. The `pair` holds the `adv_extension`
56/// grammar rule directly (already unwrapped from the outer `planNode`).
57#[derive(Debug, Clone)]
58pub struct AdvExt<'a> {
59    pub pair: pest::iterators::Pair<'a, Rule>, // Rule::adv_extension
60    pub line_no: i64,
61}
62
63/// A relation node in the plan tree, before conversion to a Substrait proto.
64#[derive(Debug, Clone)]
65pub struct RelationNode<'a> {
66    pub pair: pest::iterators::Pair<'a, Rule>,
67    pub line_no: i64,
68    pub adv_extensions: Vec<AdvExt<'a>>,
69    pub children: Vec<RelationNode<'a>>,
70}
71
72impl<'a> RelationNode<'a> {
73    pub fn context(&self) -> ParseContext {
74        ParseContext {
75            line_no: self.line_no,
76            line: self.pair.as_str().to_string(),
77        }
78    }
79}
80
81/// A parsed plan line: either a relation or an advanced-extension annotation.
82///
83/// Classification happens at construction time by inspecting the inner grammar
84/// rule, so downstream code can use standard Rust pattern matching rather than
85/// runtime rule inspection.
86#[derive(Debug, Clone)]
87pub enum LineNode<'a> {
88    Relation(RelationNode<'a>),
89    AdvExt(AdvExt<'a>),
90}
91
92impl<'a> LineNode<'a> {
93    pub fn parse(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
94        let mut pairs: pest::iterators::Pairs<'a, Rule> =
95            <ExpressionParser as pest::Parser<Rule>>::parse(Rule::planNode, line).map_err(|e| {
96                ParseError::Plan(
97                    ParseContext {
98                        line_no,
99                        line: line.to_string(),
100                    },
101                    MessageParseError::new("planNode", ErrorKind::InvalidValue, Box::new(e)),
102                )
103            })?;
104
105        let outer = pairs.next().unwrap();
106        assert!(pairs.next().is_none()); // Should be exactly one pair
107        let inner = unwrap_single_pair(outer);
108
109        Ok(if inner.as_rule() == Rule::adv_extension {
110            LineNode::AdvExt(AdvExt {
111                pair: inner,
112                line_no,
113            })
114        } else {
115            LineNode::Relation(RelationNode {
116                pair: inner,
117                line_no,
118                adv_extensions: Vec::new(),
119                children: Vec::new(),
120            })
121        })
122    }
123
124    /// Parse the line as a top-level relation at depth 0 (either root_relation or regular relation)
125    pub fn parse_root(line: &'a str, line_no: i64) -> Result<Self, ParseError> {
126        let mut pairs: pest::iterators::Pairs<'a, Rule> = <ExpressionParser as pest::Parser<
127            Rule,
128        >>::parse(
129            Rule::top_level_relation, line
130        )
131        .map_err(|e| {
132            ParseError::Plan(
133                ParseContext::new(line_no, line.to_string()),
134                MessageParseError::new("top_level_relation", ErrorKind::Syntax, Box::new(e)),
135            )
136        })?;
137
138        let outer = pairs.next().unwrap();
139        assert!(pairs.next().is_none());
140
141        // top_level_relation is either root_relation or planNode.
142        // If planNode, unwrap one more level to obtain the specific relation rule.
143        let inner = unwrap_single_pair(outer);
144        let pair = if inner.as_rule() == Rule::planNode {
145            unwrap_single_pair(inner)
146        } else {
147            inner // root_relation
148        };
149
150        // planNode can technically include adv_extension; surface it as AdvExt so
151        // TreeBuilder::add_line can produce the appropriate depth-0 error.
152        if pair.as_rule() == Rule::adv_extension {
153            return Ok(LineNode::AdvExt(AdvExt { pair, line_no })); // Where is the error produced or checked???
154        }
155
156        Ok(LineNode::Relation(RelationNode {
157            pair,
158            line_no,
159            adv_extensions: Vec::new(),
160            children: Vec::new(),
161        }))
162    }
163}
164
165/// Return the number of output columns for a relation.
166///
167/// Extension relations are excluded here intentionally: their output column
168/// count is carried as the `usize` in the `(Rel, usize)` returned by
169/// `build_rel` / `parse_extension_relation`, so they never reach this function.
170fn get_output_field_count(rel: &Rel) -> usize {
171    // An explicit emit mapping defines the exact emitted column count regardless
172    // of relation type
173    if let Some(EmitKind::Emit(e)) = get_emit_kind(rel) {
174        return e.output_mapping.len();
175    }
176
177    match &rel.rel_type {
178        Some(RelType::Read(read_rel)) => {
179            // Read embeds the schema directly — count the typed fields.
180            if let Some(schema) = read_rel.base_schema.as_ref()
181                && let Some(s) = schema.r#struct.as_ref()
182            {
183                return s.types.len();
184            }
185            0
186        }
187        Some(RelType::Filter(filter_rel)) => {
188            // Filter passes all input columns through unchanged.
189            if let Some(input) = filter_rel.input.as_ref() {
190                return get_output_field_count(input);
191            }
192            0
193        }
194        Some(RelType::Project(project_rel)) => {
195            // Without an emit, Project's direct output is all input columns
196            // followed by all computed expressions.
197            if let Some(input) = project_rel.input.as_ref() {
198                return get_output_field_count(input) + project_rel.expressions.len();
199            }
200            0
201        }
202        Some(RelType::Sort(sort_rel)) => {
203            // Sort passes all input columns through unchanged.
204            if let Some(input) = sort_rel.input.as_ref() {
205                return get_output_field_count(input);
206            }
207            0
208        }
209        Some(RelType::Fetch(fetch_rel)) => {
210            // Fetch (LIMIT/OFFSET) passes all input columns through unchanged.
211            if let Some(input) = fetch_rel.input.as_ref() {
212                return get_output_field_count(input);
213            }
214            0
215        }
216        Some(RelType::Join(join_rel)) => {
217            // Join concatenates the columns of its two inputs.
218            let left = join_rel.left.as_deref().map_or(0, get_output_field_count);
219            let right = join_rel.right.as_deref().map_or(0, get_output_field_count);
220            left + right
221        }
222        Some(RelType::Aggregate(agg_rel)) => {
223            // Aggregate's direct output is the grouping expressions followed
224            // by the measure results.
225            agg_rel.grouping_expressions.len() + agg_rel.measures.len()
226        }
227        _ => unimplemented!("TODO: implement this for the specific type"),
228    }
229}
230
231/// Extract the emit kind from a relation's `common` field, covering all
232/// standard relation variants that carry it.
233fn get_emit_kind(rel: &Rel) -> Option<&EmitKind> {
234    match &rel.rel_type {
235        Some(RelType::Read(r)) => r.common.as_ref(),
236        Some(RelType::Filter(r)) => r.common.as_ref(),
237        Some(RelType::Project(r)) => r.common.as_ref(),
238        Some(RelType::Sort(r)) => r.common.as_ref(),
239        Some(RelType::Fetch(r)) => r.common.as_ref(),
240        Some(RelType::Join(r)) => r.common.as_ref(),
241        Some(RelType::Aggregate(r)) => r.common.as_ref(),
242        _ => unimplemented!("TODO: implement this for the specific type"),
243    }
244    .and_then(|c| c.emit_kind.as_ref())
245}
246
247/// Set the `advanced_extension` field on a [`Rel`], covering all standard
248/// relation variants that carry the field directly.
249fn set_advanced_extension(rel: &mut Rel, adv_ext: AdvancedExtension) {
250    match &mut rel.rel_type {
251        Some(RelType::Read(r)) => r.advanced_extension = Some(adv_ext),
252        Some(RelType::Filter(r)) => r.advanced_extension = Some(adv_ext),
253        Some(RelType::Project(r)) => r.advanced_extension = Some(adv_ext),
254        Some(RelType::Aggregate(r)) => r.advanced_extension = Some(adv_ext),
255        Some(RelType::Sort(r)) => r.advanced_extension = Some(adv_ext),
256        Some(RelType::Fetch(r)) => r.advanced_extension = Some(adv_ext),
257        Some(RelType::Join(r)) => r.advanced_extension = Some(adv_ext),
258        Some(RelType::ExtensionLeaf(_)) => {
259            unreachable!("Extension types do not have advanced extensions defined on them")
260        }
261        Some(RelType::ExtensionSingle(_)) => {
262            unreachable!("Extension types do not have advanced extensions defined on them")
263        }
264        Some(RelType::ExtensionMulti(_)) => {
265            unreachable!("Extension types do not have advanced extensions defined on them")
266        }
267        _ => {}
268    }
269}
270
271#[derive(Copy, Clone, Debug)]
272pub enum State {
273    // The initial state, before we have parsed any lines.
274    Initial,
275    // The extensions section, after parsing the header and any other Extension lines.
276    Extensions,
277    // The plan section, after parsing the header and any other Plan lines.
278    Plan,
279}
280
281impl fmt::Display for State {
282    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
283        write!(f, "{self:?}")
284    }
285}
286
287// An in-progress tree builder, building the tree of relations.
288#[derive(Debug, Clone, Default)]
289pub struct TreeBuilder<'a> {
290    // Current tree of nodes being built. These have been successfully parsed
291    // into Pest pairs, but have not yet been converted to substrait plans.
292    current: Option<RelationNode<'a>>,
293    // Completed trees that have been built.
294    completed: Vec<RelationNode<'a>>,
295}
296
297impl<'a> TreeBuilder<'a> {
298    /// Traverse down the tree, always taking the last child at each level, until reaching the specified depth.
299    pub fn get_at_depth(&mut self, depth: usize) -> Option<&mut RelationNode<'a>> {
300        let mut node = self.current.as_mut()?;
301        for _ in 0..depth {
302            node = node.children.last_mut()?;
303        }
304        Some(node)
305    }
306
307    pub fn add_line(&mut self, depth: usize, node: LineNode<'a>) -> Result<(), ParseError> {
308        match node {
309            LineNode::Relation(rel_node) => {
310                if depth == 0 {
311                    if let Some(prev) = self.current.take() {
312                        self.completed.push(prev);
313                    }
314                    self.current = Some(rel_node);
315                    return Ok(());
316                }
317
318                let parent = match self.get_at_depth(depth - 1) {
319                    None => {
320                        return Err(ParseError::Plan(
321                            rel_node.context(),
322                            MessageParseError::invalid(
323                                "relation",
324                                rel_node.pair.as_span(),
325                                format!("No parent found for depth {depth}"),
326                            ),
327                        ));
328                    }
329                    Some(parent) => parent,
330                };
331
332                parent.children.push(rel_node);
333            }
334            LineNode::AdvExt(adv_ext) => {
335                if depth == 0 {
336                    return Err(ParseError::ValidationError(format!(
337                        "line {}: advanced extension annotations cannot appear at the top level",
338                        adv_ext.line_no
339                    )));
340                }
341
342                let parent = match self.get_at_depth(depth - 1) {
343                    None => {
344                        return Err(ParseError::ValidationError(format!(
345                            "line {}: no parent found for advanced extension at depth {depth}",
346                            adv_ext.line_no
347                        )));
348                    }
349                    Some(parent) => parent,
350                };
351
352                if !parent.children.is_empty() {
353                    return Err(ParseError::ValidationError(format!(
354                        "line {}: advanced extension annotations (+ Enh: / + Opt:) must \
355                         appear before child relations, not after",
356                        adv_ext.line_no
357                    )));
358                }
359
360                parent.adv_extensions.push(adv_ext);
361            }
362        }
363        Ok(())
364    }
365
366    /// End of input - move any remaining nodes from stack to completed and
367    /// return any trees in progress. Resets the builder to its initial state
368    /// (empty)
369    /// Move any remaining nodes from stack to completed
370    pub fn finish(&mut self) -> Vec<RelationNode<'a>> {
371        if let Some(node) = self.current.take() {
372            self.completed.push(node);
373        }
374        std::mem::take(&mut self.completed)
375    }
376}
377
378// Relation parsing component - handles converting LineNodes to Relations
379#[derive(Debug, Clone, Default)]
380pub struct RelationParser<'a> {
381    tree: TreeBuilder<'a>,
382}
383
384impl<'a> RelationParser<'a> {
385    pub fn parse_line(&mut self, line: IndentedLine<'a>, line_no: i64) -> Result<(), ParseError> {
386        let IndentedLine(depth, line) = line;
387
388        // Use parse_root for depth 0 (top-level relations), parse for other depths
389        let node = if depth == 0 {
390            LineNode::parse_root(line, line_no)?
391        } else {
392            LineNode::parse(line, line_no)?
393        };
394
395        self.tree.add_line(depth, node)
396    }
397
398    /// Parse a relation from a Pest pair of rule 'relation' into a Substrait
399    /// Rel, together with its output column count.
400    ///
401    /// The `usize` in the return value is the number of columns this relation
402    /// outputs.  Callers accumulate these counts across children and pass the
403    /// sum in as `input_field_count` so that relations like `ProjectRel` can
404    /// assign correct emit-mapping indices to their computed expressions.
405    //
406    // Clippy says a Vec<Box<…>> is unnecessary, as the Vec is already on the
407    // heap, but this is what the protobuf requires so we allow it here
408    #[allow(clippy::vec_box)]
409    fn parse_relation(
410        &self,
411        extensions: &SimpleExtensions,
412        registry: &ExtensionRegistry,
413        line_no: i64,
414        pair: pest::iterators::Pair<Rule>,
415        child_relations: Vec<Box<substrait::proto::Rel>>,
416        input_field_count: usize,
417    ) -> Result<(substrait::proto::Rel, usize), ParseError> {
418        // `pair` is already the specific relation rule (e.g. Rule::project_relation),
419        // unwrapped from the outer planNode during LineNode construction.
420        let (e, r, l, p_inner, cr, ic) = (
421            extensions,
422            registry,
423            line_no,
424            pair,
425            child_relations,
426            input_field_count,
427        );
428
429        match p_inner.as_rule() {
430            Rule::read_relation => self.parse_rel::<ReadRel>(e, l, p_inner, cr, ic),
431            Rule::filter_relation => self.parse_rel::<FilterRel>(e, l, p_inner, cr, ic),
432            Rule::project_relation => self.parse_rel::<ProjectRel>(e, l, p_inner, cr, ic),
433            Rule::aggregate_relation => self.parse_rel::<AggregateRel>(e, l, p_inner, cr, ic),
434            Rule::sort_relation => self.parse_rel::<SortRel>(e, l, p_inner, cr, ic),
435            Rule::fetch_relation => self.parse_rel::<FetchRel>(e, l, p_inner, cr, ic),
436            Rule::join_relation => self.parse_rel::<JoinRel>(e, l, p_inner, cr, ic),
437            Rule::extension_relation => self.parse_extension_relation(e, r, l, p_inner, cr),
438            _ => unreachable!("unhandled relation rule: {:?}", p_inner.as_rule()),
439        }
440    }
441
442    /// Parse a specific relation type, returning the relation and its output column count.
443    ///
444    /// The count is derived from the just-built relation via `get_output_field_count`.
445    /// Extension relations are handled separately in `parse_extension_relation` because
446    /// their count comes from the user-supplied output-columns list, not from a schema
447    /// embedded in the protobuf.
448    // Box is needed because Rel is a large enum and we need to pass ownership
449    // through the RelationParsePair trait, which requires Box<Rel>.
450    #[allow(clippy::vec_box)]
451    fn parse_rel<T: RelationParsePair>(
452        &self,
453        extensions: &SimpleExtensions,
454        line_no: i64,
455        pair: pest::iterators::Pair<Rule>,
456        child_relations: Vec<Box<substrait::proto::Rel>>,
457        input_field_count: usize,
458    ) -> Result<(substrait::proto::Rel, usize), ParseError> {
459        assert_eq!(pair.as_rule(), T::rule());
460
461        let line = pair.as_str();
462
463        let rel_type =
464            T::parse_pair_with_context(extensions, pair, child_relations, input_field_count);
465
466        match rel_type {
467            Ok(rel) => {
468                let as_rel = rel.into_rel();
469                let count = get_output_field_count(&as_rel);
470                Ok((as_rel, count))
471            }
472            Err(e) => Err(ParseError::Plan(
473                ParseContext::new(line_no, line.to_string()),
474                e,
475            )),
476        }
477    }
478
479    /// Parse extension relations.
480    /// Extension relations need the full parsing context for registry lookups and warning handling.
481    // Box is needed because Rel is a large enum and we need to pass ownership
482    // through the RelationParsePair trait, which requires Box<Rel>.
483    #[allow(clippy::vec_box)]
484    fn parse_extension_relation(
485        &self,
486        extensions: &SimpleExtensions,
487        registry: &ExtensionRegistry,
488        line_no: i64,
489        pair: pest::iterators::Pair<Rule>,
490        child_relations: Vec<Box<substrait::proto::Rel>>,
491    ) -> Result<(substrait::proto::Rel, usize), ParseError> {
492        assert_eq!(pair.as_rule(), Rule::extension_relation);
493
494        let line = pair.as_str();
495        let pair_span = pair.as_span();
496
497        // Parse extension invocation, which includes the user-provided name
498        let ExtensionInvocation {
499            name,
500            args: extension_args,
501        } = ExtensionInvocation::parse_pair(pair);
502
503        // Validate child count matches relation type
504        let child_count = child_relations.len();
505        extension_args
506            .relation_type
507            .validate_child_count(child_count)
508            .map_err(|e| {
509                ParseError::Plan(
510                    ParseContext::new(line_no, line.to_string()),
511                    MessageParseError::invalid("extension_relation", pair_span, e),
512                )
513            })?;
514
515        let context = RelationParsingContext {
516            extensions,
517            registry,
518            line_no,
519            line,
520        };
521
522        let detail = context.resolve_extension_detail(&name, &extension_args)?;
523        // Capture the output column count before consuming extension_args, so
524        // it can be returned to the caller.
525        let output_column_count = extension_args.output_columns.len();
526
527        let rel = extension_args
528            .relation_type
529            .create_rel(detail, child_relations)
530            .map_err(|e| {
531                ParseError::Plan(
532                    ParseContext::new(line_no, line.to_string()),
533                    MessageParseError::invalid("extension_relation", pair_span, e),
534                )
535            })?;
536
537        Ok((rel, output_column_count))
538    }
539
540    /// Convert a [`RelationNode`] into a Substrait Rel, together with its output column count.
541    /// The returned `usize` is the number of columns this relation emits.
542    fn build_rel(
543        &self,
544        extensions: &SimpleExtensions,
545        registry: &ExtensionRegistry,
546        node: RelationNode,
547    ) -> Result<(substrait::proto::Rel, usize), ParseError> {
548        let mut child_relations: Vec<Box<Rel>> = Vec::new();
549        let mut input_field_count: usize = 0;
550
551        for child in node.children {
552            let (rel, count) = self.build_rel(extensions, registry, child)?;
553            input_field_count += count;
554            child_relations.push(Box::new(rel));
555        }
556
557        let (mut rel, output_count) = self.parse_relation(
558            extensions,
559            registry,
560            node.line_no,
561            node.pair,
562            child_relations,
563            input_field_count,
564        )?;
565
566        if !node.adv_extensions.is_empty() {
567            let adv_ext =
568                self.build_advanced_extension(extensions, registry, node.adv_extensions)?;
569            set_advanced_extension(&mut rel, adv_ext);
570        }
571
572        Ok((rel, output_count))
573    }
574
575    /// Parse a list of [`AdvExt`] nodes into an [`AdvancedExtension`] proto.
576    fn build_advanced_extension(
577        &self,
578        extensions: &SimpleExtensions,
579        registry: &ExtensionRegistry,
580        adv_exts: Vec<AdvExt>,
581    ) -> Result<AdvancedExtension, ParseError> {
582        let mut enhancement = None;
583        let mut optimizations = Vec::new();
584
585        for adv_ext in adv_exts {
586            let line_no = adv_ext.line_no;
587            let line = adv_ext.pair.as_str().to_string();
588            let invocation = AdvExtInvocation::parse_pair(adv_ext.pair);
589            let context = RelationParsingContext {
590                extensions,
591                registry,
592                line_no,
593                line: &line,
594            };
595
596            match invocation.ext_type {
597                ExtensionType::Enhancement => {
598                    let detail = context.resolve_adv_ext_detail(
599                        ExtensionType::Enhancement,
600                        &invocation.name,
601                        &invocation.args,
602                    )?;
603                    if enhancement.is_some() {
604                        return Err(ParseError::ValidationError(
605                            "at most one enhancement per relation is allowed".to_string(),
606                        ));
607                    }
608                    enhancement = Some(detail.into());
609                }
610                ExtensionType::Optimization => {
611                    let detail = context.resolve_adv_ext_detail(
612                        ExtensionType::Optimization,
613                        &invocation.name,
614                        &invocation.args,
615                    )?;
616                    optimizations.push(detail.into());
617                }
618                ExtensionType::Relation => {
619                    unreachable!("Grammar restricts adv_ext_type to 'Enh' or 'Opt'")
620                }
621            }
622        }
623
624        Ok(AdvancedExtension {
625            enhancement,
626            optimization: optimizations,
627        })
628    }
629
630    /// Build a tree of relations.
631    fn build_plan_rel(
632        &self,
633        extensions: &SimpleExtensions,
634        registry: &ExtensionRegistry,
635        node: RelationNode,
636    ) -> Result<PlanRel, ParseError> {
637        // Plain relations are allowed as root relations; they just don't have names.
638        if node.pair.as_rule() != Rule::root_relation {
639            let (rel, _) = self.build_rel(extensions, registry, node)?;
640            return Ok(PlanRel {
641                rel_type: Some(plan_rel::RelType::Rel(rel)),
642            });
643        }
644
645        // Named root relation.
646        let context = node.context();
647        let span = node.pair.as_span();
648
649        // Parse the column names
650        let column_names_pair = unwrap_single_pair(node.pair);
651        assert_eq!(column_names_pair.as_rule(), Rule::root_name_list);
652
653        let names: Vec<String> = column_names_pair
654            .into_inner()
655            .map(|name_pair| {
656                assert_eq!(name_pair.as_rule(), Rule::name);
657                Name::parse_pair(name_pair).0
658            })
659            .collect();
660
661        let mut children = node.children;
662        let child = match children.len() {
663            1 => {
664                let (rel, _) = self.build_rel(extensions, registry, children.pop().unwrap())?;
665                rel
666            }
667            n => {
668                return Err(ParseError::Plan(
669                    context,
670                    MessageParseError::invalid(
671                        "root_relation",
672                        span,
673                        format!("Root relation must have exactly one child, found {n}"),
674                    ),
675                ));
676            }
677        };
678
679        Ok(PlanRel {
680            rel_type: Some(plan_rel::RelType::Root(RelRoot {
681                names,
682                input: Some(child),
683            })),
684        })
685    }
686
687    /// Build all the trees.
688    fn build(
689        mut self,
690        extensions: &SimpleExtensions,
691        registry: &ExtensionRegistry,
692    ) -> Result<Vec<PlanRel>, ParseError> {
693        let nodes = self.tree.finish();
694        nodes
695            .into_iter()
696            .map(|n| self.build_plan_rel(extensions, registry, n))
697            .collect::<Result<Vec<PlanRel>, ParseError>>()
698    }
699}
700
701/// A parser for Substrait query plans in text format.
702///
703/// The `Parser` converts human-readable Substrait text format into Substrait
704/// protobuf plans. It handles both the extensions section (which defines
705/// functions, types, etc.) and the plan section (which defines the actual query
706/// structure).
707///
708/// ## Usage
709///
710/// The simplest entry point is the static `parse()` method:
711///
712/// ```rust
713/// use substrait_explain::parser::Parser;
714///
715/// let plan_text = r#"
716/// === Plan
717/// Root[c, d]
718///   Project[$1, 42]
719///     Read[schema.table => a:i64, b:string?]
720/// "#;
721///
722/// let plan = Parser::parse(plan_text).unwrap();
723/// ```
724///
725/// ## Input Format
726///
727/// The parser expects input in the following format:
728///
729/// ```text
730/// === Extensions
731/// URNs:
732///   @  1: https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml
733/// Functions:
734///   # 10 @  1: add
735/// === Plan
736/// Root[columns]
737///   Relation[arguments => columns]
738///     ChildRelation[arguments => columns]
739/// ```
740///
741/// - **Extensions section** (optional): Defines URNs and function/type declarations
742/// - **Plan section** (required): Defines the query structure with indented relations
743///
744/// ## Error Handling
745///
746/// The parser provides detailed error information including:
747/// - Line number where the error occurred
748/// - The actual line content that failed to parse
749/// - Specific error type and description
750///
751/// ```rust
752/// use substrait_explain::parser::Parser;
753///
754/// let invalid_plan = r#"
755/// === Plan
756/// InvalidRelation[invalid syntax]
757/// "#;
758///
759/// match Parser::parse(invalid_plan) {
760///     Ok(plan) => println!("Successfully parsed"),
761///     Err(e) => eprintln!("Parse error: {}", e),
762/// }
763/// ```
764///
765/// ## Supported Relations
766///
767/// The parser supports all standard Substrait relations:
768/// - `Read[table => columns]` - Read from a table
769/// - `Project[expressions]` - Project columns/expressions
770/// - `Filter[condition => columns]` - Filter rows
771/// - `Root[columns]` - Root relation with output columns
772/// - And more...
773///
774/// ## Extensions Support
775///
776/// The parser fully supports Substrait Simple Extensions, allowing you to:
777/// - Define custom functions with URNs and anchors
778/// - Reference functions by name in expressions
779/// - Use custom types and type variations
780///
781/// ```rust
782/// use substrait_explain::parser::Parser;
783///
784/// let plan_with_extensions = r#"
785/// === Extensions
786/// URNs:
787///   @  1: https://example.com/functions.yaml
788/// Functions:
789///   ## 10 @  1: my_custom_function
790/// === Plan
791/// Root[result]
792///   Project[my_custom_function($0, $1)]
793///     Read[table => col1:i32, col2:i32]
794/// "#;
795///
796/// let plan = Parser::parse(plan_with_extensions).unwrap();
797/// ```
798///
799/// ## Performance
800///
801/// The parser is designed for efficiency:
802/// - Single-pass parsing with minimal allocations
803/// - Early error detection and reporting
804/// - Memory-efficient tree building
805///
806/// ## Thread Safety
807///
808/// `Parser` instances are not thread-safe and should not be shared between threads.
809/// However, the static `parse()` method is safe to call from multiple threads.
810#[derive(Debug)]
811pub struct Parser<'a> {
812    line_no: i64,
813    state: State,
814    extension_parser: ExtensionParser,
815    extension_registry: ExtensionRegistry,
816    relation_parser: RelationParser<'a>,
817}
818impl<'a> Default for Parser<'a> {
819    fn default() -> Self {
820        Self::new()
821    }
822}
823
824impl<'a> Parser<'a> {
825    /// Parse a Substrait plan from text format.
826    ///
827    /// This is the main entry point for parsing.
828    ///
829    /// The input should be in the Substrait text format, which consists of:
830    /// - An optional extensions section starting with "=== Extensions"
831    /// - A plan section starting with "=== Plan"
832    /// - Indented relation definitions
833    ///
834    /// # Examples
835    ///
836    /// Simple parsing:
837    /// ```rust
838    /// use substrait_explain::parser::Parser;
839    ///
840    /// let plan_text = r#"
841    /// === Plan
842    /// Root[result]
843    ///   Read[table => col:i32]
844    /// "#;
845    ///
846    /// let plan = Parser::parse(plan_text).unwrap();
847    /// assert_eq!(plan.relations.len(), 1);
848    /// ```
849    ///
850    /// # Errors
851    ///
852    /// Returns a [`ParseError`] if the input cannot be parsed.
853    pub fn parse(input: &str) -> ParseResult {
854        Self::new().parse_plan(input)
855    }
856
857    /// Create a new parser with default configuration.
858    pub fn new() -> Self {
859        Self {
860            line_no: 1,
861            state: State::Initial,
862            extension_parser: ExtensionParser::default(),
863            extension_registry: ExtensionRegistry::new(),
864            relation_parser: RelationParser::default(),
865        }
866    }
867
868    /// Configure the parser to use the specified extension registry.
869    pub fn with_extension_registry(mut self, registry: ExtensionRegistry) -> Self {
870        self.extension_registry = registry;
871        self
872    }
873
874    /// Parse a Substrait plan with the current parser configuration.
875    pub fn parse_plan(mut self, input: &'a str) -> ParseResult {
876        for line in input.lines() {
877            if line.trim().is_empty() {
878                self.line_no += 1;
879                continue;
880            }
881
882            self.parse_line(line)?;
883            self.line_no += 1;
884        }
885
886        let plan = self.build_plan()?;
887        Ok(plan)
888    }
889
890    /// Parse a single line of input.
891    fn parse_line(&mut self, line: &'a str) -> Result<(), ParseError> {
892        let indented_line = IndentedLine::from(line);
893        let line_no = self.line_no;
894        let ctx = || ParseContext {
895            line_no,
896            line: line.to_string(),
897        };
898
899        match self.state {
900            State::Initial => self.parse_initial(indented_line),
901            State::Extensions => self
902                .parse_extensions(indented_line)
903                .map_err(|e| ParseError::Extension(ctx(), e)),
904            State::Plan => {
905                let IndentedLine(depth, line_str) = indented_line;
906
907                // Parse the line
908                let node = if depth == 0 {
909                    LineNode::parse_root(line_str, line_no)?
910                } else {
911                    LineNode::parse(line_str, line_no)?
912                };
913
914                self.relation_parser.tree.add_line(depth, node)
915            }
916        }
917    }
918
919    /// Parse the initial line(s) of the input, which is either a blank line or
920    /// the extensions or plan header.
921    fn parse_initial(&mut self, line: IndentedLine) -> Result<(), ParseError> {
922        match line {
923            IndentedLine(0, l) if l.trim().is_empty() => {}
924            IndentedLine(0, simple::EXTENSIONS_HEADER) => {
925                self.state = State::Extensions;
926            }
927            IndentedLine(0, PLAN_HEADER) => {
928                self.state = State::Plan;
929            }
930            IndentedLine(n, l) => {
931                return Err(ParseError::Initial(
932                    ParseContext::new(n as i64, l.to_string()),
933                    MessageParseError::invalid(
934                        "initial",
935                        pest::Span::new(l, 0, l.len()).expect("Invalid span?!"),
936                        format!("Unknown initial line: {l:?}"),
937                    ),
938                ));
939            }
940        }
941        Ok(())
942    }
943
944    /// Parse a single line from the extensions section of the input, updating
945    /// the parser state.
946    fn parse_extensions(&mut self, line: IndentedLine<'_>) -> Result<(), ExtensionParseError> {
947        if line == IndentedLine(0, PLAN_HEADER) {
948            self.state = State::Plan;
949            return Ok(());
950        }
951        self.extension_parser.parse_line(line)
952    }
953
954    /// Build the plan from the parser state with warning collection.
955    fn build_plan(self) -> Result<Plan, ParseError> {
956        let Parser {
957            relation_parser,
958            extension_parser,
959            extension_registry,
960            ..
961        } = self;
962
963        let extensions = extension_parser.extensions();
964
965        // Parse the tree into relations
966        let root_relations = relation_parser.build(extensions, &extension_registry)?;
967
968        // Build the final plan
969        Ok(Plan {
970            extension_urns: extensions.to_extension_urns(),
971            extensions: extensions.to_extension_declarations(),
972            relations: root_relations,
973            ..Default::default()
974        })
975    }
976}
977
978#[cfg(test)]
979mod tests {
980    use substrait::proto::extensions::simple_extension_declaration::MappingType;
981
982    use super::*;
983    use crate::extensions::simple::ExtensionKind;
984    use crate::parser::extensions::ExtensionParserState;
985
986    #[test]
987    fn test_parse_basic_block() {
988        let mut expected_extensions = SimpleExtensions::new();
989        expected_extensions
990            .add_extension_urn("/urn/common".to_string(), 1)
991            .unwrap();
992        expected_extensions
993            .add_extension_urn("/urn/specific_funcs".to_string(), 2)
994            .unwrap();
995        expected_extensions
996            .add_extension(ExtensionKind::Function, 1, 10, "func_a".to_string())
997            .unwrap();
998        expected_extensions
999            .add_extension(ExtensionKind::Function, 2, 11, "func_b_special".to_string())
1000            .unwrap();
1001        expected_extensions
1002            .add_extension(ExtensionKind::Type, 1, 20, "SomeType".to_string())
1003            .unwrap();
1004        expected_extensions
1005            .add_extension(ExtensionKind::TypeVariation, 2, 30, "VarX".to_string())
1006            .unwrap();
1007
1008        let mut parser = ExtensionParser::default();
1009        let input_block = r#"
1010URNs:
1011  @  1: /urn/common
1012  @  2: /urn/specific_funcs
1013Functions:
1014  # 10 @  1: func_a
1015  # 11 @  2: func_b_special
1016Types:
1017  # 20 @  1: SomeType
1018Type Variations:
1019  # 30 @  2: VarX
1020"#;
1021
1022        for line_str in input_block.trim().lines() {
1023            parser
1024                .parse_line(IndentedLine::from(line_str))
1025                .unwrap_or_else(|e| panic!("Failed to parse line \'{line_str}\': {e:?}"));
1026        }
1027
1028        assert_eq!(*parser.extensions(), expected_extensions);
1029
1030        let extensions_str = parser.extensions().to_string("  ");
1031        // The writer adds the header; the ExtensionParser does not parse the
1032        // header, so we add it here for comparison.
1033        let expected_str = format!(
1034            "{}\n{}",
1035            simple::EXTENSIONS_HEADER,
1036            input_block.trim_start()
1037        );
1038        assert_eq!(extensions_str.trim(), expected_str.trim());
1039        // Check final state after all lines are processed.
1040        // The last significant line in input_block is a TypeVariation declaration.
1041        assert_eq!(
1042            parser.state(),
1043            ExtensionParserState::ExtensionDeclarations(ExtensionKind::TypeVariation)
1044        );
1045
1046        // Check that a subsequent blank line correctly resets state to Extensions.
1047        parser.parse_line(IndentedLine(0, "")).unwrap();
1048        assert_eq!(parser.state(), ExtensionParserState::Extensions);
1049    }
1050
1051    /// Test that we can parse a larger extensions block and it matches the input.
1052    #[test]
1053    fn test_parse_complete_extension_block() {
1054        let mut parser = ExtensionParser::default();
1055        let input_block = r#"
1056URNs:
1057  @  1: /urn/common
1058  @  2: /urn/specific_funcs
1059  @  3: /urn/types_lib
1060  @  4: /urn/variations_lib
1061Functions:
1062  # 10 @  1: func_a
1063  # 11 @  2: func_b_special
1064  # 12 @  1: func_c_common
1065Types:
1066  # 20 @  1: CommonType
1067  # 21 @  3: LibraryType
1068  # 22 @  1: AnotherCommonType
1069Type Variations:
1070  # 30 @  4: VarX
1071  # 31 @  4: VarY
1072"#;
1073
1074        for line_str in input_block.trim().lines() {
1075            parser
1076                .parse_line(IndentedLine::from(line_str))
1077                .unwrap_or_else(|e| panic!("Failed to parse line \'{line_str}\': {e:?}"));
1078        }
1079
1080        let extensions_str = parser.extensions().to_string("  ");
1081        // The writer adds the header; the ExtensionParser does not parse the
1082        // header, so we add it here for comparison.
1083        let expected_str = format!(
1084            "{}\n{}",
1085            simple::EXTENSIONS_HEADER,
1086            input_block.trim_start()
1087        );
1088        assert_eq!(extensions_str.trim(), expected_str.trim());
1089    }
1090
1091    #[test]
1092    fn test_parse_relation_tree() {
1093        // Example plan with a Project, a Filter, and a Read, nested by indentation
1094        let plan = r#"=== Plan
1095Project[$0, $1, 42, 84]
1096  Filter[$2 => $0, $1]
1097    Read[my.table => a:i32, b:string?, c:boolean]
1098"#;
1099        let mut parser = Parser::default();
1100        for line in plan.lines() {
1101            parser.parse_line(line).unwrap();
1102        }
1103
1104        // Complete the current tree to convert it to relations
1105        let plan = parser.build_plan().unwrap();
1106
1107        let root_rel = &plan.relations[0].rel_type;
1108        let first_rel = match root_rel {
1109            Some(plan_rel::RelType::Rel(rel)) => rel,
1110            _ => panic!("Expected Rel type, got {root_rel:?}"),
1111        };
1112        // Root should be Project
1113        let project = match &first_rel.rel_type {
1114            Some(RelType::Project(p)) => p,
1115            other => panic!("Expected Project at root, got {other:?}"),
1116        };
1117
1118        // Check that Project has Filter as input
1119        assert!(project.input.is_some());
1120        let filter_input = project.input.as_ref().unwrap();
1121
1122        // Check that Filter has Read as input
1123        match &filter_input.rel_type {
1124            Some(RelType::Filter(_)) => {
1125                match &filter_input.rel_type {
1126                    Some(RelType::Filter(filter)) => {
1127                        assert!(filter.input.is_some());
1128                        let read_input = filter.input.as_ref().unwrap();
1129
1130                        // Check that Read has no input (it's a leaf)
1131                        match &read_input.rel_type {
1132                            Some(RelType::Read(_)) => {}
1133                            other => panic!("Expected Read relation, got {other:?}"),
1134                        }
1135                    }
1136                    other => panic!("Expected Filter relation, got {other:?}"),
1137                }
1138            }
1139            other => panic!("Expected Filter relation, got {other:?}"),
1140        }
1141    }
1142
1143    #[test]
1144    fn test_parse_root_relation() {
1145        // Test a plan with a Root relation
1146        let plan = r#"=== Plan
1147Root[result]
1148  Project[$0, $1]
1149    Read[my.table => a:i32, b:string?]
1150"#;
1151        let mut parser = Parser::default();
1152        for line in plan.lines() {
1153            parser.parse_line(line).unwrap();
1154        }
1155
1156        let plan = parser.build_plan().unwrap();
1157
1158        // Check that we have exactly one relation
1159        assert_eq!(plan.relations.len(), 1);
1160
1161        let root_rel = &plan.relations[0].rel_type;
1162        let rel_root = match root_rel {
1163            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
1164            other => panic!("Expected Root type, got {other:?}"),
1165        };
1166
1167        // Check that the root has the correct name
1168        assert_eq!(rel_root.names, vec!["result"]);
1169
1170        // Check that the root has a Project as input
1171        let project_input = match &rel_root.input {
1172            Some(rel) => rel,
1173            None => panic!("Root should have an input"),
1174        };
1175
1176        let project = match &project_input.rel_type {
1177            Some(RelType::Project(p)) => p,
1178            other => panic!("Expected Project as root input, got {other:?}"),
1179        };
1180
1181        // Check that Project has Read as input
1182        let read_input = match &project.input {
1183            Some(rel) => rel,
1184            None => panic!("Project should have an input"),
1185        };
1186
1187        match &read_input.rel_type {
1188            Some(RelType::Read(_)) => {}
1189            other => panic!("Expected Read relation, got {other:?}"),
1190        }
1191    }
1192
1193    #[test]
1194    fn test_parse_root_relation_no_names() {
1195        // Test a plan with a Root relation with no names
1196        let plan = r#"=== Plan
1197Root[]
1198  Project[$0, $1]
1199    Read[my.table => a:i32, b:string?]
1200"#;
1201        let mut parser = Parser::default();
1202        for line in plan.lines() {
1203            parser.parse_line(line).unwrap();
1204        }
1205
1206        let plan = parser.build_plan().unwrap();
1207
1208        let root_rel = &plan.relations[0].rel_type;
1209        let rel_root = match root_rel {
1210            Some(plan_rel::RelType::Root(rel_root)) => rel_root,
1211            other => panic!("Expected Root type, got {other:?}"),
1212        };
1213
1214        // Check that the root has no names
1215        assert_eq!(rel_root.names, Vec::<String>::new());
1216    }
1217
1218    #[test]
1219    fn test_parse_full_plan() {
1220        // Test a complete Substrait plan with extensions and relations
1221        let input = r#"
1222=== Extensions
1223URNs:
1224  @  1: /urn/common
1225  @  2: /urn/specific_funcs
1226Functions:
1227  # 10 @  1: func_a
1228  # 11 @  2: func_b_special
1229Types:
1230  # 20 @  1: SomeType
1231Type Variations:
1232  # 30 @  2: VarX
1233
1234=== Plan
1235Project[$0, $1, 42, 84]
1236  Filter[$2 => $0, $1]
1237    Read[my.table => a:i32, b:string?, c:boolean]
1238"#;
1239
1240        let plan = Parser::parse(input).unwrap();
1241
1242        // Verify the plan structure
1243        assert_eq!(plan.extension_urns.len(), 2);
1244        assert_eq!(plan.extensions.len(), 4);
1245        assert_eq!(plan.relations.len(), 1);
1246
1247        // Verify extension URIs
1248        let urn1 = &plan.extension_urns[0];
1249        assert_eq!(urn1.extension_urn_anchor, 1);
1250        assert_eq!(urn1.urn, "/urn/common");
1251
1252        let urn2 = &plan.extension_urns[1];
1253        assert_eq!(urn2.extension_urn_anchor, 2);
1254        assert_eq!(urn2.urn, "/urn/specific_funcs");
1255
1256        // Verify extensions
1257        let func1 = &plan.extensions[0];
1258        match &func1.mapping_type {
1259            Some(MappingType::ExtensionFunction(f)) => {
1260                assert_eq!(f.function_anchor, 10);
1261                assert_eq!(f.extension_urn_reference, 1);
1262                assert_eq!(f.name, "func_a");
1263            }
1264            other => panic!("Expected ExtensionFunction, got {other:?}"),
1265        }
1266
1267        let func2 = &plan.extensions[1];
1268        match &func2.mapping_type {
1269            Some(MappingType::ExtensionFunction(f)) => {
1270                assert_eq!(f.function_anchor, 11);
1271                assert_eq!(f.extension_urn_reference, 2);
1272                assert_eq!(f.name, "func_b_special");
1273            }
1274            other => panic!("Expected ExtensionFunction, got {other:?}"),
1275        }
1276
1277        let type1 = &plan.extensions[2];
1278        match &type1.mapping_type {
1279            Some(MappingType::ExtensionType(t)) => {
1280                assert_eq!(t.type_anchor, 20);
1281                assert_eq!(t.extension_urn_reference, 1);
1282                assert_eq!(t.name, "SomeType");
1283            }
1284            other => panic!("Expected ExtensionType, got {other:?}"),
1285        }
1286
1287        let var1 = &plan.extensions[3];
1288        match &var1.mapping_type {
1289            Some(MappingType::ExtensionTypeVariation(v)) => {
1290                assert_eq!(v.type_variation_anchor, 30);
1291                assert_eq!(v.extension_urn_reference, 2);
1292                assert_eq!(v.name, "VarX");
1293            }
1294            other => panic!("Expected ExtensionTypeVariation, got {other:?}"),
1295        }
1296
1297        // Verify the relation tree structure
1298        let root_rel = &plan.relations[0];
1299        match &root_rel.rel_type {
1300            Some(plan_rel::RelType::Rel(rel)) => {
1301                match &rel.rel_type {
1302                    Some(RelType::Project(project)) => {
1303                        // Verify Project relation
1304                        assert_eq!(project.expressions.len(), 2); // 42 and 84
1305                        assert!(project.input.is_some()); // Should have Filter as input
1306
1307                        // Check the Filter input
1308                        let filter_input = project.input.as_ref().unwrap();
1309                        match &filter_input.rel_type {
1310                            Some(RelType::Filter(filter)) => {
1311                                assert!(filter.input.is_some()); // Should have Read as input
1312
1313                                // Check the Read input
1314                                let read_input = filter.input.as_ref().unwrap();
1315                                match &read_input.rel_type {
1316                                    Some(RelType::Read(read)) => {
1317                                        // Verify Read relation
1318                                        let schema = read.base_schema.as_ref().unwrap();
1319                                        assert_eq!(schema.names.len(), 3);
1320                                        assert_eq!(schema.names[0], "a");
1321                                        assert_eq!(schema.names[1], "b");
1322                                        assert_eq!(schema.names[2], "c");
1323
1324                                        let struct_ = schema.r#struct.as_ref().unwrap();
1325                                        assert_eq!(struct_.types.len(), 3);
1326                                    }
1327                                    other => panic!("Expected Read relation, got {other:?}"),
1328                                }
1329                            }
1330                            other => panic!("Expected Filter relation, got {other:?}"),
1331                        }
1332                    }
1333                    other => panic!("Expected Project relation, got {other:?}"),
1334                }
1335            }
1336            other => panic!("Expected Rel type, got {other:?}"),
1337        }
1338    }
1339}