substrait_explain/
json.rs

1//! The standard `serde`/pbjson JSON encoding used by Rust stores `google.protobuf.Any`
2//! fields as `{"typeUrl": "...", "value": "<base64>"}`. Go's `protojson` library uses a
3//! different encoding: `{"@type": "...", "field1": val, ...}` where the concrete message's
4//! fields are inlined. `serde_json::from_str::<Plan>` fails on Go-produced JSON because it
5//! only understands the `typeUrl/value` form.
6//!
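//! [`prost_reflect::DynamicMessage`] implements the full protobuf JSON mapping spec and
//! handles the standard `@type` form, as long as the `DescriptorPool` contains the schema
//! for every type URL referenced in the JSON. The `typeUrl`/`value` form is still decoded
//! with `serde_json` + pbjson, so [`parse_json`] accepts both forms.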
10//!
11//! This module exposes [`build_descriptor_pool`] (to construct
12//! the pool, optionally merging in extra descriptor blobs for extension types) and
13//! [`parse_json`] (to parse a JSON string into a [`Plan`] using the pool).
14//!
15//! # Example
16//!
17//! ```rust,ignore
18//! use substrait_explain::json::{build_descriptor_pool, parse_json};
19//!
20//! static MY_EXT: &[u8] = include_bytes!("my_extensions.bin");
21//! let pool = build_descriptor_pool(&[MY_EXT]).unwrap();
22//!  // Works with both Go protojson and Rust pbjson encoding.
23//! let plan = parse_json(json_str, &pool).unwrap();
24//! ```
25
26use anyhow::Context;
27use prost::Message;
28use prost_reflect::{DescriptorPool, DynamicMessage};
29use prost_types::FileDescriptorSet;
30use substrait::proto::Plan;
31
32/// Build a [`DescriptorPool`] covering the Substrait core schema plus any extra
33/// descriptor passed in.
34pub fn build_descriptor_pool(extra_descriptors: &[&[u8]]) -> anyhow::Result<DescriptorPool> {
35    let mut fds = FileDescriptorSet::decode(substrait::proto::FILE_DESCRIPTOR_SET)
36        .context("failed to decode substrait core descriptor")?;
37
38    // Descriptor blobs compiled from proto files bundle their transitive dependencies,
39    // therefore custom descriptors are likely to have repeat file names
40    // such as: google/protobuf/timestamp.proto, google/protobuf/any.proto,
41    // which are also present in substrait core protos.
42    // DescriptorPool::decode treats duplicate filenames as a hard error.
43    // Track filenames already in the set so we can skip duplicates.
44    let mut seen: std::collections::HashSet<String> =
45        fds.file.iter().map(|f| f.name().to_owned()).collect();
46
47    for blob in extra_descriptors {
48        let extra =
49            FileDescriptorSet::decode(*blob).context("failed to decode extra descriptor")?;
50        for f in extra.file {
51            if seen.insert(f.name().to_owned()) {
52                fds.file.push(f);
53            }
54        }
55    }
56
57    DescriptorPool::decode(fds.encode_to_vec().as_slice())
58        .context("failed to build descriptor pool")
59}
60
61/// - **Naive** (`{"typeUrl": "...", "value": "<base64>"}`): decoded via
62///   `serde_json` and `pbjson`.
63///   - This takes the protobuf fields of an `Any` (`type_url`, `value`) and
64///     serializes them like it would any other field. This is the 'naive'
65///     approach to JSON encoding protobufs; see
66///     <https://github.com/influxdata/pbjson/issues/2>
67/// - **Standard** (`{"@type": "...", "field": value, ...}`): decoded via
68///   `prost-reflect`
69///   - `Any` is a Well-Known Type in Protobuf, so in the standard, it has
70///     special handling: the protobuf `type_url` should become the JSON `@type`
71///     field, and other fields should be inlined. See
72///     <https://protobuf.dev/reference/protobuf/google.protobuf/#any>.
73///   - This requires the concrete type's schema to be present in `pool`.
74///
75/// The naive method is tried first (via `serde_json` + `pbjson`); we fall back
76/// to `prost-reflect`, which requires descriptors but can decode
77/// standards-correct JSON-encoded protobufs.
78pub fn parse_json(json: &str, pool: &DescriptorPool) -> anyhow::Result<Plan> {
79    // serde handles the parsing of rust pbjson
80    if let Ok(plan) = serde_json::from_str::<Plan>(json) {
81        return Ok(plan);
82    }
83
84    //  prost-reflect's JSON deserializer handles google.protobuf.Any specifically.
85    //   DynamicMessage::deserialize implements the proto3 JSON mapping
86    //   spec, which encode Any as: { "@type": "type.googleapis.com/pkg.Msg", "field1": val, ... }
87    let plan_desc = pool
88        .get_message_by_name("substrait.Plan")
89        .context("substrait.Plan not found in descriptor pool")?;
90
91    let dyn_msg =
92        DynamicMessage::deserialize(plan_desc, &mut serde_json::Deserializer::from_str(json))
93            .context("failed to parse JSON as substrait.Plan")?;
94
95    Plan::decode(dyn_msg.encode_to_vec().as_slice())
96        .context("failed to decode Plan from dynamic message bytes")
97}