substrait_explain/json.rs
1//! The standard `serde`/pbjson JSON encoding used by Rust stores `google.protobuf.Any`
2//! fields as `{"typeUrl": "...", "value": "<base64>"}`. Go's `protojson` library uses a
3//! different encoding: `{"@type": "...", "field1": val, ...}` where the concrete message's
4//! fields are inlined. `serde_json::from_str::<Plan>` fails on Go-produced JSON because it
5//! only understands the `typeUrl/value` form.
6//!
7//! [`prost_reflect::DynamicMessage`] implements the full protobuf JSON mapping spec and
8//! handles both forms, as long as the `DescriptorPool` contains the schema for every type
9//! URL referenced in the JSON.
10//!
11//! This module exposes [`build_descriptor_pool`] (to construct
12//! the pool, optionally merging in extra descriptor blobs for extension types) and
13//! [`parse_json`] (to parse a JSON string into a [`Plan`] using the pool).
14//!
15//! # Example
16//!
17//! ```rust,ignore
18//! use substrait_explain::json::{build_descriptor_pool, parse_json};
19//!
20//! static MY_EXT: &[u8] = include_bytes!("my_extensions.bin");
21//! let pool = build_descriptor_pool(&[MY_EXT]).unwrap();
22//! // Works with both Go protojson and Rust pbjson encoding.
23//! let plan = parse_json(json_str, &pool).unwrap();
24//! ```
25
26use anyhow::Context;
27use prost::Message;
28use prost_reflect::{DescriptorPool, DynamicMessage};
29use prost_types::FileDescriptorSet;
30use substrait::proto::Plan;
31
32/// Build a [`DescriptorPool`] covering the Substrait core schema plus any extra
33/// descriptor passed in.
34pub fn build_descriptor_pool(extra_descriptors: &[&[u8]]) -> anyhow::Result<DescriptorPool> {
35 let mut fds = FileDescriptorSet::decode(substrait::proto::FILE_DESCRIPTOR_SET)
36 .context("failed to decode substrait core descriptor")?;
37
38 // Descriptor blobs compiled from proto files bundle their transitive dependencies,
39 // therefore custom descriptors are likely to have repeat file names
40 // such as: google/protobuf/timestamp.proto, google/protobuf/any.proto,
41 // which are also present in substrait core protos.
42 // DescriptorPool::decode treats duplicate filenames as a hard error.
43 // Track filenames already in the set so we can skip duplicates.
44 let mut seen: std::collections::HashSet<String> =
45 fds.file.iter().map(|f| f.name().to_owned()).collect();
46
47 for blob in extra_descriptors {
48 let extra =
49 FileDescriptorSet::decode(*blob).context("failed to decode extra descriptor")?;
50 for f in extra.file {
51 if seen.insert(f.name().to_owned()) {
52 fds.file.push(f);
53 }
54 }
55 }
56
57 DescriptorPool::decode(fds.encode_to_vec().as_slice())
58 .context("failed to build descriptor pool")
59}
60
61/// - **Naive** (`{"typeUrl": "...", "value": "<base64>"}`): decoded via
62/// `serde_json` and `pbjson`.
63/// - This takes the protobuf fields of an `Any` (`type_url`, `value`) and
64/// serializes them like it would any other field. This is the 'naive'
65/// approach to JSON encoding protobufs; see
66/// <https://github.com/influxdata/pbjson/issues/2>
67/// - **Standard** (`{"@type": "...", "field": value, ...}`): decoded via
68/// `prost-reflect`
69/// - `Any` is a Well-Known Type in Protobuf, so in the standard, it has
70/// special handling: the protobuf `type_url` should become the JSON `@type`
71/// field, and other fields should be inlined. See
72/// <https://protobuf.dev/reference/protobuf/google.protobuf/#any>.
73/// - This requires the concrete type's schema to be present in `pool`.
74///
75/// The naive method is tried first (via `serde_json` + `pbjson`); we fall back
76/// to `prost-reflect`, which requires descriptors but can decode
77/// standards-correct JSON-encoded protobufs.
78pub fn parse_json(json: &str, pool: &DescriptorPool) -> anyhow::Result<Plan> {
79 // serde handles the parsing of rust pbjson
80 if let Ok(plan) = serde_json::from_str::<Plan>(json) {
81 return Ok(plan);
82 }
83
84 // prost-reflect's JSON deserializer handles google.protobuf.Any specifically.
85 // DynamicMessage::deserialize implements the proto3 JSON mapping
86 // spec, which encode Any as: { "@type": "type.googleapis.com/pkg.Msg", "field1": val, ... }
87 let plan_desc = pool
88 .get_message_by_name("substrait.Plan")
89 .context("substrait.Plan not found in descriptor pool")?;
90
91 let dyn_msg =
92 DynamicMessage::deserialize(plan_desc, &mut serde_json::Deserializer::from_str(json))
93 .context("failed to parse JSON as substrait.Plan")?;
94
95 Plan::decode(dyn_msg.encode_to_vec().as_slice())
96 .context("failed to decode Plan from dynamic message bytes")
97}