stringtheory/
lib.rs

1//! Sharing-optimized strings and string interning utilities.
2//!
3//! `stringtheory` provides two main components: a sharing-optimized string type, `MetaString`, and string interning
4//! implementations (`FixedSizeInterner`, `GenericMapInterner`, etc). These components are meant to work in concert,
5//! allowing for using a single string type that can handle owned, shared, and interned strings, and providing a way to
6//! efficiently intern strings strings when possible.
7#![deny(warnings)]
8#![deny(missing_docs)]
9// We only support 64-bit little-endian platforms anyways, so there's no risk of our enum variants having their values truncated.
10#![allow(clippy::enum_clike_unportable_variant)]
11
12use std::{
13    borrow::Borrow, fmt, hash, mem::ManuallyDrop, ops::Deref, ptr::NonNull, slice::from_raw_parts,
14    str::from_utf8_unchecked, sync::Arc,
15};
16
17use serde::Serialize;
18
19mod clone;
20pub use self::clone::CheapMetaString;
21
22pub mod interning;
23use self::interning::{InternedString, InternedStringState};
24
25const ZERO_VALUE: usize = 0;
26const TOP_MOST_BIT: usize = usize::MAX & !(isize::MAX as usize);
27const INLINED_STR_DATA_BUF_LEN: usize = std::mem::size_of::<usize>() * 3;
28const INLINED_STR_MAX_LEN: usize = INLINED_STR_DATA_BUF_LEN - 1;
29const INLINED_STR_MAX_LEN_U8: u8 = INLINED_STR_MAX_LEN as u8;
30
31const UNION_TYPE_TAG_VALUE_STATIC: u8 = get_offset_tag_value(0);
32const UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE: u8 = get_offset_tag_value(1);
33const UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP: u8 = get_offset_tag_value(2);
34const UNION_TYPE_TAG_VALUE_SHARED: u8 = get_offset_tag_value(3);
35
36const fn get_offset_tag_value(tag: u8) -> u8 {
37    const UNION_TYPE_TAG_VALUE_BASE: u8 = INLINED_STR_MAX_LEN as u8 + 1;
38
39    if tag > (u8::MAX - INLINED_STR_MAX_LEN as u8) {
40        panic!("Union type tag value must be less than 232 to fit.");
41    }
42
43    tag + UNION_TYPE_TAG_VALUE_BASE
44}
45
46const fn get_scaled_union_tag(tag: u8) -> usize {
47    // We want to shift our tag value (single byte) to make it the top most byte in a `usize`.
48    const UNION_TYPE_TAG_VALUE_SHIFT: u32 = (std::mem::size_of::<usize>() as u32 - 1) * 8;
49
50    (tag as usize) << UNION_TYPE_TAG_VALUE_SHIFT
51}
52
53// High-level invariant checks to ensure `stringtheory` isn't being used on an unsupported platform.
54#[cfg(not(all(target_pointer_width = "64", target_endian = "little")))]
55const _INVARIANTS_CHECK: () = {
56    compile_error!("`stringtheory` is only supported on 64-bit little-endian platforms.");
57};
58
59const fn is_tagged(cap: u8) -> bool {
60    cap & 0b10000000 != 0
61}
62
63const fn tag_cap(cap: usize) -> usize {
64    cap | TOP_MOST_BIT
65}
66
67const fn untag_cap(cap: usize) -> usize {
68    cap & !(TOP_MOST_BIT)
69}
70
71#[derive(Clone, Copy, Eq, PartialEq)]
72#[repr(usize)]
73enum Zero {
74    Zero = ZERO_VALUE,
75}
76
77#[derive(Clone, Copy, Debug, Eq, PartialEq)]
78#[repr(usize)]
79enum Tag {
80    Static = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_STATIC),
81    InternedFixedSize = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE),
82    InternedGenericMap = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP),
83    Shared = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_SHARED),
84}
85
86#[repr(C)]
87#[derive(Clone, Copy)]
88struct EmptyUnion {
89    ptr: Zero, // Field one.
90    len: Zero, // Field two.
91    cap: Zero, // Field three.
92}
93
94impl EmptyUnion {
95    const fn new() -> Self {
96        Self {
97            ptr: Zero::Zero,
98            len: Zero::Zero,
99            cap: Zero::Zero,
100        }
101    }
102}
103
104#[repr(C)]
105#[derive(Clone, Copy)]
106struct OwnedUnion {
107    ptr: *mut u8, // Field one
108    len: usize,   // Field two.
109    cap: usize,   // Field three.
110}
111
112impl OwnedUnion {
113    #[inline]
114    fn as_str(&self) -> &str {
115        // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
116        // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
117        unsafe { from_utf8_unchecked(from_raw_parts(self.ptr, self.len)) }
118    }
119
120    fn into_owned(self) -> String {
121        // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
122        // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
123        unsafe { String::from_raw_parts(self.ptr, self.len, untag_cap(self.cap)) }
124    }
125}
126
127#[repr(C)]
128#[derive(Clone, Copy)]
129struct StaticUnion {
130    value: &'static str, // Fields one and two.
131    _cap: Tag,           // Field three.
132}
133
134impl StaticUnion {
135    #[inline]
136    const fn as_str(&self) -> &str {
137        self.value
138    }
139}
140
141#[repr(C)]
142struct InternedUnion {
143    state: InternedStateUnion, // Fields one and two.
144    _cap: Tag,                 // Field three.
145}
146
147impl Drop for InternedUnion {
148    fn drop(&mut self) {
149        match self._cap {
150            Tag::InternedFixedSize => {
151                let state = unsafe { ManuallyDrop::take(&mut self.state.fixed_size) };
152                drop(state);
153            }
154            Tag::InternedGenericMap => {
155                let state = unsafe { ManuallyDrop::take(&mut self.state.generic_map) };
156                drop(state);
157            }
158            _ => unreachable!(),
159        }
160    }
161}
162
163impl Clone for InternedUnion {
164    fn clone(&self) -> Self {
165        // SAFETY: We use our tag (stored in `_cap`) to determine which field to access and clone, which
166        // ensures we only access fields which are initialized.
167        let state = unsafe {
168            match self._cap {
169                Tag::InternedFixedSize => InternedStateUnion {
170                    fixed_size: self.state.fixed_size.clone(),
171                },
172                Tag::InternedGenericMap => InternedStateUnion {
173                    generic_map: self.state.generic_map.clone(),
174                },
175                _ => unreachable!(),
176            }
177        };
178
179        Self { state, _cap: self._cap }
180    }
181}
182
183#[repr(C)]
184union InternedStateUnion {
185    fixed_size: ManuallyDrop<self::interning::fixed_size::StringState>,
186    generic_map: ManuallyDrop<self::interning::map::StringState>,
187}
188
189#[repr(C)]
190struct SharedUnion {
191    ptr: NonNull<str>, // Fields one and two
192    _cap: Tag,         // Field three.
193}
194
195impl SharedUnion {
196    #[inline]
197    const fn as_str(&self) -> &str {
198        // SAFETY: The pointee is still live by virtue of being held in an `Arc`.
199        unsafe { self.ptr.as_ref() }
200    }
201}
202
203#[repr(C)]
204#[derive(Clone, Copy)]
205struct InlinedUnion {
206    // Data is arranged as 23 bytes for string data, and the remaining 1 byte for the string length.
207    data: [u8; INLINED_STR_DATA_BUF_LEN], // Fields one, two, and three.
208}
209
210impl InlinedUnion {
211    #[inline]
212    fn as_str(&self) -> &str {
213        let len = self.data[INLINED_STR_MAX_LEN] as usize;
214
215        // SAFETY: We know our data is valid UTF-8 since we only ever derive inlined strings from a valid string
216        // reference.
217        unsafe { from_utf8_unchecked(&self.data[0..len]) }
218    }
219}
220
221#[repr(C)]
222#[derive(Clone, Copy)]
223struct DiscriminantUnion {
224    unused: [u8; INLINED_STR_DATA_BUF_LEN - 1], // Fields one, two, and three, minus the last byte.
225    tag_byte: u8,                               // Last byte, overlapped with the "highest" byte of field three.
226}
227
228#[derive(Debug, Eq, PartialEq)]
229enum UnionType {
230    Empty,
231    Owned,
232    Static,
233    InternedFixedSize,
234    InternedGenericMap,
235    Inlined,
236    Shared,
237}
238
239impl UnionType {
240    #[inline]
241    const fn is_owned(&self) -> bool {
242        matches!(self, UnionType::Owned)
243    }
244}
245
246impl DiscriminantUnion {
247    #[inline]
248    const fn get_union_type(&self) -> UnionType {
249        // At a high level, we encode the type of the union into the last byte of the struct, which overlaps with the
250        // capacity of owned strings, the length of an inlined string, and the unused field of other string types.
251        //
252        // Our logic is simple here:
253        //
254        // - Allocations can only ever be as large as `isize::MAX`, which means that the top-most bit of the capacity
255        //   value would never be used for a valid allocation.
256        // - In turn, the length of a string can also only ever be as large as `isize::MAX`, which means that the
257        //   top-most bit of the length byte for any string type would never be used
258        // - Inlined strings can only ever be up to 23 bytes long, which means that the top-most bit of the length byte
259        //   for an inlined string would never be used.
260        // - Static strings and interned strings only occupy the first two fields, which means their capacity should not
261        //   be used.
262        //
263        // As such, we encode the possible string types as follows:
264        //
265        // - when all fields are zero, we have an empty string
266        // - when the last byte does have the top-bit set, we have an owned string
267        // - when the last byte does _not_ have the top-bit set, and the value is less than or equal to 23, we have an
268        //   inlined string
269        // - when the last byte does _not_ have the top-bit set, and the value is greater than 23, we interpret the
270        //   specific value of the last byte as a discriminant for the remaining string types (static, interned, etc)
271        //
272        // We abuse the layout of little endian integers here to ensure that the upper-most bits of the capacity
273        // field overlaps with the last byte of an inlined string, and the unused field of a static/interned string.
274        //
275        // In an inlined string, its length byte is situated as the very last byte (data[23]) in its layout. In an
276        // owned string, we instead have a pointer, length, and capacity, in that order. This means that the length
277        // byte of the inlined string and the capacity field of the owned string overlap, as seen below:
278        //
279        //                ~ an inlined string, "hello, world", with a length of 12 (0C) ~
280        //      ┌───────────────────────────────────────────────────────────────────────────────┐
281        //      │ 68 65 6C 6C 6F 20 77 6F    72 6C 64 21 ?? ?? ?? ??    ?? ?? ?? ?? ?? ?? ?? 0C │
282        //      └───────────────────────────────────────────────────────────────────────────────┘
283        //                                                                                    ▲
284        //                                                                                    ├──── overlapping
285        //                          ~ an owned string with a capacity of 64 ~                 ▼         byte
286        //      ┌─────────────────────────┐┌─────────────────────────┐┌─────────────────────────┐
287        //      │ ?? ?? ?? ?? ?? ?? ?? ?? ││ ?? ?? ?? ?? ?? ?? ?? ?? ││ 40 00 00 00 00 00 00 80 │
288        //      └─────────────────────────┘└─────────────────────────┘└─────────────────────────┘
289        //                                                                                    ▲
290        //                 original capacity: 64                  (0x0000000000000040)        │
291        //                 "tagged" capacity: 9223372036854775872 (0x8000000000000040)        │
292        //                                                           ▲                        │
293        //                     (tag bit) ────────────────────────────┘                        │
294        //                                                                                    │
295        //                                                                                    │
296        //                       inlined last byte (0x0C)  [0 0 0 0 1 1 0 0] ◀────────────────┤
297        //                       owned last byte (0x80)    [1 0 0 0 0 0 0 0] ◀────────────────┤
298        //                                                                                    │
299        //                       inlined last byte                                            │
300        //                       maximum of 23 (0x17)      [0 0 0 1 0 1 1 1] ◀────────────────┤
301        //                                                                                    │
302        //                       "owned" discriminant                                         │
303        //                       bitmask (any X bit)       [X X X ? ? ? ? ?] ◀────────────────┘
304        //
305        // As such, the length byte of an inlined string overlaps with the capacity field of an owned string.
306        // Additionally, due to the integer fields being written in little-endian order, the "upper" byte of the
307        // capacity field -- the eight highest bits -- is actually written in the same location as the length byte
308        // of an inlined string.
309        //
310        // Given that we know an inlined string cannot be any longer than 23 bytes, we know that the top-most bit in
311        // the last byte (data[23]) can never be set, as it would imply a length of _at least_ 128. With that, we
312        // utilize invariant #3 of `Inner` -- allocations can never be larger than `isize::MAX` -- which lets us
313        // safely "tag" an owned string's capacity -- setting the upper most bit to 1 -- to indicate that it's an
314        // owned string.
315        //
316        // An inlined string couldn't possibly have a length that occupied that bit, and so we know if we're down
317        // here, and our pointer field isn't all zeroes and our length field isn't all zeroes, that we _have_ to be
318        // an owned string if our capacity is tagged.
319
320        // If the top bit is set, we know we're dealing with an owned string.
321        if is_tagged(self.tag_byte) {
322            return UnionType::Owned;
323        }
324
325        // The top-most bit has to be set for an owned string, but isn't set for any other type, so try differentiating
326        // at this point.
327        match self.tag_byte {
328            // Empty string. Easy.
329            0 => UnionType::Empty,
330
331            // Anything between 1 and INLINED_STR_MAX_LEN (23, inclusive) is an inlined string.
332            1..=INLINED_STR_MAX_LEN_U8 => UnionType::Inlined,
333
334            // These are fixed values between 24 and 128, so we just match them directly.
335            UNION_TYPE_TAG_VALUE_STATIC => UnionType::Static,
336            UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE => UnionType::InternedFixedSize,
337            UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP => UnionType::InternedGenericMap,
338            UNION_TYPE_TAG_VALUE_SHARED => UnionType::Shared,
339
340            // If we haven't matched any specific type tag value, then this is something else that we don't handle or
341            // know about... which we handle as just acting like we're an empty string for simplicity.
342            _ => UnionType::Empty,
343        }
344    }
345}
346
347/// The core data structure for holding all different string variants.
348///
349/// This union has six data fields -- one for each possible string variant -- and a discriminant field, used to
350/// determine which string variant is actually present. The discriminant field interrogates the bits in each machine
351/// word field (all variants are three machine words) to determine which bit patterns are valid or not for a given
352/// variant, allowing the string variant to be authoritatively determined.
353///
354/// # Invariants
355///
356/// This code depends on a number of invariants in order to work correctly:
357///
358/// 1. Only used on 64-bit little-endian platforms. (checked at compile-time via _INVARIANTS_CHECK)
359/// 2. The data pointers for `String` and `&'static str` cannot ever be null when the strings are non-empty.
360/// 3. Allocations can never be larger than `isize::MAX` (see [here][rust_isize_alloc_limit]), meaning that any
361///    length/capacity field for a string cannot ever be larger than `isize::MAX`, implying the 64th bit (top-most bit)
362///    for length/capacity should always be 0.
363/// 4. An inlined string can only hold up to 23 bytes of data, meaning that the length byte for that string can never
364///    have a value greater than 23. (_We_ have to provide this invariant, which is handled in `Inner::try_inlined`.)
365///
366/// [rust_isize_alloc_limit]: https://doc.rust-lang.org/stable/std/alloc/struct.Layout.html#method.from_size_align
367union Inner {
368    empty: EmptyUnion,
369    owned: OwnedUnion,
370    static_: StaticUnion,
371    interned: ManuallyDrop<InternedUnion>,
372    shared: ManuallyDrop<SharedUnion>,
373    inlined: InlinedUnion,
374    discriminant: DiscriminantUnion,
375}
376
377impl Inner {
378    const fn empty() -> Self {
379        Self {
380            empty: EmptyUnion::new(),
381        }
382    }
383
384    fn owned(value: String) -> Self {
385        match value.capacity() {
386            // The string we got is empty.
387            0 => Self::empty(),
388            cap => {
389                let mut value = value.into_bytes();
390
391                let ptr = value.as_mut_ptr();
392                let len = value.len();
393
394                // We're taking ownership of the underlying string allocation so we can't let it drop.
395                std::mem::forget(value);
396
397                Self {
398                    owned: OwnedUnion {
399                        ptr,
400                        len,
401                        cap: tag_cap(cap),
402                    },
403                }
404            }
405        }
406    }
407
408    const fn static_str(value: &'static str) -> Self {
409        match value.len() {
410            0 => Self::empty(),
411            _ => Self {
412                static_: StaticUnion {
413                    value,
414                    _cap: Tag::Static,
415                },
416            },
417        }
418    }
419
420    fn interned(value: InternedString) -> Self {
421        match value.len() {
422            0 => Self::empty(),
423            _len => {
424                let (state, tag) = match value.into_state() {
425                    InternedStringState::FixedSize(fixed_size) => (
426                        InternedStateUnion {
427                            fixed_size: ManuallyDrop::new(fixed_size),
428                        },
429                        Tag::InternedFixedSize,
430                    ),
431                    InternedStringState::GenericMap(generic_map) => (
432                        InternedStateUnion {
433                            generic_map: ManuallyDrop::new(generic_map),
434                        },
435                        Tag::InternedGenericMap,
436                    ),
437                };
438
439                Self {
440                    interned: ManuallyDrop::new(InternedUnion { state, _cap: tag }),
441                }
442            }
443        }
444    }
445
446    fn shared(value: Arc<str>) -> Self {
447        match value.len() {
448            0 => Self::empty(),
449            _len => Self {
450                shared: ManuallyDrop::new(SharedUnion {
451                    // SAFETY: We know `ptr` is non-null because `Arc::into_raw` is called on a valid `Arc<str>`.
452                    ptr: unsafe { NonNull::new_unchecked(Arc::into_raw(value).cast_mut()) },
453                    _cap: Tag::Shared,
454                }),
455            },
456        }
457    }
458
459    fn try_inlined(value: &str) -> Option<Self> {
460        match value.len() {
461            0 => Some(Self::empty()),
462            len => {
463                if len > INLINED_STR_MAX_LEN {
464                    return None;
465                }
466
467                let mut data = [0; INLINED_STR_DATA_BUF_LEN];
468
469                // SAFETY: We know it fits because we just checked that the string length is 23 or less.
470                data[INLINED_STR_MAX_LEN] = len as u8;
471
472                let buf = value.as_bytes();
473                data[0..len].copy_from_slice(buf);
474
475                Some(Self {
476                    inlined: InlinedUnion { data },
477                })
478            }
479        }
480    }
481
482    #[inline]
483    fn as_str(&self) -> &str {
484        let union_type = unsafe { self.discriminant.get_union_type() };
485        match union_type {
486            UnionType::Empty => "",
487            UnionType::Owned => {
488                let owned = unsafe { &self.owned };
489                owned.as_str()
490            }
491            UnionType::Static => {
492                let static_ = unsafe { &self.static_ };
493                static_.as_str()
494            }
495            UnionType::InternedFixedSize => {
496                let interned = unsafe { &self.interned.state.fixed_size };
497                interned.as_str()
498            }
499            UnionType::InternedGenericMap => {
500                let interned = unsafe { &self.interned.state.generic_map };
501                interned.as_str()
502            }
503            UnionType::Shared => {
504                let shared = unsafe { &self.shared };
505                shared.as_str()
506            }
507            UnionType::Inlined => {
508                let inlined = unsafe { &self.inlined };
509                inlined.as_str()
510            }
511        }
512    }
513
514    #[inline]
515    const fn get_union_type(&self) -> UnionType {
516        unsafe { self.discriminant.get_union_type() }
517    }
518
519    fn into_owned(mut self) -> String {
520        let union_type = unsafe { self.discriminant.get_union_type() };
521        match union_type {
522            UnionType::Empty => String::new(),
523            UnionType::Owned => {
524                // We're (`Inner`) being consumed here, but we need to update our internal state to ensure that our drop
525                // logic doesn't try to double free the string allocation.
526                let owned = unsafe { self.owned };
527                self.empty = EmptyUnion::new();
528
529                owned.into_owned()
530            }
531            UnionType::Static => {
532                let static_ = unsafe { self.static_ };
533                static_.as_str().to_owned()
534            }
535            UnionType::InternedFixedSize => {
536                let interned = unsafe { &self.interned.state.fixed_size };
537                interned.as_str().to_owned()
538            }
539            UnionType::InternedGenericMap => {
540                let interned = unsafe { &self.interned.state.generic_map };
541                interned.as_str().to_owned()
542            }
543            UnionType::Shared => {
544                let shared = unsafe { &self.shared };
545                shared.as_str().to_owned()
546            }
547            UnionType::Inlined => {
548                let inlined = unsafe { self.inlined };
549                inlined.as_str().to_owned()
550            }
551        }
552    }
553}
554
555impl Drop for Inner {
556    fn drop(&mut self) {
557        let union_type = unsafe { self.discriminant.get_union_type() };
558        match union_type {
559            UnionType::Owned => {
560                let owned = unsafe { &mut self.owned };
561
562                let ptr = owned.ptr;
563                let len = owned.len;
564                let cap = untag_cap(owned.cap);
565
566                // SAFETY: The pointer has to be non-null, because we only ever construct an owned variant when the
567                // `String` has a non-zero capacity, which implies a valid allocation, and thus a valid, non-null
568                // pointer.
569                let data = unsafe { Vec::<u8>::from_raw_parts(ptr, len, cap) };
570                drop(data);
571            }
572            UnionType::InternedFixedSize | UnionType::InternedGenericMap => {
573                let interned = unsafe { &mut self.interned };
574
575                // SAFETY: We're already dropping `Inner`, so nothing else can use the `InternedString` value after we
576                // consume it here.
577                let data = unsafe { ManuallyDrop::take(interned) };
578                drop(data);
579            }
580            UnionType::Shared => {
581                let shared = unsafe { &mut self.shared };
582
583                // Decrement the strong count before we drop, ensuring the `Arc` has a chance to clean itself up if this
584                // is the less strong reference.
585                //
586                // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`, so it's valid to decrement on. We
587                // also know the backing storage is still live because it has to be by virtue of us being here.
588                unsafe {
589                    Arc::decrement_strong_count(shared.ptr.as_ptr().cast_const());
590                }
591            }
592            _ => {}
593        }
594    }
595}
596
597impl Clone for Inner {
598    fn clone(&self) -> Self {
599        let union_type = unsafe { self.discriminant.get_union_type() };
600        match union_type {
601            UnionType::Empty => Self::empty(),
602            UnionType::Owned => {
603                let owned = unsafe { self.owned };
604                let s = owned.as_str();
605
606                // We specifically try to inline here.
607                //
608                // At a high-level, when we're _given_ an owned string, we avoid inlining because we don't want to trash
609                // the underlying allocation since we may be asked to give it back later (via `MetaString::into_owned`).
610                // However, when we're _cloning_, we'd rather avoiding allocating if we can help it.
611                Self::try_inlined(s).unwrap_or_else(|| Self::owned(s.to_owned()))
612            }
613            UnionType::Static => Self {
614                static_: unsafe { self.static_ },
615            },
616            UnionType::InternedFixedSize | UnionType::InternedGenericMap => Self {
617                interned: unsafe { self.interned.clone() },
618            },
619            UnionType::Shared => {
620                let shared = unsafe { &self.shared };
621
622                // We have to increment the strong count before cloning.
623                //
624                // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`. We also know that if we're cloning
625                // this value, that the underlying `Arc` must still be live, since we're holding a reference to it.
626                unsafe {
627                    Arc::increment_strong_count(shared.ptr.as_ptr().cast_const());
628                }
629
630                Self {
631                    shared: ManuallyDrop::new(SharedUnion {
632                        ptr: shared.ptr,
633                        _cap: Tag::Shared,
634                    }),
635                }
636            }
637            UnionType::Inlined => Self {
638                inlined: unsafe { self.inlined },
639            },
640        }
641    }
642}
643
644// SAFETY: None of our union variants are tied to the original thread they were created on.
645unsafe impl Send for Inner {}
646
647// SAFETY: None of our union variants use any form of interior mutability and are thus safe to be shared between
648// threads.
649unsafe impl Sync for Inner {}
650
651/// An immutable string type that abstracts over various forms of string storage.
652///
653/// Normally, developers will work with either `String` (owned) or `&str` (borrowed) when dealing with strings. In some
654/// cases, though, it can be useful to work with strings that use alternative storage, such as those that are atomically
655/// shared (e.g. `InternedString`). While using those string types themselves isn't complex, using them and _also_
656/// supporting normal string types can be complex.
657///
658/// `MetaString` is an opinionated string type that abstracts over the normal string types like `String` and `&str`
659/// while also supporting alternative storage, such as interned strings from `FixedSizeInterner`, unifying these
660/// different variants behind a single concrete type.
661///
662/// ## Supported types
663///
664/// `MetaString` supports the following "variants":
665///
666/// - owned (`String` and non-inlineable `&str`)
667/// - static (`&'static str`)
668/// - interned (`InternedString`)
669/// - shared (`Arc<str>`)
670/// - inlined (up to 23 bytes)
671///
672/// ### Owned and borrowed strings
673///
674/// `MetaString` can be created from `String` and `&str` directly. For owned scenarios (`String`), the string value is
675/// simply wrapped. For borrowed strings, we attempt to inline them (see more below) or, if they cannot be inlined, they
676/// are copied into a new `String`.
677///
678/// ### Static strings
679///
680/// `MetaString` can be created from `&'static str` directly. This is useful for string literals and other static
681/// strings that are too large to be inlined.
682///
683/// ### Interned strings
684///
685/// `MetaString` can also be created from `InternedString`, which is a string that has been interned (using an interner
686/// like [`FixedSizeInterner`][crate::interning::FixedSizeInterner] or
687/// [`GenericMapInterner`][crate::interning::GenericMapInterner]). Interned strings are essentially a combination of the
688/// properties of `Arc<T>` -- owned wrappers around an atomically reference counted piece of data -- and a fixed-size
689/// buffer, where we allocate one large buffer, and write many small strings into it, and provide references to those
690/// strings through `InternedString`.
691///
692/// ### Shared strings
693///
694/// `MetaString` can be created from `Arc<str>`, which is a string slice that can be atomically shared between threads.
695/// This is a simpler version of interned strings where strict memory control and re-use is not required.
696///
697/// ### Inlined strings
698///
699/// Finally, `MetaString` can also be created by inlining small strings into `MetaString` itself, avoiding the need for
700/// any backing allocation. "Small string optimization" is a common optimization for string types where small strings
701/// can be stored directly in a string type itself by utilizing a "union"-style layout.
702///
703/// As `MetaString` utilizes such a layout, we can provide a small string optimization that allows for strings up to 23
704/// bytes in length.
705///
706/// ## Conversion methods
707///
708/// Implementations of `From<T>` exist for all of the aforementioned types to allow for easily converting to
709/// `MetaString`. Once a caller has a `MetaString` value, they are generally expected to interact with the string in a
710/// read-only way, as `MetaString` can be dereferenced directly to `&str`.
711///
712/// If a caller needs to be able to modify the string data, they can call `into_owned` to get an owned version of the
713/// string, make their modifications to the owned version, and then convert that back to `MetaString`.
714#[derive(Clone)]
715pub struct MetaString {
716    inner: Inner,
717}
718
719impl MetaString {
720    /// Creates an empty `MetaString`.
721    ///
722    /// This does not allocate.
723    pub const fn empty() -> Self {
724        Self { inner: Inner::empty() }
725    }
726
727    /// Creates a new `MetaString` from the given static string.
728    ///
729    /// This does not allocate.
730    pub const fn from_static(s: &'static str) -> Self {
731        Self {
732            inner: Inner::static_str(s),
733        }
734    }
735
736    /// Attempts to create a new `MetaString` from the given string if it can be inlined.
737    pub fn try_inline(s: &str) -> Option<Self> {
738        Inner::try_inlined(s).map(|inner| Self { inner })
739    }
740
741    /// Returns `true` if `self` has a length of zero bytes.
742    pub fn is_empty(&self) -> bool {
743        self.deref().is_empty()
744    }
745
746    /// Returns `true` if `self` can be cheaply cloned.
747    pub const fn is_cheaply_cloneable(&self) -> bool {
748        // If we're wrapping an owned string, cloning means cloning that allocation. All other types are cheap to clone.
749        !self.inner.get_union_type().is_owned()
750    }
751
752    /// Consumes `self` and returns an owned `String`.
753    ///
754    /// If the `MetaString` is already owned, this will simply return the inner `String` directly. Otherwise, this will
755    /// allocate an owned version (`String`) of the string data.
756    pub fn into_owned(self) -> String {
757        self.inner.into_owned()
758    }
759}
760
761impl Default for MetaString {
762    fn default() -> Self {
763        Self::empty()
764    }
765}
766
767impl hash::Hash for MetaString {
768    fn hash<H: hash::Hasher>(&self, state: &mut H) {
769        self.deref().hash(state)
770    }
771}
772
773impl PartialEq<str> for MetaString {
774    fn eq(&self, other: &str) -> bool {
775        self.deref() == other
776    }
777}
778
779impl PartialEq<&str> for MetaString {
780    fn eq(&self, other: &&str) -> bool {
781        self.deref() == *other
782    }
783}
784
785impl PartialEq<MetaString> for MetaString {
786    fn eq(&self, other: &MetaString) -> bool {
787        self.deref() == other.deref()
788    }
789}
790
791impl Eq for MetaString {}
792
793impl PartialOrd for MetaString {
794    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
795        Some(self.cmp(other))
796    }
797}
798
799impl Ord for MetaString {
800    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
801        self.deref().cmp(other.deref())
802    }
803}
804
805impl Borrow<str> for MetaString {
806    fn borrow(&self) -> &str {
807        self.deref()
808    }
809}
810
811impl Serialize for MetaString {
812    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
813    where
814        S: serde::Serializer,
815    {
816        serializer.serialize_str(self.deref())
817    }
818}
819
820impl From<String> for MetaString {
821    fn from(s: String) -> Self {
822        Self { inner: Inner::owned(s) }
823    }
824}
825
826impl From<&str> for MetaString {
827    fn from(s: &str) -> Self {
828        Self::try_inline(s).unwrap_or_else(|| Self::from(s.to_owned()))
829    }
830}
831
832impl From<InternedString> for MetaString {
833    fn from(s: InternedString) -> Self {
834        Self {
835            inner: Inner::interned(s),
836        }
837    }
838}
839
840impl From<Arc<str>> for MetaString {
841    fn from(s: Arc<str>) -> Self {
842        Self {
843            inner: Inner::shared(s),
844        }
845    }
846}
847
848impl From<MetaString> for protobuf::Chars {
849    fn from(value: MetaString) -> Self {
850        value.into_owned().into()
851    }
852}
853
854impl<'a> From<&'a MetaString> for protobuf::Chars {
855    fn from(value: &'a MetaString) -> Self {
856        // TODO: We're foregoing additional code/complexity to recover a static reference, when our string storage is
857        // static, in order to optimize the conversion to `Bytes` and then `Chars`.
858        //
859        // Static strings being written to Protocol Buffers should be decently rare across the codebase, so no biggie
860        // for now.
861        value.deref().into()
862    }
863}
864
865impl Deref for MetaString {
866    type Target = str;
867
868    fn deref(&self) -> &str {
869        self.inner.as_str()
870    }
871}
872
873impl AsRef<str> for MetaString {
874    fn as_ref(&self) -> &str {
875        self.deref()
876    }
877}
878
879impl fmt::Debug for MetaString {
880    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
881        self.deref().fmt(f)
882    }
883}
884
885impl fmt::Display for MetaString {
886    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
887        self.deref().fmt(f)
888    }
889}
890
891#[cfg(test)]
892mod tests {
893    use std::{num::NonZeroUsize, sync::Arc};
894
895    use proptest::{prelude::*, proptest};
896
897    use super::{interning::GenericMapInterner, InlinedUnion, Inner, MetaString, UnionType};
898    use crate::interning::{FixedSizeInterner, Interner as _};
899
900    #[test]
901    fn struct_sizes() {
902        // Overall, we expect `MetaString`, and thus `Inner`, to always be three machine words.
903        assert_eq!(std::mem::size_of::<MetaString>(), std::mem::size_of::<usize>() * 3);
904        assert_eq!(std::mem::size_of::<Inner>(), std::mem::size_of::<usize>() * 3);
905
906        // We also expect all of inlined union variant to be the exact size of `Inner`, which means we're properly
907        // maximizing the available space for inlining.
908        assert_eq!(std::mem::size_of::<InlinedUnion>(), std::mem::size_of::<Inner>());
909    }
910
911    #[test]
912    fn static_str_inlineable() {
913        // We always use the static variant, even if the string is inlineable, because this lets us make `from_static` const.
914        let s = "hello";
915        let meta = MetaString::from_static(s);
916
917        assert_eq!(meta.inner.get_union_type(), UnionType::Static);
918        assert_eq!(s, &*meta);
919        assert_eq!(s, meta.into_owned());
920    }
921
922    #[test]
923    fn static_str_not_inlineable() {
924        let s = "hello there, world! it's me, margaret!";
925        let meta = MetaString::from_static(s);
926
927        assert_eq!(s, &*meta);
928        assert_eq!(meta.inner.get_union_type(), UnionType::Static);
929        assert_eq!(s, meta.into_owned());
930    }
931
932    #[test]
933    fn owned_string() {
934        let s_orig = "hello";
935        let s = String::from(s_orig);
936        let meta = MetaString::from(s);
937
938        assert_eq!(s_orig, &*meta);
939        assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
940        assert_eq!(s_orig, meta.into_owned());
941    }
942
943    #[test]
944    fn inlined_string() {
945        let s = "hello";
946        let meta = MetaString::from(s);
947
948        assert_eq!(s, &*meta);
949        assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
950        assert_eq!(s, meta.into_owned());
951    }
952
953    #[test]
954    fn interned_string() {
955        let intern_str = "hello interned str!";
956
957        let fs_interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(1024).unwrap());
958        let s = fs_interner.try_intern(intern_str).unwrap();
959        assert_eq!(intern_str, &*s);
960
961        let meta = MetaString::from(s);
962        assert_eq!(intern_str, &*meta);
963        assert_eq!(meta.inner.get_union_type(), UnionType::InternedFixedSize);
964        assert_eq!(intern_str, meta.into_owned());
965
966        let gm_interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
967        let s = gm_interner.try_intern(intern_str).unwrap();
968        assert_eq!(intern_str, &*s);
969
970        let meta = MetaString::from(s);
971        assert_eq!(intern_str, &*meta);
972        assert_eq!(meta.inner.get_union_type(), UnionType::InternedGenericMap);
973        assert_eq!(intern_str, meta.into_owned());
974    }
975
976    #[test]
977    fn shared_string() {
978        let shared_str = "hello shared str!";
979
980        let s = Arc::<str>::from(shared_str);
981        assert_eq!(shared_str, &*s);
982
983        let meta = MetaString::from(s);
984        assert_eq!(shared_str, &*meta);
985        assert_eq!(meta.inner.get_union_type(), UnionType::Shared);
986        assert_eq!(shared_str, meta.into_owned());
987    }
988
989    #[test]
990    fn shared_string_clone() {
991        let shared_str = "hello shared str!";
992        let s = Arc::<str>::from(shared_str);
993        let meta = MetaString::from(s);
994
995        // Clone the `MetaString` to make sure we can still access the original string.
996        let meta2 = meta.clone();
997        assert_eq!(shared_str, &*meta2);
998
999        // Drop the original `MetaString` to ensure we can still access the string from our clone after going through
1000        // the drop logic for the shared variant.
1001        drop(meta);
1002        assert_eq!(shared_str, &*meta2);
1003    }
1004
1005    #[test]
1006    fn empty_string_shared() {
1007        let shared_str = "";
1008
1009        let s = Arc::<str>::from(shared_str);
1010        assert_eq!(shared_str, &*s);
1011
1012        let meta = MetaString::from(s);
1013        assert_eq!(shared_str, &*meta);
1014        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1015        assert_eq!(shared_str, meta.into_owned());
1016    }
1017
1018    #[test]
1019    fn empty_string_interned() {
1020        let intern_str = "";
1021
1022        let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1023        let s = interner.try_intern(intern_str).unwrap();
1024        assert_eq!(intern_str, &*s);
1025
1026        let meta = MetaString::from(s);
1027        assert_eq!(intern_str, &*meta);
1028        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1029        assert_eq!(intern_str, meta.into_owned());
1030    }
1031
1032    #[test]
1033    fn empty_string_static() {
1034        let s = "";
1035
1036        let meta = MetaString::from_static(s);
1037        assert_eq!(s, &*meta);
1038        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1039        assert_eq!(s, meta.into_owned());
1040    }
1041
1042    #[test]
1043    fn empty_string_inlined() {
1044        let s = "";
1045
1046        let meta = MetaString::try_inline(s).expect("empty string definitely 'fits'");
1047        assert_eq!(s, &*meta);
1048        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1049        assert_eq!(s, meta.into_owned());
1050    }
1051
1052    #[test]
1053    fn empty_string_owned() {
1054        // When a string has capacity, we don't care if it's actually empty or not, because we want to preserve the
1055        // allocation... so our string here is empty but _does_ have capacity.
1056        let s = String::with_capacity(32);
1057        let actual_cap = s.capacity();
1058
1059        let meta = MetaString::from(s);
1060        assert_eq!("", &*meta);
1061        assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1062
1063        let owned = meta.into_owned();
1064        assert_eq!(owned.capacity(), actual_cap);
1065        assert_eq!("", owned);
1066    }
1067
1068    #[test]
1069    fn empty_string_owned_zero_capacity() {
1070        // When a string has _no_ capacity, it's effectively empty, and we treat it that way.
1071        let s = String::new();
1072
1073        let meta = MetaString::from(s);
1074        assert_eq!("", &*meta);
1075        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1076        assert_eq!("", meta.into_owned());
1077    }
1078
1079    #[test]
1080    fn test_is_cheaply_cloneable() {
1081        // Owned strings are never cheap to clone.
1082        let s =
1083            String::from("big ol' stringy string that can't be inlined and lives out its bleak existence in the heap");
1084        let ms = MetaString::from(s);
1085        assert!(!ms.is_cheaply_cloneable());
1086
1087        // Empty strings are always cheap to clone.
1088        let ms = MetaString::empty();
1089        assert!(ms.is_cheaply_cloneable());
1090
1091        // Inlined strings are always cheap to clone.
1092        let s = "hello";
1093        let ms = MetaString::try_inline(s).expect("inlined string should fit");
1094        assert!(ms.is_cheaply_cloneable());
1095
1096        // Static strings are always cheap to clone.
1097        let s = "hello there, world! it's me, margaret!";
1098        let ms = MetaString::from_static(s);
1099        assert!(ms.is_cheaply_cloneable());
1100
1101        // Interned strings are always cheap to clone.
1102        let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1103        let s = interner.try_intern("hello interned str!").unwrap();
1104        let ms = MetaString::from(s);
1105        assert!(ms.is_cheaply_cloneable());
1106
1107        // Shared strings are always cheap to clone.
1108        let s = Arc::from("hello shared str!");
1109        let ms = MetaString::from(s);
1110        assert!(ms.is_cheaply_cloneable());
1111    }
1112
1113    fn arb_unicode_str_max_len(max_len: usize) -> impl Strategy<Value = String> {
1114        ".{0,23}".prop_filter("resulting string is too longer", move |s| s.len() <= max_len)
1115    }
1116
1117    proptest! {
1118        #![proptest_config(ProptestConfig::with_cases(10000))]
1119
1120        #[test]
1121        #[cfg_attr(miri, ignore)]
1122        fn property_test_inlined_string(
1123            input in arb_unicode_str_max_len(23),
1124        ) {
1125            assert!(input.len() <= 23, "input should be 23 bytes or less");
1126            let meta = MetaString::try_inline(&input).expect("input should fit");
1127
1128            if input.is_empty() {
1129                assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1130            } else {
1131                assert_eq!(input, &*meta);
1132                assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
1133            }
1134        }
1135
1136        #[test]
1137        #[cfg_attr(miri, ignore)]
1138        fn property_test_owned_string(
1139            input in ".*",
1140        ) {
1141            let is_empty = input.is_empty();
1142            let meta = MetaString::from(input);
1143
1144            if is_empty {
1145                assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1146            } else {
1147                assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1148            }
1149        }
1150    }
1151}