Skip to main content

stringtheory/
lib.rs

1//! Sharing-optimized strings and string interning utilities.
2//!
3//! `stringtheory` provides two main components: a sharing-optimized string type, `MetaString`, and string interning
4//! implementations (`FixedSizeInterner`, `GenericMapInterner`, etc). These components are meant to work in concert,
5//! allowing for using a single string type that can handle owned, shared, and interned strings, and providing a way to
6//! efficiently intern strings strings when possible.
7#![deny(warnings)]
8#![deny(missing_docs)]
9// We only support 64-bit little-endian platforms anyways, so there's no risk of our enum variants having their values truncated.
10#![allow(clippy::enum_clike_unportable_variant)]
11
12use std::{
13    borrow::Borrow, fmt, hash, mem::ManuallyDrop, ops::Deref, ptr::NonNull, slice::from_raw_parts,
14    str::from_utf8_unchecked, sync::Arc,
15};
16
17use serde::{Deserialize, Serialize};
18
19mod clone;
20
21pub use self::clone::CheapMetaString;
22
23pub mod interning;
24use self::interning::{InternedString, InternedStringState, Interner};
25
26const ZERO_VALUE: usize = 0;
27const TOP_MOST_BIT: usize = usize::MAX & !(isize::MAX as usize);
28const INLINED_STR_DATA_BUF_LEN: usize = std::mem::size_of::<usize>() * 3;
29const INLINED_STR_MAX_LEN: usize = INLINED_STR_DATA_BUF_LEN - 1;
30const INLINED_STR_MAX_LEN_U8: u8 = INLINED_STR_MAX_LEN as u8;
31
32const UNION_TYPE_TAG_VALUE_STATIC: u8 = get_offset_tag_value(0);
33const UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE: u8 = get_offset_tag_value(1);
34const UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP: u8 = get_offset_tag_value(2);
35const UNION_TYPE_TAG_VALUE_SHARED: u8 = get_offset_tag_value(3);
36
37const fn get_offset_tag_value(tag: u8) -> u8 {
38    const UNION_TYPE_TAG_VALUE_BASE: u8 = INLINED_STR_MAX_LEN as u8 + 1;
39
40    if tag > (u8::MAX - INLINED_STR_MAX_LEN as u8) {
41        panic!("Union type tag value must be less than 232 to fit.");
42    }
43
44    tag + UNION_TYPE_TAG_VALUE_BASE
45}
46
47const fn get_scaled_union_tag(tag: u8) -> usize {
48    // We want to shift our tag value (single byte) to make it the top most byte in a `usize`.
49    const UNION_TYPE_TAG_VALUE_SHIFT: u32 = (std::mem::size_of::<usize>() as u32 - 1) * 8;
50
51    (tag as usize) << UNION_TYPE_TAG_VALUE_SHIFT
52}
53
54// High-level invariant checks to ensure `stringtheory` isn't being used on an unsupported platform.
55#[cfg(not(all(target_pointer_width = "64", target_endian = "little")))]
56const _INVARIANTS_CHECK: () = {
57    compile_error!("`stringtheory` is only supported on 64-bit little-endian platforms.");
58};
59
60const fn is_tagged(cap: u8) -> bool {
61    cap & 0b10000000 != 0
62}
63
64const fn tag_cap(cap: usize) -> usize {
65    cap | TOP_MOST_BIT
66}
67
68const fn untag_cap(cap: usize) -> usize {
69    cap & !(TOP_MOST_BIT)
70}
71
72#[derive(Clone, Copy, Eq, PartialEq)]
73#[repr(usize)]
74enum Zero {
75    Zero = ZERO_VALUE,
76}
77
78#[derive(Clone, Copy, Debug, Eq, PartialEq)]
79#[repr(usize)]
80enum Tag {
81    Static = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_STATIC),
82    InternedFixedSize = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE),
83    InternedGenericMap = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP),
84    Shared = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_SHARED),
85}
86
87#[repr(C)]
88#[derive(Clone, Copy)]
89struct EmptyUnion {
90    ptr: Zero, // Field one.
91    len: Zero, // Field two.
92    cap: Zero, // Field three.
93}
94
95impl EmptyUnion {
96    const fn new() -> Self {
97        Self {
98            ptr: Zero::Zero,
99            len: Zero::Zero,
100            cap: Zero::Zero,
101        }
102    }
103}
104
105#[repr(C)]
106#[derive(Clone, Copy)]
107struct OwnedUnion {
108    ptr: *mut u8, // Field one
109    len: usize,   // Field two.
110    cap: usize,   // Field three.
111}
112
113impl OwnedUnion {
114    #[inline]
115    fn as_str(&self) -> &str {
116        // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
117        // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
118        unsafe { from_utf8_unchecked(from_raw_parts(self.ptr, self.len)) }
119    }
120
121    fn into_owned(self) -> String {
122        // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
123        // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
124        unsafe { String::from_raw_parts(self.ptr, self.len, untag_cap(self.cap)) }
125    }
126}
127
128#[repr(C)]
129#[derive(Clone, Copy)]
130struct StaticUnion {
131    value: &'static str, // Fields one and two.
132    _cap: Tag,           // Field three.
133}
134
135impl StaticUnion {
136    #[inline]
137    const fn as_str(&self) -> &str {
138        self.value
139    }
140}
141
142#[repr(C)]
143struct InternedUnion {
144    state: InternedStateUnion, // Fields one and two.
145    _cap: Tag,                 // Field three.
146}
147
148impl Drop for InternedUnion {
149    fn drop(&mut self) {
150        match self._cap {
151            Tag::InternedFixedSize => {
152                let state = unsafe { ManuallyDrop::take(&mut self.state.fixed_size) };
153                drop(state);
154            }
155            Tag::InternedGenericMap => {
156                let state = unsafe { ManuallyDrop::take(&mut self.state.generic_map) };
157                drop(state);
158            }
159            _ => unreachable!(),
160        }
161    }
162}
163
164impl Clone for InternedUnion {
165    fn clone(&self) -> Self {
166        // SAFETY: We use our tag (stored in `_cap`) to determine which field to access and clone, which
167        // ensures we only access fields which are initialized.
168        let state = unsafe {
169            match self._cap {
170                Tag::InternedFixedSize => InternedStateUnion {
171                    fixed_size: self.state.fixed_size.clone(),
172                },
173                Tag::InternedGenericMap => InternedStateUnion {
174                    generic_map: self.state.generic_map.clone(),
175                },
176                _ => unreachable!(),
177            }
178        };
179
180        Self { state, _cap: self._cap }
181    }
182}
183
184#[repr(C)]
185union InternedStateUnion {
186    fixed_size: ManuallyDrop<self::interning::fixed_size::StringState>,
187    generic_map: ManuallyDrop<self::interning::map::StringState>,
188}
189
190#[repr(C)]
191struct SharedUnion {
192    ptr: NonNull<str>, // Fields one and two
193    _cap: Tag,         // Field three.
194}
195
196impl SharedUnion {
197    #[inline]
198    const fn as_str(&self) -> &str {
199        // SAFETY: The pointee is still live by virtue of being held in an `Arc`.
200        unsafe { self.ptr.as_ref() }
201    }
202}
203
204#[repr(C)]
205#[derive(Clone, Copy)]
206struct InlinedUnion {
207    // Data is arranged as 23 bytes for string data, and the remaining 1 byte for the string length.
208    data: [u8; INLINED_STR_DATA_BUF_LEN], // Fields one, two, and three.
209}
210
211impl InlinedUnion {
212    #[inline]
213    fn as_str(&self) -> &str {
214        let len = self.data[INLINED_STR_MAX_LEN] as usize;
215
216        // SAFETY: We know our data is valid UTF-8 since we only ever derive inlined strings from a valid string
217        // reference.
218        unsafe { from_utf8_unchecked(&self.data[0..len]) }
219    }
220}
221
222#[repr(C)]
223#[derive(Clone, Copy)]
224struct DiscriminantUnion {
225    unused: [u8; INLINED_STR_DATA_BUF_LEN - 1], // Fields one, two, and three, minus the last byte.
226    tag_byte: u8,                               // Last byte, overlapped with the "highest" byte of field three.
227}
228
229#[derive(Debug, Eq, PartialEq)]
230enum UnionType {
231    Empty,
232    Owned,
233    Static,
234    InternedFixedSize,
235    InternedGenericMap,
236    Inlined,
237    Shared,
238}
239
240impl UnionType {
241    #[inline]
242    const fn is_owned(&self) -> bool {
243        matches!(self, UnionType::Owned)
244    }
245}
246
247impl DiscriminantUnion {
248    #[inline]
249    const fn get_union_type(&self) -> UnionType {
250        // At a high level, we encode the type of the union into the last byte of the struct, which overlaps with the
251        // capacity of owned strings, the length of an inlined string, and the unused field of other string types.
252        //
253        // Our logic is simple here:
254        //
255        // - Allocations can only ever be as large as `isize::MAX`, which means that the top-most bit of the capacity
256        //   value would never be used for a valid allocation.
257        // - In turn, the length of a string can also only ever be as large as `isize::MAX`, which means that the
258        //   top-most bit of the length byte for any string type would never be used
259        // - Inlined strings can only ever be up to 23 bytes long, which means that the top-most bit of the length byte
260        //   for an inlined string would never be used.
261        // - Static strings and interned strings only occupy the first two fields, which means their capacity should not
262        //   be used.
263        //
264        // As such, we encode the possible string types as follows:
265        //
266        // - when all fields are zero, we have an empty string
267        // - when the last byte does have the top-bit set, we have an owned string
268        // - when the last byte does _not_ have the top-bit set, and the value is less than or equal to 23, we have an
269        //   inlined string
270        // - when the last byte does _not_ have the top-bit set, and the value is greater than 23, we interpret the
271        //   specific value of the last byte as a discriminant for the remaining string types (static, interned, etc)
272        //
273        // We abuse the layout of little endian integers here to ensure that the upper-most bits of the capacity
274        // field overlaps with the last byte of an inlined string, and the unused field of a static/interned string.
275        //
276        // In an inlined string, its length byte is situated as the very last byte (data[23]) in its layout. In an
277        // owned string, we instead have a pointer, length, and capacity, in that order. This means that the length
278        // byte of the inlined string and the capacity field of the owned string overlap, as seen below:
279        //
280        //                ~ an inlined string, "hello, world", with a length of 12 (0C) ~
281        //      ┌───────────────────────────────────────────────────────────────────────────────┐
282        //      │ 68 65 6C 6C 6F 20 77 6F    72 6C 64 21 ?? ?? ?? ??    ?? ?? ?? ?? ?? ?? ?? 0C │
283        //      └───────────────────────────────────────────────────────────────────────────────┘
284        //                                                                                    ▲
285        //                                                                                    ├──── overlapping
286        //                          ~ an owned string with a capacity of 64 ~                 ▼         byte
287        //      ┌─────────────────────────┐┌─────────────────────────┐┌─────────────────────────┐
288        //      │ ?? ?? ?? ?? ?? ?? ?? ?? ││ ?? ?? ?? ?? ?? ?? ?? ?? ││ 40 00 00 00 00 00 00 80 │
289        //      └─────────────────────────┘└─────────────────────────┘└─────────────────────────┘
290        //                                                                                    ▲
291        //                 original capacity: 64                  (0x0000000000000040)        │
292        //                 "tagged" capacity: 9223372036854775872 (0x8000000000000040)        │
293        //                                                           ▲                        │
294        //                     (tag bit) ────────────────────────────┘                        │
295        //                                                                                    │
296        //                                                                                    │
297        //                       inlined last byte (0x0C)  [0 0 0 0 1 1 0 0] ◀────────────────┤
298        //                       owned last byte (0x80)    [1 0 0 0 0 0 0 0] ◀────────────────┤
299        //                                                                                    │
300        //                       inlined last byte                                            │
301        //                       maximum of 23 (0x17)      [0 0 0 1 0 1 1 1] ◀────────────────┤
302        //                                                                                    │
303        //                       "owned" discriminant                                         │
304        //                       bitmask (any X bit)       [X X X ? ? ? ? ?] ◀────────────────┘
305        //
306        // As such, the length byte of an inlined string overlaps with the capacity field of an owned string.
307        // Additionally, due to the integer fields being written in little-endian order, the "upper" byte of the
308        // capacity field -- the eight highest bits -- is actually written in the same location as the length byte
309        // of an inlined string.
310        //
311        // Given that we know an inlined string cannot be any longer than 23 bytes, we know that the top-most bit in
312        // the last byte (data[23]) can never be set, as it would imply a length of _at least_ 128. With that, we
313        // utilize invariant #3 of `Inner` -- allocations can never be larger than `isize::MAX` -- which lets us
314        // safely "tag" an owned string's capacity -- setting the upper most bit to 1 -- to indicate that it's an
315        // owned string.
316        //
317        // An inlined string couldn't possibly have a length that occupied that bit, and so we know if we're down
318        // here, and our pointer field isn't all zeroes and our length field isn't all zeroes, that we _have_ to be
319        // an owned string if our capacity is tagged.
320
321        // If the top bit is set, we know we're dealing with an owned string.
322        if is_tagged(self.tag_byte) {
323            return UnionType::Owned;
324        }
325
326        // The top-most bit has to be set for an owned string, but isn't set for any other type, so try differentiating
327        // at this point.
328        match self.tag_byte {
329            // Empty string. Easy.
330            0 => UnionType::Empty,
331
332            // Anything between 1 and INLINED_STR_MAX_LEN (23, inclusive) is an inlined string.
333            1..=INLINED_STR_MAX_LEN_U8 => UnionType::Inlined,
334
335            // These are fixed values between 24 and 128, so we just match them directly.
336            UNION_TYPE_TAG_VALUE_STATIC => UnionType::Static,
337            UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE => UnionType::InternedFixedSize,
338            UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP => UnionType::InternedGenericMap,
339            UNION_TYPE_TAG_VALUE_SHARED => UnionType::Shared,
340
341            // If we haven't matched any specific type tag value, then this is something else that we don't handle or
342            // know about... which we handle as just acting like we're an empty string for simplicity.
343            _ => UnionType::Empty,
344        }
345    }
346}
347
348/// The core data structure for holding all different string variants.
349///
350/// This union has six data fields -- one for each possible string variant -- and a discriminant field, used to
351/// determine which string variant is actually present. The discriminant field interrogates the bits in each machine
352/// word field (all variants are three machine words) to determine which bit patterns are valid or not for a given
353/// variant, allowing the string variant to be authoritatively determined.
354///
355/// # Invariants
356///
357/// This code depends on a number of invariants in order to work correctly:
358///
359/// 1. Only used on 64-bit little-endian platforms. (checked at compile-time via _INVARIANTS_CHECK)
360/// 2. The data pointers for `String` and `&'static str` cannot ever be null when the strings are non-empty.
361/// 3. Allocations can never be larger than `isize::MAX` (see [here][rust_isize_alloc_limit]), meaning that any
362///    length/capacity field for a string cannot ever be larger than `isize::MAX`, implying the 64th bit (top-most bit)
363///    for length/capacity should always be 0.
364/// 4. An inlined string can only hold up to 23 bytes of data, meaning that the length byte for that string can never
365///    have a value greater than 23. (_We_ have to provide this invariant, which is handled in `Inner::try_inlined`.)
366///
367/// [rust_isize_alloc_limit]: https://doc.rust-lang.org/stable/std/alloc/struct.Layout.html#method.from_size_align
368union Inner {
369    empty: EmptyUnion,
370    owned: OwnedUnion,
371    static_: StaticUnion,
372    interned: ManuallyDrop<InternedUnion>,
373    shared: ManuallyDrop<SharedUnion>,
374    inlined: InlinedUnion,
375    discriminant: DiscriminantUnion,
376}
377
378impl Inner {
379    const fn empty() -> Self {
380        Self {
381            empty: EmptyUnion::new(),
382        }
383    }
384
385    fn owned(value: String) -> Self {
386        match value.capacity() {
387            // The string we got is empty.
388            0 => Self::empty(),
389            cap => {
390                let mut value = value.into_bytes();
391
392                let ptr = value.as_mut_ptr();
393                let len = value.len();
394
395                // We're taking ownership of the underlying string allocation so we can't let it drop.
396                std::mem::forget(value);
397
398                Self {
399                    owned: OwnedUnion {
400                        ptr,
401                        len,
402                        cap: tag_cap(cap),
403                    },
404                }
405            }
406        }
407    }
408
409    const fn static_str(value: &'static str) -> Self {
410        match value.len() {
411            0 => Self::empty(),
412            _ => Self {
413                static_: StaticUnion {
414                    value,
415                    _cap: Tag::Static,
416                },
417            },
418        }
419    }
420
421    fn interned(value: InternedString) -> Self {
422        match value.len() {
423            0 => Self::empty(),
424            _len => {
425                let (state, tag) = match value.into_state() {
426                    InternedStringState::FixedSize(fixed_size) => (
427                        InternedStateUnion {
428                            fixed_size: ManuallyDrop::new(fixed_size),
429                        },
430                        Tag::InternedFixedSize,
431                    ),
432                    InternedStringState::GenericMap(generic_map) => (
433                        InternedStateUnion {
434                            generic_map: ManuallyDrop::new(generic_map),
435                        },
436                        Tag::InternedGenericMap,
437                    ),
438                };
439
440                Self {
441                    interned: ManuallyDrop::new(InternedUnion { state, _cap: tag }),
442                }
443            }
444        }
445    }
446
447    fn shared(value: Arc<str>) -> Self {
448        match value.len() {
449            0 => Self::empty(),
450            _len => Self {
451                shared: ManuallyDrop::new(SharedUnion {
452                    // SAFETY: We know `ptr` is non-null because `Arc::into_raw` is called on a valid `Arc<str>`.
453                    ptr: unsafe { NonNull::new_unchecked(Arc::into_raw(value).cast_mut()) },
454                    _cap: Tag::Shared,
455                }),
456            },
457        }
458    }
459
460    fn try_inlined(value: &str) -> Option<Self> {
461        match value.len() {
462            0 => Some(Self::empty()),
463            len => {
464                if len > INLINED_STR_MAX_LEN {
465                    return None;
466                }
467
468                let mut data = [0; INLINED_STR_DATA_BUF_LEN];
469
470                // SAFETY: We know it fits because we just checked that the string length is 23 or less.
471                data[INLINED_STR_MAX_LEN] = len as u8;
472
473                let buf = value.as_bytes();
474                data[0..len].copy_from_slice(buf);
475
476                Some(Self {
477                    inlined: InlinedUnion { data },
478                })
479            }
480        }
481    }
482
483    #[inline]
484    fn as_str(&self) -> &str {
485        let union_type = unsafe { self.discriminant.get_union_type() };
486        match union_type {
487            UnionType::Empty => "",
488            UnionType::Owned => {
489                let owned = unsafe { &self.owned };
490                owned.as_str()
491            }
492            UnionType::Static => {
493                let static_ = unsafe { &self.static_ };
494                static_.as_str()
495            }
496            UnionType::InternedFixedSize => {
497                let interned = unsafe { &self.interned.state.fixed_size };
498                interned.as_str()
499            }
500            UnionType::InternedGenericMap => {
501                let interned = unsafe { &self.interned.state.generic_map };
502                interned.as_str()
503            }
504            UnionType::Shared => {
505                let shared = unsafe { &self.shared };
506                shared.as_str()
507            }
508            UnionType::Inlined => {
509                let inlined = unsafe { &self.inlined };
510                inlined.as_str()
511            }
512        }
513    }
514
515    #[inline]
516    const fn get_union_type(&self) -> UnionType {
517        unsafe { self.discriminant.get_union_type() }
518    }
519
520    fn into_owned(mut self) -> String {
521        let union_type = unsafe { self.discriminant.get_union_type() };
522        match union_type {
523            UnionType::Empty => String::new(),
524            UnionType::Owned => {
525                // We're (`Inner`) being consumed here, but we need to update our internal state to ensure that our drop
526                // logic doesn't try to double free the string allocation.
527                let owned = unsafe { self.owned };
528                self.empty = EmptyUnion::new();
529
530                owned.into_owned()
531            }
532            UnionType::Static => {
533                let static_ = unsafe { self.static_ };
534                static_.as_str().to_owned()
535            }
536            UnionType::InternedFixedSize => {
537                let interned = unsafe { &self.interned.state.fixed_size };
538                interned.as_str().to_owned()
539            }
540            UnionType::InternedGenericMap => {
541                let interned = unsafe { &self.interned.state.generic_map };
542                interned.as_str().to_owned()
543            }
544            UnionType::Shared => {
545                let shared = unsafe { &self.shared };
546                shared.as_str().to_owned()
547            }
548            UnionType::Inlined => {
549                let inlined = unsafe { self.inlined };
550                inlined.as_str().to_owned()
551            }
552        }
553    }
554}
555
556impl Drop for Inner {
557    fn drop(&mut self) {
558        let union_type = unsafe { self.discriminant.get_union_type() };
559        match union_type {
560            UnionType::Owned => {
561                let owned = unsafe { &mut self.owned };
562
563                let ptr = owned.ptr;
564                let len = owned.len;
565                let cap = untag_cap(owned.cap);
566
567                // SAFETY: The pointer has to be non-null, because we only ever construct an owned variant when the
568                // `String` has a non-zero capacity, which implies a valid allocation, and thus a valid, non-null
569                // pointer.
570                let data = unsafe { Vec::<u8>::from_raw_parts(ptr, len, cap) };
571                drop(data);
572            }
573            UnionType::InternedFixedSize | UnionType::InternedGenericMap => {
574                let interned = unsafe { &mut self.interned };
575
576                // SAFETY: We're already dropping `Inner`, so nothing else can use the `InternedString` value after we
577                // consume it here.
578                let data = unsafe { ManuallyDrop::take(interned) };
579                drop(data);
580            }
581            UnionType::Shared => {
582                let shared = unsafe { &mut self.shared };
583
584                // Decrement the strong count before we drop, ensuring the `Arc` has a chance to clean itself up if this
585                // is the less strong reference.
586                //
587                // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`, so it's valid to decrement on. We
588                // also know the backing storage is still live because it has to be by virtue of us being here.
589                unsafe {
590                    Arc::decrement_strong_count(shared.ptr.as_ptr().cast_const());
591                }
592            }
593            _ => {}
594        }
595    }
596}
597
598impl Clone for Inner {
599    fn clone(&self) -> Self {
600        let union_type = unsafe { self.discriminant.get_union_type() };
601        match union_type {
602            UnionType::Empty => Self::empty(),
603            UnionType::Owned => {
604                let owned = unsafe { self.owned };
605                let s = owned.as_str();
606
607                // We specifically try to inline here.
608                //
609                // At a high-level, when we're _given_ an owned string, we avoid inlining because we don't want to trash
610                // the underlying allocation since we may be asked to give it back later (via `MetaString::into_owned`).
611                // However, when we're _cloning_, we'd rather avoiding allocating if we can help it.
612                Self::try_inlined(s).unwrap_or_else(|| Self::owned(s.to_owned()))
613            }
614            UnionType::Static => Self {
615                static_: unsafe { self.static_ },
616            },
617            UnionType::InternedFixedSize | UnionType::InternedGenericMap => Self {
618                interned: unsafe { self.interned.clone() },
619            },
620            UnionType::Shared => {
621                let shared = unsafe { &self.shared };
622
623                // We have to increment the strong count before cloning.
624                //
625                // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`. We also know that if we're cloning
626                // this value, that the underlying `Arc` must still be live, since we're holding a reference to it.
627                unsafe {
628                    Arc::increment_strong_count(shared.ptr.as_ptr().cast_const());
629                }
630
631                Self {
632                    shared: ManuallyDrop::new(SharedUnion {
633                        ptr: shared.ptr,
634                        _cap: Tag::Shared,
635                    }),
636                }
637            }
638            UnionType::Inlined => Self {
639                inlined: unsafe { self.inlined },
640            },
641        }
642    }
643}
644
645// SAFETY: None of our union variants are tied to the original thread they were created on.
646unsafe impl Send for Inner {}
647
648// SAFETY: None of our union variants use any form of interior mutability and are thus safe to be shared between
649// threads.
650unsafe impl Sync for Inner {}
651
652/// An immutable string type that abstracts over various forms of string storage.
653///
654/// Normally, developers will work with either `String` (owned) or `&str` (borrowed) when dealing with strings. In some
655/// cases, though, it can be useful to work with strings that use alternative storage, such as those that are atomically
656/// shared (e.g. `InternedString`). While using those string types themselves isn't complex, using them and _also_
657/// supporting normal string types can be complex.
658///
659/// `MetaString` is an opinionated string type that abstracts over the normal string types like `String` and `&str`
660/// while also supporting alternative storage, such as interned strings from `FixedSizeInterner`, unifying these
661/// different variants behind a single concrete type.
662///
663/// ## Supported types
664///
665/// `MetaString` supports the following "variants":
666///
667/// - owned (`String` and non-inlineable `&str`)
668/// - static (`&'static str`)
669/// - interned (`InternedString`)
670/// - shared (`Arc<str>`)
671/// - inlined (up to 23 bytes)
672///
673/// ### Owned and borrowed strings
674///
675/// `MetaString` can be created from `String` and `&str` directly. For owned scenarios (`String`), the string value is
676/// simply wrapped. For borrowed strings, we attempt to inline them (see more below) or, if they cannot be inlined, they
677/// are copied into a new `String`.
678///
679/// ### Static strings
680///
681/// `MetaString` can be created from `&'static str` directly. This is useful for string literals and other static
682/// strings that are too large to be inlined.
683///
684/// ### Interned strings
685///
686/// `MetaString` can also be created from `InternedString`, which is a string that has been interned (using an interner
687/// like [`FixedSizeInterner`][crate::interning::FixedSizeInterner] or
688/// [`GenericMapInterner`][crate::interning::GenericMapInterner]). Interned strings are essentially a combination of the
689/// properties of `Arc<T>` -- owned wrappers around an atomically reference counted piece of data -- and a fixed-size
690/// buffer, where we allocate one large buffer, and write many small strings into it, and provide references to those
691/// strings through `InternedString`.
692///
693/// ### Shared strings
694///
695/// `MetaString` can be created from `Arc<str>`, which is a string slice that can be atomically shared between threads.
696/// This is a simpler version of interned strings where strict memory control and re-use is not required.
697///
698/// ### Inlined strings
699///
700/// Finally, `MetaString` can also be created by inlining small strings into `MetaString` itself, avoiding the need for
701/// any backing allocation. "Small string optimization" is a common optimization for string types where small strings
702/// can be stored directly in a string type itself by utilizing a "union"-style layout.
703///
704/// As `MetaString` utilizes such a layout, we can provide a small string optimization that allows for strings up to 23
705/// bytes in length.
706///
707/// ## Conversion methods
708///
709/// Implementations of `From<T>` exist for all of the aforementioned types to allow for easily converting to
710/// `MetaString`. Once a caller has a `MetaString` value, they are generally expected to interact with the string in a
711/// read-only way, as `MetaString` can be dereferenced directly to `&str`.
712///
713/// If a caller needs to be able to modify the string data, they can call `into_owned` to get an owned version of the
714/// string, make their modifications to the owned version, and then convert that back to `MetaString`.
715#[derive(Clone)]
716pub struct MetaString {
717    inner: Inner,
718}
719
720impl MetaString {
721    /// Creates an empty `MetaString`.
722    ///
723    /// This does not allocate.
724    pub const fn empty() -> Self {
725        Self { inner: Inner::empty() }
726    }
727
728    /// Creates a new `MetaString` from the given static string.
729    ///
730    /// This does not allocate.
731    pub const fn from_static(s: &'static str) -> Self {
732        Self {
733            inner: Inner::static_str(s),
734        }
735    }
736
737    /// Attempts to create a new `MetaString` from the given string if it can be inlined.
738    pub fn try_inline(s: &str) -> Option<Self> {
739        Inner::try_inlined(s).map(|inner| Self { inner })
740    }
741
742    /// Creates a new `MetaString` from the given string, using the provided interner.
743    ///
744    /// The string is inlined if possible. If it cannot be inlined, the interner is tried. If interning fails, an owned
745    /// string is allocated.
746    pub fn from_interner<I>(s: &str, interner: &I) -> Self
747    where
748        I: Interner,
749    {
750        if let Some(inlined) = Self::try_inline(s) {
751            return inlined;
752        }
753        if let Some(interned) = interner.try_intern(s) {
754            return Self::from(interned);
755        }
756        Self::from(s.to_owned())
757    }
758
759    /// Returns `true` if `self` has a length of zero bytes.
760    pub fn is_empty(&self) -> bool {
761        self.deref().is_empty()
762    }
763
764    /// Returns `true` if `self` can be cheaply cloned.
765    pub const fn is_cheaply_cloneable(&self) -> bool {
766        // If we're wrapping an owned string, cloning means cloning that allocation. All other types are cheap to clone.
767        !self.inner.get_union_type().is_owned()
768    }
769
770    /// Consumes `self` and returns an owned `String`.
771    ///
772    /// If the `MetaString` is already owned, this will simply return the inner `String` directly. Otherwise, this will
773    /// allocate an owned version (`String`) of the string data.
774    pub fn into_owned(self) -> String {
775        self.inner.into_owned()
776    }
777}
778
779impl Default for MetaString {
780    fn default() -> Self {
781        Self::empty()
782    }
783}
784
785impl hash::Hash for MetaString {
786    fn hash<H: hash::Hasher>(&self, state: &mut H) {
787        self.deref().hash(state)
788    }
789}
790
791impl PartialEq<str> for MetaString {
792    fn eq(&self, other: &str) -> bool {
793        self.deref() == other
794    }
795}
796
797impl PartialEq<&str> for MetaString {
798    fn eq(&self, other: &&str) -> bool {
799        self.deref() == *other
800    }
801}
802
803impl PartialEq<MetaString> for MetaString {
804    fn eq(&self, other: &MetaString) -> bool {
805        self.deref() == other.deref()
806    }
807}
808
809impl Eq for MetaString {}
810
811impl PartialOrd for MetaString {
812    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
813        Some(self.cmp(other))
814    }
815}
816
817impl Ord for MetaString {
818    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
819        self.deref().cmp(other.deref())
820    }
821}
822
823impl Borrow<str> for MetaString {
824    fn borrow(&self) -> &str {
825        self.deref()
826    }
827}
828
829impl Serialize for MetaString {
830    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
831    where
832        S: serde::Serializer,
833    {
834        serializer.serialize_str(self.deref())
835    }
836}
837
838impl<'de> Deserialize<'de> for MetaString {
839    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
840    where
841        D: serde::Deserializer<'de>,
842    {
843        struct MetaStringVisitor;
844
845        impl<'de> serde::de::Visitor<'de> for MetaStringVisitor {
846            type Value = MetaString;
847
848            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
849                formatter.write_str("a string")
850            }
851
852            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
853            where
854                E: serde::de::Error,
855            {
856                Ok(MetaString::from(v))
857            }
858        }
859
860        deserializer.deserialize_str(MetaStringVisitor)
861    }
862}
863
864impl From<String> for MetaString {
865    fn from(s: String) -> Self {
866        Self { inner: Inner::owned(s) }
867    }
868}
869
870impl From<&str> for MetaString {
871    fn from(s: &str) -> Self {
872        Self::try_inline(s).unwrap_or_else(|| Self::from(s.to_owned()))
873    }
874}
875
876impl From<InternedString> for MetaString {
877    fn from(s: InternedString) -> Self {
878        Self {
879            inner: Inner::interned(s),
880        }
881    }
882}
883
884impl From<Arc<str>> for MetaString {
885    fn from(s: Arc<str>) -> Self {
886        Self {
887            inner: Inner::shared(s),
888        }
889    }
890}
891
892impl From<MetaString> for protobuf::Chars {
893    fn from(value: MetaString) -> Self {
894        value.into_owned().into()
895    }
896}
897
898impl<'a> From<&'a MetaString> for protobuf::Chars {
899    fn from(value: &'a MetaString) -> Self {
900        // TODO: We're foregoing additional code/complexity to recover a static reference, when our string storage is
901        // static, in order to optimize the conversion to `Bytes` and then `Chars`.
902        //
903        // Static strings being written to Protocol Buffers should be decently rare across the codebase, so no biggie
904        // for now.
905        value.deref().into()
906    }
907}
908
909impl Deref for MetaString {
910    type Target = str;
911
912    fn deref(&self) -> &str {
913        self.inner.as_str()
914    }
915}
916
917impl AsRef<str> for MetaString {
918    fn as_ref(&self) -> &str {
919        self.deref()
920    }
921}
922
923impl fmt::Debug for MetaString {
924    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
925        self.deref().fmt(f)
926    }
927}
928
929impl fmt::Display for MetaString {
930    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
931        self.deref().fmt(f)
932    }
933}
934
935#[cfg(test)]
936mod tests {
937    use std::{num::NonZeroUsize, sync::Arc};
938
939    use proptest::{prelude::*, proptest};
940
941    use super::{interning::GenericMapInterner, InlinedUnion, Inner, MetaString, UnionType};
942    use crate::interning::{FixedSizeInterner, Interner as _};
943
944    #[test]
945    fn struct_sizes() {
946        // Overall, we expect `MetaString`, and thus `Inner`, to always be three machine words.
947        assert_eq!(std::mem::size_of::<MetaString>(), std::mem::size_of::<usize>() * 3);
948        assert_eq!(std::mem::size_of::<Inner>(), std::mem::size_of::<usize>() * 3);
949
950        // We also expect all of inlined union variant to be the exact size of `Inner`, which means we're properly
951        // maximizing the available space for inlining.
952        assert_eq!(std::mem::size_of::<InlinedUnion>(), std::mem::size_of::<Inner>());
953    }
954
955    #[test]
956    fn static_str_inlineable() {
957        // We always use the static variant, even if the string is inlineable, because this lets us make `from_static` const.
958        let s = "hello";
959        let meta = MetaString::from_static(s);
960
961        assert_eq!(meta.inner.get_union_type(), UnionType::Static);
962        assert_eq!(s, &*meta);
963        assert_eq!(s, meta.into_owned());
964    }
965
966    #[test]
967    fn static_str_not_inlineable() {
968        let s = "hello there, world! it's me, margaret!";
969        let meta = MetaString::from_static(s);
970
971        assert_eq!(s, &*meta);
972        assert_eq!(meta.inner.get_union_type(), UnionType::Static);
973        assert_eq!(s, meta.into_owned());
974    }
975
976    #[test]
977    fn owned_string() {
978        let s_orig = "hello";
979        let s = String::from(s_orig);
980        let meta = MetaString::from(s);
981
982        assert_eq!(s_orig, &*meta);
983        assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
984        assert_eq!(s_orig, meta.into_owned());
985    }
986
987    #[test]
988    fn inlined_string() {
989        let s = "hello";
990        let meta = MetaString::from(s);
991
992        assert_eq!(s, &*meta);
993        assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
994        assert_eq!(s, meta.into_owned());
995    }
996
997    #[test]
998    fn interned_string() {
999        let intern_str = "hello interned str!";
1000
1001        let fs_interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(1024).unwrap());
1002        let s = fs_interner.try_intern(intern_str).unwrap();
1003        assert_eq!(intern_str, &*s);
1004
1005        let meta = MetaString::from(s);
1006        assert_eq!(intern_str, &*meta);
1007        assert_eq!(meta.inner.get_union_type(), UnionType::InternedFixedSize);
1008        assert_eq!(intern_str, meta.into_owned());
1009
1010        let gm_interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1011        let s = gm_interner.try_intern(intern_str).unwrap();
1012        assert_eq!(intern_str, &*s);
1013
1014        let meta = MetaString::from(s);
1015        assert_eq!(intern_str, &*meta);
1016        assert_eq!(meta.inner.get_union_type(), UnionType::InternedGenericMap);
1017        assert_eq!(intern_str, meta.into_owned());
1018    }
1019
1020    #[test]
1021    fn shared_string() {
1022        let shared_str = "hello shared str!";
1023
1024        let s = Arc::<str>::from(shared_str);
1025        assert_eq!(shared_str, &*s);
1026
1027        let meta = MetaString::from(s);
1028        assert_eq!(shared_str, &*meta);
1029        assert_eq!(meta.inner.get_union_type(), UnionType::Shared);
1030        assert_eq!(shared_str, meta.into_owned());
1031    }
1032
1033    #[test]
1034    fn shared_string_clone() {
1035        let shared_str = "hello shared str!";
1036        let s = Arc::<str>::from(shared_str);
1037        let meta = MetaString::from(s);
1038
1039        // Clone the `MetaString` to make sure we can still access the original string.
1040        let meta2 = meta.clone();
1041        assert_eq!(shared_str, &*meta2);
1042
1043        // Drop the original `MetaString` to ensure we can still access the string from our clone after going through
1044        // the drop logic for the shared variant.
1045        drop(meta);
1046        assert_eq!(shared_str, &*meta2);
1047    }
1048
1049    #[test]
1050    fn empty_string_shared() {
1051        let shared_str = "";
1052
1053        let s = Arc::<str>::from(shared_str);
1054        assert_eq!(shared_str, &*s);
1055
1056        let meta = MetaString::from(s);
1057        assert_eq!(shared_str, &*meta);
1058        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1059        assert_eq!(shared_str, meta.into_owned());
1060    }
1061
1062    #[test]
1063    fn empty_string_interned() {
1064        let intern_str = "";
1065
1066        let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1067        let s = interner.try_intern(intern_str).unwrap();
1068        assert_eq!(intern_str, &*s);
1069
1070        let meta = MetaString::from(s);
1071        assert_eq!(intern_str, &*meta);
1072        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1073        assert_eq!(intern_str, meta.into_owned());
1074    }
1075
1076    #[test]
1077    fn empty_string_static() {
1078        let s = "";
1079
1080        let meta = MetaString::from_static(s);
1081        assert_eq!(s, &*meta);
1082        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1083        assert_eq!(s, meta.into_owned());
1084    }
1085
1086    #[test]
1087    fn empty_string_inlined() {
1088        let s = "";
1089
1090        let meta = MetaString::try_inline(s).expect("empty string definitely 'fits'");
1091        assert_eq!(s, &*meta);
1092        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1093        assert_eq!(s, meta.into_owned());
1094    }
1095
1096    #[test]
1097    fn empty_string_owned() {
1098        // When a string has capacity, we don't care if it's actually empty or not, because we want to preserve the
1099        // allocation... so our string here is empty but _does_ have capacity.
1100        let s = String::with_capacity(32);
1101        let actual_cap = s.capacity();
1102
1103        let meta = MetaString::from(s);
1104        assert_eq!("", &*meta);
1105        assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1106
1107        let owned = meta.into_owned();
1108        assert_eq!(owned.capacity(), actual_cap);
1109        assert_eq!("", owned);
1110    }
1111
1112    #[test]
1113    fn empty_string_owned_zero_capacity() {
1114        // When a string has _no_ capacity, it's effectively empty, and we treat it that way.
1115        let s = String::new();
1116
1117        let meta = MetaString::from(s);
1118        assert_eq!("", &*meta);
1119        assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1120        assert_eq!("", meta.into_owned());
1121    }
1122
1123    #[test]
1124    fn test_is_cheaply_cloneable() {
1125        // Owned strings are never cheap to clone.
1126        let s =
1127            String::from("big ol' stringy string that can't be inlined and lives out its bleak existence in the heap");
1128        let ms = MetaString::from(s);
1129        assert!(!ms.is_cheaply_cloneable());
1130
1131        // Empty strings are always cheap to clone.
1132        let ms = MetaString::empty();
1133        assert!(ms.is_cheaply_cloneable());
1134
1135        // Inlined strings are always cheap to clone.
1136        let s = "hello";
1137        let ms = MetaString::try_inline(s).expect("inlined string should fit");
1138        assert!(ms.is_cheaply_cloneable());
1139
1140        // Static strings are always cheap to clone.
1141        let s = "hello there, world! it's me, margaret!";
1142        let ms = MetaString::from_static(s);
1143        assert!(ms.is_cheaply_cloneable());
1144
1145        // Interned strings are always cheap to clone.
1146        let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1147        let s = interner.try_intern("hello interned str!").unwrap();
1148        let ms = MetaString::from(s);
1149        assert!(ms.is_cheaply_cloneable());
1150
1151        // Shared strings are always cheap to clone.
1152        let s = Arc::from("hello shared str!");
1153        let ms = MetaString::from(s);
1154        assert!(ms.is_cheaply_cloneable());
1155    }
1156
1157    fn arb_unicode_str_max_len(max_len: usize) -> impl Strategy<Value = String> {
1158        ".{0,23}".prop_filter("resulting string is too longer", move |s| s.len() <= max_len)
1159    }
1160
1161    proptest! {
1162        #![proptest_config(ProptestConfig::with_cases(10000))]
1163
1164        #[test]
1165        #[cfg_attr(miri, ignore)]
1166        fn property_test_inlined_string(
1167            input in arb_unicode_str_max_len(23),
1168        ) {
1169            assert!(input.len() <= 23, "input should be 23 bytes or less");
1170            let meta = MetaString::try_inline(&input).expect("input should fit");
1171
1172            if input.is_empty() {
1173                assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1174            } else {
1175                assert_eq!(input, &*meta);
1176                assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
1177            }
1178        }
1179
1180        #[test]
1181        #[cfg_attr(miri, ignore)]
1182        fn property_test_owned_string(
1183            input in ".*",
1184        ) {
1185            let is_empty = input.is_empty();
1186            let meta = MetaString::from(input);
1187
1188            if is_empty {
1189                assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1190            } else {
1191                assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1192            }
1193        }
1194    }
1195}