stringtheory/lib.rs
1//! Sharing-optimized strings and string interning utilities.
2//!
3//! `stringtheory` provides two main components: a sharing-optimized string type, `MetaString`, and string interning
4//! implementations (`FixedSizeInterner`, `GenericMapInterner`, etc). These components are meant to work in concert,
5//! allowing for using a single string type that can handle owned, shared, and interned strings, and providing a way to
6//! efficiently intern strings strings when possible.
7#![deny(warnings)]
8#![deny(missing_docs)]
9// We only support 64-bit little-endian platforms anyways, so there's no risk of our enum variants having their values truncated.
10#![allow(clippy::enum_clike_unportable_variant)]
11
12use std::{
13 borrow::Borrow, fmt, hash, mem::ManuallyDrop, ops::Deref, ptr::NonNull, slice::from_raw_parts,
14 str::from_utf8_unchecked, sync::Arc,
15};
16
17use serde::Serialize;
18
19mod clone;
20pub use self::clone::CheapMetaString;
21
22pub mod interning;
23use self::interning::{InternedString, InternedStringState};
24
25const ZERO_VALUE: usize = 0;
26const TOP_MOST_BIT: usize = usize::MAX & !(isize::MAX as usize);
27const INLINED_STR_DATA_BUF_LEN: usize = std::mem::size_of::<usize>() * 3;
28const INLINED_STR_MAX_LEN: usize = INLINED_STR_DATA_BUF_LEN - 1;
29const INLINED_STR_MAX_LEN_U8: u8 = INLINED_STR_MAX_LEN as u8;
30
31const UNION_TYPE_TAG_VALUE_STATIC: u8 = get_offset_tag_value(0);
32const UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE: u8 = get_offset_tag_value(1);
33const UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP: u8 = get_offset_tag_value(2);
34const UNION_TYPE_TAG_VALUE_SHARED: u8 = get_offset_tag_value(3);
35
36const fn get_offset_tag_value(tag: u8) -> u8 {
37 const UNION_TYPE_TAG_VALUE_BASE: u8 = INLINED_STR_MAX_LEN as u8 + 1;
38
39 if tag > (u8::MAX - INLINED_STR_MAX_LEN as u8) {
40 panic!("Union type tag value must be less than 232 to fit.");
41 }
42
43 tag + UNION_TYPE_TAG_VALUE_BASE
44}
45
46const fn get_scaled_union_tag(tag: u8) -> usize {
47 // We want to shift our tag value (single byte) to make it the top most byte in a `usize`.
48 const UNION_TYPE_TAG_VALUE_SHIFT: u32 = (std::mem::size_of::<usize>() as u32 - 1) * 8;
49
50 (tag as usize) << UNION_TYPE_TAG_VALUE_SHIFT
51}
52
53// High-level invariant checks to ensure `stringtheory` isn't being used on an unsupported platform.
54#[cfg(not(all(target_pointer_width = "64", target_endian = "little")))]
55const _INVARIANTS_CHECK: () = {
56 compile_error!("`stringtheory` is only supported on 64-bit little-endian platforms.");
57};
58
59const fn is_tagged(cap: u8) -> bool {
60 cap & 0b10000000 != 0
61}
62
63const fn tag_cap(cap: usize) -> usize {
64 cap | TOP_MOST_BIT
65}
66
67const fn untag_cap(cap: usize) -> usize {
68 cap & !(TOP_MOST_BIT)
69}
70
71#[derive(Clone, Copy, Eq, PartialEq)]
72#[repr(usize)]
73enum Zero {
74 Zero = ZERO_VALUE,
75}
76
77#[derive(Clone, Copy, Debug, Eq, PartialEq)]
78#[repr(usize)]
79enum Tag {
80 Static = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_STATIC),
81 InternedFixedSize = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE),
82 InternedGenericMap = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP),
83 Shared = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_SHARED),
84}
85
86#[repr(C)]
87#[derive(Clone, Copy)]
88struct EmptyUnion {
89 ptr: Zero, // Field one.
90 len: Zero, // Field two.
91 cap: Zero, // Field three.
92}
93
94impl EmptyUnion {
95 const fn new() -> Self {
96 Self {
97 ptr: Zero::Zero,
98 len: Zero::Zero,
99 cap: Zero::Zero,
100 }
101 }
102}
103
104#[repr(C)]
105#[derive(Clone, Copy)]
106struct OwnedUnion {
107 ptr: *mut u8, // Field one
108 len: usize, // Field two.
109 cap: usize, // Field three.
110}
111
112impl OwnedUnion {
113 #[inline]
114 fn as_str(&self) -> &str {
115 // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
116 // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
117 unsafe { from_utf8_unchecked(from_raw_parts(self.ptr, self.len)) }
118 }
119
120 fn into_owned(self) -> String {
121 // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
122 // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
123 unsafe { String::from_raw_parts(self.ptr, self.len, untag_cap(self.cap)) }
124 }
125}
126
127#[repr(C)]
128#[derive(Clone, Copy)]
129struct StaticUnion {
130 value: &'static str, // Fields one and two.
131 _cap: Tag, // Field three.
132}
133
134impl StaticUnion {
135 #[inline]
136 const fn as_str(&self) -> &str {
137 self.value
138 }
139}
140
141#[repr(C)]
142struct InternedUnion {
143 state: InternedStateUnion, // Fields one and two.
144 _cap: Tag, // Field three.
145}
146
147impl Drop for InternedUnion {
148 fn drop(&mut self) {
149 match self._cap {
150 Tag::InternedFixedSize => {
151 let state = unsafe { ManuallyDrop::take(&mut self.state.fixed_size) };
152 drop(state);
153 }
154 Tag::InternedGenericMap => {
155 let state = unsafe { ManuallyDrop::take(&mut self.state.generic_map) };
156 drop(state);
157 }
158 _ => unreachable!(),
159 }
160 }
161}
162
163impl Clone for InternedUnion {
164 fn clone(&self) -> Self {
165 // SAFETY: We use our tag (stored in `_cap`) to determine which field to access and clone, which
166 // ensures we only access fields which are initialized.
167 let state = unsafe {
168 match self._cap {
169 Tag::InternedFixedSize => InternedStateUnion {
170 fixed_size: self.state.fixed_size.clone(),
171 },
172 Tag::InternedGenericMap => InternedStateUnion {
173 generic_map: self.state.generic_map.clone(),
174 },
175 _ => unreachable!(),
176 }
177 };
178
179 Self { state, _cap: self._cap }
180 }
181}
182
183#[repr(C)]
184union InternedStateUnion {
185 fixed_size: ManuallyDrop<self::interning::fixed_size::StringState>,
186 generic_map: ManuallyDrop<self::interning::map::StringState>,
187}
188
189#[repr(C)]
190struct SharedUnion {
191 ptr: NonNull<str>, // Fields one and two
192 _cap: Tag, // Field three.
193}
194
195impl SharedUnion {
196 #[inline]
197 const fn as_str(&self) -> &str {
198 // SAFETY: The pointee is still live by virtue of being held in an `Arc`.
199 unsafe { self.ptr.as_ref() }
200 }
201}
202
203#[repr(C)]
204#[derive(Clone, Copy)]
205struct InlinedUnion {
206 // Data is arranged as 23 bytes for string data, and the remaining 1 byte for the string length.
207 data: [u8; INLINED_STR_DATA_BUF_LEN], // Fields one, two, and three.
208}
209
210impl InlinedUnion {
211 #[inline]
212 fn as_str(&self) -> &str {
213 let len = self.data[INLINED_STR_MAX_LEN] as usize;
214
215 // SAFETY: We know our data is valid UTF-8 since we only ever derive inlined strings from a valid string
216 // reference.
217 unsafe { from_utf8_unchecked(&self.data[0..len]) }
218 }
219}
220
221#[repr(C)]
222#[derive(Clone, Copy)]
223struct DiscriminantUnion {
224 unused: [u8; INLINED_STR_DATA_BUF_LEN - 1], // Fields one, two, and three, minus the last byte.
225 tag_byte: u8, // Last byte, overlapped with the "highest" byte of field three.
226}
227
228#[derive(Debug, Eq, PartialEq)]
229enum UnionType {
230 Empty,
231 Owned,
232 Static,
233 InternedFixedSize,
234 InternedGenericMap,
235 Inlined,
236 Shared,
237}
238
239impl UnionType {
240 #[inline]
241 const fn is_owned(&self) -> bool {
242 matches!(self, UnionType::Owned)
243 }
244}
245
246impl DiscriminantUnion {
247 #[inline]
248 const fn get_union_type(&self) -> UnionType {
249 // At a high level, we encode the type of the union into the last byte of the struct, which overlaps with the
250 // capacity of owned strings, the length of an inlined string, and the unused field of other string types.
251 //
252 // Our logic is simple here:
253 //
254 // - Allocations can only ever be as large as `isize::MAX`, which means that the top-most bit of the capacity
255 // value would never be used for a valid allocation.
256 // - In turn, the length of a string can also only ever be as large as `isize::MAX`, which means that the
257 // top-most bit of the length byte for any string type would never be used
258 // - Inlined strings can only ever be up to 23 bytes long, which means that the top-most bit of the length byte
259 // for an inlined string would never be used.
260 // - Static strings and interned strings only occupy the first two fields, which means their capacity should not
261 // be used.
262 //
263 // As such, we encode the possible string types as follows:
264 //
265 // - when all fields are zero, we have an empty string
266 // - when the last byte does have the top-bit set, we have an owned string
267 // - when the last byte does _not_ have the top-bit set, and the value is less than or equal to 23, we have an
268 // inlined string
269 // - when the last byte does _not_ have the top-bit set, and the value is greater than 23, we interpret the
270 // specific value of the last byte as a discriminant for the remaining string types (static, interned, etc)
271 //
272 // We abuse the layout of little endian integers here to ensure that the upper-most bits of the capacity
273 // field overlaps with the last byte of an inlined string, and the unused field of a static/interned string.
274 //
275 // In an inlined string, its length byte is situated as the very last byte (data[23]) in its layout. In an
276 // owned string, we instead have a pointer, length, and capacity, in that order. This means that the length
277 // byte of the inlined string and the capacity field of the owned string overlap, as seen below:
278 //
279 // ~ an inlined string, "hello, world", with a length of 12 (0C) ~
280 // ┌───────────────────────────────────────────────────────────────────────────────┐
281 // │ 68 65 6C 6C 6F 20 77 6F 72 6C 64 21 ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? 0C │
282 // └───────────────────────────────────────────────────────────────────────────────┘
283 // ▲
284 // ├──── overlapping
285 // ~ an owned string with a capacity of 64 ~ ▼ byte
286 // ┌─────────────────────────┐┌─────────────────────────┐┌─────────────────────────┐
287 // │ ?? ?? ?? ?? ?? ?? ?? ?? ││ ?? ?? ?? ?? ?? ?? ?? ?? ││ 40 00 00 00 00 00 00 80 │
288 // └─────────────────────────┘└─────────────────────────┘└─────────────────────────┘
289 // ▲
290 // original capacity: 64 (0x0000000000000040) │
291 // "tagged" capacity: 9223372036854775872 (0x8000000000000040) │
292 // ▲ │
293 // (tag bit) ────────────────────────────┘ │
294 // │
295 // │
296 // inlined last byte (0x0C) [0 0 0 0 1 1 0 0] ◀────────────────┤
297 // owned last byte (0x80) [1 0 0 0 0 0 0 0] ◀────────────────┤
298 // │
299 // inlined last byte │
300 // maximum of 23 (0x17) [0 0 0 1 0 1 1 1] ◀────────────────┤
301 // │
302 // "owned" discriminant │
303 // bitmask (any X bit) [X X X ? ? ? ? ?] ◀────────────────┘
304 //
305 // As such, the length byte of an inlined string overlaps with the capacity field of an owned string.
306 // Additionally, due to the integer fields being written in little-endian order, the "upper" byte of the
307 // capacity field -- the eight highest bits -- is actually written in the same location as the length byte
308 // of an inlined string.
309 //
310 // Given that we know an inlined string cannot be any longer than 23 bytes, we know that the top-most bit in
311 // the last byte (data[23]) can never be set, as it would imply a length of _at least_ 128. With that, we
312 // utilize invariant #3 of `Inner` -- allocations can never be larger than `isize::MAX` -- which lets us
313 // safely "tag" an owned string's capacity -- setting the upper most bit to 1 -- to indicate that it's an
314 // owned string.
315 //
316 // An inlined string couldn't possibly have a length that occupied that bit, and so we know if we're down
317 // here, and our pointer field isn't all zeroes and our length field isn't all zeroes, that we _have_ to be
318 // an owned string if our capacity is tagged.
319
320 // If the top bit is set, we know we're dealing with an owned string.
321 if is_tagged(self.tag_byte) {
322 return UnionType::Owned;
323 }
324
325 // The top-most bit has to be set for an owned string, but isn't set for any other type, so try differentiating
326 // at this point.
327 match self.tag_byte {
328 // Empty string. Easy.
329 0 => UnionType::Empty,
330
331 // Anything between 1 and INLINED_STR_MAX_LEN (23, inclusive) is an inlined string.
332 1..=INLINED_STR_MAX_LEN_U8 => UnionType::Inlined,
333
334 // These are fixed values between 24 and 128, so we just match them directly.
335 UNION_TYPE_TAG_VALUE_STATIC => UnionType::Static,
336 UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE => UnionType::InternedFixedSize,
337 UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP => UnionType::InternedGenericMap,
338 UNION_TYPE_TAG_VALUE_SHARED => UnionType::Shared,
339
340 // If we haven't matched any specific type tag value, then this is something else that we don't handle or
341 // know about... which we handle as just acting like we're an empty string for simplicity.
342 _ => UnionType::Empty,
343 }
344 }
345}
346
347/// The core data structure for holding all different string variants.
348///
349/// This union has six data fields -- one for each possible string variant -- and a discriminant field, used to
350/// determine which string variant is actually present. The discriminant field interrogates the bits in each machine
351/// word field (all variants are three machine words) to determine which bit patterns are valid or not for a given
352/// variant, allowing the string variant to be authoritatively determined.
353///
354/// # Invariants
355///
356/// This code depends on a number of invariants in order to work correctly:
357///
358/// 1. Only used on 64-bit little-endian platforms. (checked at compile-time via _INVARIANTS_CHECK)
359/// 2. The data pointers for `String` and `&'static str` cannot ever be null when the strings are non-empty.
360/// 3. Allocations can never be larger than `isize::MAX` (see [here][rust_isize_alloc_limit]), meaning that any
361/// length/capacity field for a string cannot ever be larger than `isize::MAX`, implying the 64th bit (top-most bit)
362/// for length/capacity should always be 0.
363/// 4. An inlined string can only hold up to 23 bytes of data, meaning that the length byte for that string can never
364/// have a value greater than 23. (_We_ have to provide this invariant, which is handled in `Inner::try_inlined`.)
365///
366/// [rust_isize_alloc_limit]: https://doc.rust-lang.org/stable/std/alloc/struct.Layout.html#method.from_size_align
367union Inner {
368 empty: EmptyUnion,
369 owned: OwnedUnion,
370 static_: StaticUnion,
371 interned: ManuallyDrop<InternedUnion>,
372 shared: ManuallyDrop<SharedUnion>,
373 inlined: InlinedUnion,
374 discriminant: DiscriminantUnion,
375}
376
377impl Inner {
378 const fn empty() -> Self {
379 Self {
380 empty: EmptyUnion::new(),
381 }
382 }
383
384 fn owned(value: String) -> Self {
385 match value.capacity() {
386 // The string we got is empty.
387 0 => Self::empty(),
388 cap => {
389 let mut value = value.into_bytes();
390
391 let ptr = value.as_mut_ptr();
392 let len = value.len();
393
394 // We're taking ownership of the underlying string allocation so we can't let it drop.
395 std::mem::forget(value);
396
397 Self {
398 owned: OwnedUnion {
399 ptr,
400 len,
401 cap: tag_cap(cap),
402 },
403 }
404 }
405 }
406 }
407
408 const fn static_str(value: &'static str) -> Self {
409 match value.len() {
410 0 => Self::empty(),
411 _ => Self {
412 static_: StaticUnion {
413 value,
414 _cap: Tag::Static,
415 },
416 },
417 }
418 }
419
420 fn interned(value: InternedString) -> Self {
421 match value.len() {
422 0 => Self::empty(),
423 _len => {
424 let (state, tag) = match value.into_state() {
425 InternedStringState::FixedSize(fixed_size) => (
426 InternedStateUnion {
427 fixed_size: ManuallyDrop::new(fixed_size),
428 },
429 Tag::InternedFixedSize,
430 ),
431 InternedStringState::GenericMap(generic_map) => (
432 InternedStateUnion {
433 generic_map: ManuallyDrop::new(generic_map),
434 },
435 Tag::InternedGenericMap,
436 ),
437 };
438
439 Self {
440 interned: ManuallyDrop::new(InternedUnion { state, _cap: tag }),
441 }
442 }
443 }
444 }
445
446 fn shared(value: Arc<str>) -> Self {
447 match value.len() {
448 0 => Self::empty(),
449 _len => Self {
450 shared: ManuallyDrop::new(SharedUnion {
451 // SAFETY: We know `ptr` is non-null because `Arc::into_raw` is called on a valid `Arc<str>`.
452 ptr: unsafe { NonNull::new_unchecked(Arc::into_raw(value).cast_mut()) },
453 _cap: Tag::Shared,
454 }),
455 },
456 }
457 }
458
459 fn try_inlined(value: &str) -> Option<Self> {
460 match value.len() {
461 0 => Some(Self::empty()),
462 len => {
463 if len > INLINED_STR_MAX_LEN {
464 return None;
465 }
466
467 let mut data = [0; INLINED_STR_DATA_BUF_LEN];
468
469 // SAFETY: We know it fits because we just checked that the string length is 23 or less.
470 data[INLINED_STR_MAX_LEN] = len as u8;
471
472 let buf = value.as_bytes();
473 data[0..len].copy_from_slice(buf);
474
475 Some(Self {
476 inlined: InlinedUnion { data },
477 })
478 }
479 }
480 }
481
482 #[inline]
483 fn as_str(&self) -> &str {
484 let union_type = unsafe { self.discriminant.get_union_type() };
485 match union_type {
486 UnionType::Empty => "",
487 UnionType::Owned => {
488 let owned = unsafe { &self.owned };
489 owned.as_str()
490 }
491 UnionType::Static => {
492 let static_ = unsafe { &self.static_ };
493 static_.as_str()
494 }
495 UnionType::InternedFixedSize => {
496 let interned = unsafe { &self.interned.state.fixed_size };
497 interned.as_str()
498 }
499 UnionType::InternedGenericMap => {
500 let interned = unsafe { &self.interned.state.generic_map };
501 interned.as_str()
502 }
503 UnionType::Shared => {
504 let shared = unsafe { &self.shared };
505 shared.as_str()
506 }
507 UnionType::Inlined => {
508 let inlined = unsafe { &self.inlined };
509 inlined.as_str()
510 }
511 }
512 }
513
514 #[inline]
515 const fn get_union_type(&self) -> UnionType {
516 unsafe { self.discriminant.get_union_type() }
517 }
518
519 fn into_owned(mut self) -> String {
520 let union_type = unsafe { self.discriminant.get_union_type() };
521 match union_type {
522 UnionType::Empty => String::new(),
523 UnionType::Owned => {
524 // We're (`Inner`) being consumed here, but we need to update our internal state to ensure that our drop
525 // logic doesn't try to double free the string allocation.
526 let owned = unsafe { self.owned };
527 self.empty = EmptyUnion::new();
528
529 owned.into_owned()
530 }
531 UnionType::Static => {
532 let static_ = unsafe { self.static_ };
533 static_.as_str().to_owned()
534 }
535 UnionType::InternedFixedSize => {
536 let interned = unsafe { &self.interned.state.fixed_size };
537 interned.as_str().to_owned()
538 }
539 UnionType::InternedGenericMap => {
540 let interned = unsafe { &self.interned.state.generic_map };
541 interned.as_str().to_owned()
542 }
543 UnionType::Shared => {
544 let shared = unsafe { &self.shared };
545 shared.as_str().to_owned()
546 }
547 UnionType::Inlined => {
548 let inlined = unsafe { self.inlined };
549 inlined.as_str().to_owned()
550 }
551 }
552 }
553}
554
555impl Drop for Inner {
556 fn drop(&mut self) {
557 let union_type = unsafe { self.discriminant.get_union_type() };
558 match union_type {
559 UnionType::Owned => {
560 let owned = unsafe { &mut self.owned };
561
562 let ptr = owned.ptr;
563 let len = owned.len;
564 let cap = untag_cap(owned.cap);
565
566 // SAFETY: The pointer has to be non-null, because we only ever construct an owned variant when the
567 // `String` has a non-zero capacity, which implies a valid allocation, and thus a valid, non-null
568 // pointer.
569 let data = unsafe { Vec::<u8>::from_raw_parts(ptr, len, cap) };
570 drop(data);
571 }
572 UnionType::InternedFixedSize | UnionType::InternedGenericMap => {
573 let interned = unsafe { &mut self.interned };
574
575 // SAFETY: We're already dropping `Inner`, so nothing else can use the `InternedString` value after we
576 // consume it here.
577 let data = unsafe { ManuallyDrop::take(interned) };
578 drop(data);
579 }
580 UnionType::Shared => {
581 let shared = unsafe { &mut self.shared };
582
583 // Decrement the strong count before we drop, ensuring the `Arc` has a chance to clean itself up if this
584 // is the less strong reference.
585 //
586 // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`, so it's valid to decrement on. We
587 // also know the backing storage is still live because it has to be by virtue of us being here.
588 unsafe {
589 Arc::decrement_strong_count(shared.ptr.as_ptr().cast_const());
590 }
591 }
592 _ => {}
593 }
594 }
595}
596
597impl Clone for Inner {
598 fn clone(&self) -> Self {
599 let union_type = unsafe { self.discriminant.get_union_type() };
600 match union_type {
601 UnionType::Empty => Self::empty(),
602 UnionType::Owned => {
603 let owned = unsafe { self.owned };
604 let s = owned.as_str();
605
606 // We specifically try to inline here.
607 //
608 // At a high-level, when we're _given_ an owned string, we avoid inlining because we don't want to trash
609 // the underlying allocation since we may be asked to give it back later (via `MetaString::into_owned`).
610 // However, when we're _cloning_, we'd rather avoiding allocating if we can help it.
611 Self::try_inlined(s).unwrap_or_else(|| Self::owned(s.to_owned()))
612 }
613 UnionType::Static => Self {
614 static_: unsafe { self.static_ },
615 },
616 UnionType::InternedFixedSize | UnionType::InternedGenericMap => Self {
617 interned: unsafe { self.interned.clone() },
618 },
619 UnionType::Shared => {
620 let shared = unsafe { &self.shared };
621
622 // We have to increment the strong count before cloning.
623 //
624 // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`. We also know that if we're cloning
625 // this value, that the underlying `Arc` must still be live, since we're holding a reference to it.
626 unsafe {
627 Arc::increment_strong_count(shared.ptr.as_ptr().cast_const());
628 }
629
630 Self {
631 shared: ManuallyDrop::new(SharedUnion {
632 ptr: shared.ptr,
633 _cap: Tag::Shared,
634 }),
635 }
636 }
637 UnionType::Inlined => Self {
638 inlined: unsafe { self.inlined },
639 },
640 }
641 }
642}
643
644// SAFETY: None of our union variants are tied to the original thread they were created on.
645unsafe impl Send for Inner {}
646
647// SAFETY: None of our union variants use any form of interior mutability and are thus safe to be shared between
648// threads.
649unsafe impl Sync for Inner {}
650
651/// An immutable string type that abstracts over various forms of string storage.
652///
653/// Normally, developers will work with either `String` (owned) or `&str` (borrowed) when dealing with strings. In some
654/// cases, though, it can be useful to work with strings that use alternative storage, such as those that are atomically
655/// shared (e.g. `InternedString`). While using those string types themselves isn't complex, using them and _also_
656/// supporting normal string types can be complex.
657///
658/// `MetaString` is an opinionated string type that abstracts over the normal string types like `String` and `&str`
659/// while also supporting alternative storage, such as interned strings from `FixedSizeInterner`, unifying these
660/// different variants behind a single concrete type.
661///
662/// ## Supported types
663///
664/// `MetaString` supports the following "variants":
665///
666/// - owned (`String` and non-inlineable `&str`)
667/// - static (`&'static str`)
668/// - interned (`InternedString`)
669/// - shared (`Arc<str>`)
670/// - inlined (up to 23 bytes)
671///
672/// ### Owned and borrowed strings
673///
674/// `MetaString` can be created from `String` and `&str` directly. For owned scenarios (`String`), the string value is
675/// simply wrapped. For borrowed strings, we attempt to inline them (see more below) or, if they cannot be inlined, they
676/// are copied into a new `String`.
677///
678/// ### Static strings
679///
680/// `MetaString` can be created from `&'static str` directly. This is useful for string literals and other static
681/// strings that are too large to be inlined.
682///
683/// ### Interned strings
684///
685/// `MetaString` can also be created from `InternedString`, which is a string that has been interned (using an interner
686/// like [`FixedSizeInterner`][crate::interning::FixedSizeInterner] or
687/// [`GenericMapInterner`][crate::interning::GenericMapInterner]). Interned strings are essentially a combination of the
688/// properties of `Arc<T>` -- owned wrappers around an atomically reference counted piece of data -- and a fixed-size
689/// buffer, where we allocate one large buffer, and write many small strings into it, and provide references to those
690/// strings through `InternedString`.
691///
692/// ### Shared strings
693///
694/// `MetaString` can be created from `Arc<str>`, which is a string slice that can be atomically shared between threads.
695/// This is a simpler version of interned strings where strict memory control and re-use is not required.
696///
697/// ### Inlined strings
698///
699/// Finally, `MetaString` can also be created by inlining small strings into `MetaString` itself, avoiding the need for
700/// any backing allocation. "Small string optimization" is a common optimization for string types where small strings
701/// can be stored directly in a string type itself by utilizing a "union"-style layout.
702///
703/// As `MetaString` utilizes such a layout, we can provide a small string optimization that allows for strings up to 23
704/// bytes in length.
705///
706/// ## Conversion methods
707///
708/// Implementations of `From<T>` exist for all of the aforementioned types to allow for easily converting to
709/// `MetaString`. Once a caller has a `MetaString` value, they are generally expected to interact with the string in a
710/// read-only way, as `MetaString` can be dereferenced directly to `&str`.
711///
712/// If a caller needs to be able to modify the string data, they can call `into_owned` to get an owned version of the
713/// string, make their modifications to the owned version, and then convert that back to `MetaString`.
714#[derive(Clone)]
715pub struct MetaString {
716 inner: Inner,
717}
718
719impl MetaString {
720 /// Creates an empty `MetaString`.
721 ///
722 /// This does not allocate.
723 pub const fn empty() -> Self {
724 Self { inner: Inner::empty() }
725 }
726
727 /// Creates a new `MetaString` from the given static string.
728 ///
729 /// This does not allocate.
730 pub const fn from_static(s: &'static str) -> Self {
731 Self {
732 inner: Inner::static_str(s),
733 }
734 }
735
736 /// Attempts to create a new `MetaString` from the given string if it can be inlined.
737 pub fn try_inline(s: &str) -> Option<Self> {
738 Inner::try_inlined(s).map(|inner| Self { inner })
739 }
740
741 /// Returns `true` if `self` has a length of zero bytes.
742 pub fn is_empty(&self) -> bool {
743 self.deref().is_empty()
744 }
745
746 /// Returns `true` if `self` can be cheaply cloned.
747 pub const fn is_cheaply_cloneable(&self) -> bool {
748 // If we're wrapping an owned string, cloning means cloning that allocation. All other types are cheap to clone.
749 !self.inner.get_union_type().is_owned()
750 }
751
752 /// Consumes `self` and returns an owned `String`.
753 ///
754 /// If the `MetaString` is already owned, this will simply return the inner `String` directly. Otherwise, this will
755 /// allocate an owned version (`String`) of the string data.
756 pub fn into_owned(self) -> String {
757 self.inner.into_owned()
758 }
759}
760
761impl Default for MetaString {
762 fn default() -> Self {
763 Self::empty()
764 }
765}
766
767impl hash::Hash for MetaString {
768 fn hash<H: hash::Hasher>(&self, state: &mut H) {
769 self.deref().hash(state)
770 }
771}
772
773impl PartialEq<str> for MetaString {
774 fn eq(&self, other: &str) -> bool {
775 self.deref() == other
776 }
777}
778
779impl PartialEq<&str> for MetaString {
780 fn eq(&self, other: &&str) -> bool {
781 self.deref() == *other
782 }
783}
784
785impl PartialEq<MetaString> for MetaString {
786 fn eq(&self, other: &MetaString) -> bool {
787 self.deref() == other.deref()
788 }
789}
790
791impl Eq for MetaString {}
792
793impl PartialOrd for MetaString {
794 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
795 Some(self.cmp(other))
796 }
797}
798
799impl Ord for MetaString {
800 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
801 self.deref().cmp(other.deref())
802 }
803}
804
805impl Borrow<str> for MetaString {
806 fn borrow(&self) -> &str {
807 self.deref()
808 }
809}
810
811impl Serialize for MetaString {
812 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
813 where
814 S: serde::Serializer,
815 {
816 serializer.serialize_str(self.deref())
817 }
818}
819
820impl From<String> for MetaString {
821 fn from(s: String) -> Self {
822 Self { inner: Inner::owned(s) }
823 }
824}
825
826impl From<&str> for MetaString {
827 fn from(s: &str) -> Self {
828 Self::try_inline(s).unwrap_or_else(|| Self::from(s.to_owned()))
829 }
830}
831
832impl From<InternedString> for MetaString {
833 fn from(s: InternedString) -> Self {
834 Self {
835 inner: Inner::interned(s),
836 }
837 }
838}
839
840impl From<Arc<str>> for MetaString {
841 fn from(s: Arc<str>) -> Self {
842 Self {
843 inner: Inner::shared(s),
844 }
845 }
846}
847
848impl From<MetaString> for protobuf::Chars {
849 fn from(value: MetaString) -> Self {
850 value.into_owned().into()
851 }
852}
853
854impl<'a> From<&'a MetaString> for protobuf::Chars {
855 fn from(value: &'a MetaString) -> Self {
856 // TODO: We're foregoing additional code/complexity to recover a static reference, when our string storage is
857 // static, in order to optimize the conversion to `Bytes` and then `Chars`.
858 //
859 // Static strings being written to Protocol Buffers should be decently rare across the codebase, so no biggie
860 // for now.
861 value.deref().into()
862 }
863}
864
865impl Deref for MetaString {
866 type Target = str;
867
868 fn deref(&self) -> &str {
869 self.inner.as_str()
870 }
871}
872
873impl AsRef<str> for MetaString {
874 fn as_ref(&self) -> &str {
875 self.deref()
876 }
877}
878
879impl fmt::Debug for MetaString {
880 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
881 self.deref().fmt(f)
882 }
883}
884
885impl fmt::Display for MetaString {
886 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
887 self.deref().fmt(f)
888 }
889}
890
891#[cfg(test)]
892mod tests {
893 use std::{num::NonZeroUsize, sync::Arc};
894
895 use proptest::{prelude::*, proptest};
896
897 use super::{interning::GenericMapInterner, InlinedUnion, Inner, MetaString, UnionType};
898 use crate::interning::{FixedSizeInterner, Interner as _};
899
900 #[test]
901 fn struct_sizes() {
902 // Overall, we expect `MetaString`, and thus `Inner`, to always be three machine words.
903 assert_eq!(std::mem::size_of::<MetaString>(), std::mem::size_of::<usize>() * 3);
904 assert_eq!(std::mem::size_of::<Inner>(), std::mem::size_of::<usize>() * 3);
905
906 // We also expect all of inlined union variant to be the exact size of `Inner`, which means we're properly
907 // maximizing the available space for inlining.
908 assert_eq!(std::mem::size_of::<InlinedUnion>(), std::mem::size_of::<Inner>());
909 }
910
911 #[test]
912 fn static_str_inlineable() {
913 // We always use the static variant, even if the string is inlineable, because this lets us make `from_static` const.
914 let s = "hello";
915 let meta = MetaString::from_static(s);
916
917 assert_eq!(meta.inner.get_union_type(), UnionType::Static);
918 assert_eq!(s, &*meta);
919 assert_eq!(s, meta.into_owned());
920 }
921
922 #[test]
923 fn static_str_not_inlineable() {
924 let s = "hello there, world! it's me, margaret!";
925 let meta = MetaString::from_static(s);
926
927 assert_eq!(s, &*meta);
928 assert_eq!(meta.inner.get_union_type(), UnionType::Static);
929 assert_eq!(s, meta.into_owned());
930 }
931
932 #[test]
933 fn owned_string() {
934 let s_orig = "hello";
935 let s = String::from(s_orig);
936 let meta = MetaString::from(s);
937
938 assert_eq!(s_orig, &*meta);
939 assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
940 assert_eq!(s_orig, meta.into_owned());
941 }
942
943 #[test]
944 fn inlined_string() {
945 let s = "hello";
946 let meta = MetaString::from(s);
947
948 assert_eq!(s, &*meta);
949 assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
950 assert_eq!(s, meta.into_owned());
951 }
952
953 #[test]
954 fn interned_string() {
955 let intern_str = "hello interned str!";
956
957 let fs_interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(1024).unwrap());
958 let s = fs_interner.try_intern(intern_str).unwrap();
959 assert_eq!(intern_str, &*s);
960
961 let meta = MetaString::from(s);
962 assert_eq!(intern_str, &*meta);
963 assert_eq!(meta.inner.get_union_type(), UnionType::InternedFixedSize);
964 assert_eq!(intern_str, meta.into_owned());
965
966 let gm_interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
967 let s = gm_interner.try_intern(intern_str).unwrap();
968 assert_eq!(intern_str, &*s);
969
970 let meta = MetaString::from(s);
971 assert_eq!(intern_str, &*meta);
972 assert_eq!(meta.inner.get_union_type(), UnionType::InternedGenericMap);
973 assert_eq!(intern_str, meta.into_owned());
974 }
975
976 #[test]
977 fn shared_string() {
978 let shared_str = "hello shared str!";
979
980 let s = Arc::<str>::from(shared_str);
981 assert_eq!(shared_str, &*s);
982
983 let meta = MetaString::from(s);
984 assert_eq!(shared_str, &*meta);
985 assert_eq!(meta.inner.get_union_type(), UnionType::Shared);
986 assert_eq!(shared_str, meta.into_owned());
987 }
988
989 #[test]
990 fn shared_string_clone() {
991 let shared_str = "hello shared str!";
992 let s = Arc::<str>::from(shared_str);
993 let meta = MetaString::from(s);
994
995 // Clone the `MetaString` to make sure we can still access the original string.
996 let meta2 = meta.clone();
997 assert_eq!(shared_str, &*meta2);
998
999 // Drop the original `MetaString` to ensure we can still access the string from our clone after going through
1000 // the drop logic for the shared variant.
1001 drop(meta);
1002 assert_eq!(shared_str, &*meta2);
1003 }
1004
1005 #[test]
1006 fn empty_string_shared() {
1007 let shared_str = "";
1008
1009 let s = Arc::<str>::from(shared_str);
1010 assert_eq!(shared_str, &*s);
1011
1012 let meta = MetaString::from(s);
1013 assert_eq!(shared_str, &*meta);
1014 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1015 assert_eq!(shared_str, meta.into_owned());
1016 }
1017
1018 #[test]
1019 fn empty_string_interned() {
1020 let intern_str = "";
1021
1022 let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1023 let s = interner.try_intern(intern_str).unwrap();
1024 assert_eq!(intern_str, &*s);
1025
1026 let meta = MetaString::from(s);
1027 assert_eq!(intern_str, &*meta);
1028 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1029 assert_eq!(intern_str, meta.into_owned());
1030 }
1031
1032 #[test]
1033 fn empty_string_static() {
1034 let s = "";
1035
1036 let meta = MetaString::from_static(s);
1037 assert_eq!(s, &*meta);
1038 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1039 assert_eq!(s, meta.into_owned());
1040 }
1041
1042 #[test]
1043 fn empty_string_inlined() {
1044 let s = "";
1045
1046 let meta = MetaString::try_inline(s).expect("empty string definitely 'fits'");
1047 assert_eq!(s, &*meta);
1048 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1049 assert_eq!(s, meta.into_owned());
1050 }
1051
1052 #[test]
1053 fn empty_string_owned() {
1054 // When a string has capacity, we don't care if it's actually empty or not, because we want to preserve the
1055 // allocation... so our string here is empty but _does_ have capacity.
1056 let s = String::with_capacity(32);
1057 let actual_cap = s.capacity();
1058
1059 let meta = MetaString::from(s);
1060 assert_eq!("", &*meta);
1061 assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1062
1063 let owned = meta.into_owned();
1064 assert_eq!(owned.capacity(), actual_cap);
1065 assert_eq!("", owned);
1066 }
1067
1068 #[test]
1069 fn empty_string_owned_zero_capacity() {
1070 // When a string has _no_ capacity, it's effectively empty, and we treat it that way.
1071 let s = String::new();
1072
1073 let meta = MetaString::from(s);
1074 assert_eq!("", &*meta);
1075 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1076 assert_eq!("", meta.into_owned());
1077 }
1078
1079 #[test]
1080 fn test_is_cheaply_cloneable() {
1081 // Owned strings are never cheap to clone.
1082 let s =
1083 String::from("big ol' stringy string that can't be inlined and lives out its bleak existence in the heap");
1084 let ms = MetaString::from(s);
1085 assert!(!ms.is_cheaply_cloneable());
1086
1087 // Empty strings are always cheap to clone.
1088 let ms = MetaString::empty();
1089 assert!(ms.is_cheaply_cloneable());
1090
1091 // Inlined strings are always cheap to clone.
1092 let s = "hello";
1093 let ms = MetaString::try_inline(s).expect("inlined string should fit");
1094 assert!(ms.is_cheaply_cloneable());
1095
1096 // Static strings are always cheap to clone.
1097 let s = "hello there, world! it's me, margaret!";
1098 let ms = MetaString::from_static(s);
1099 assert!(ms.is_cheaply_cloneable());
1100
1101 // Interned strings are always cheap to clone.
1102 let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1103 let s = interner.try_intern("hello interned str!").unwrap();
1104 let ms = MetaString::from(s);
1105 assert!(ms.is_cheaply_cloneable());
1106
1107 // Shared strings are always cheap to clone.
1108 let s = Arc::from("hello shared str!");
1109 let ms = MetaString::from(s);
1110 assert!(ms.is_cheaply_cloneable());
1111 }
1112
1113 fn arb_unicode_str_max_len(max_len: usize) -> impl Strategy<Value = String> {
1114 ".{0,23}".prop_filter("resulting string is too longer", move |s| s.len() <= max_len)
1115 }
1116
1117 proptest! {
1118 #![proptest_config(ProptestConfig::with_cases(10000))]
1119
1120 #[test]
1121 #[cfg_attr(miri, ignore)]
1122 fn property_test_inlined_string(
1123 input in arb_unicode_str_max_len(23),
1124 ) {
1125 assert!(input.len() <= 23, "input should be 23 bytes or less");
1126 let meta = MetaString::try_inline(&input).expect("input should fit");
1127
1128 if input.is_empty() {
1129 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1130 } else {
1131 assert_eq!(input, &*meta);
1132 assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
1133 }
1134 }
1135
1136 #[test]
1137 #[cfg_attr(miri, ignore)]
1138 fn property_test_owned_string(
1139 input in ".*",
1140 ) {
1141 let is_empty = input.is_empty();
1142 let meta = MetaString::from(input);
1143
1144 if is_empty {
1145 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1146 } else {
1147 assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1148 }
1149 }
1150 }
1151}