stringtheory/lib.rs
1//! Sharing-optimized strings and string interning utilities.
2//!
3//! `stringtheory` provides two main components: a sharing-optimized string type, `MetaString`, and string interning
4//! implementations (`FixedSizeInterner`, `GenericMapInterner`, etc). These components are meant to work in concert,
5//! allowing for using a single string type that can handle owned, shared, and interned strings, and providing a way to
6//! efficiently intern strings strings when possible.
7#![deny(warnings)]
8#![deny(missing_docs)]
9// We only support 64-bit little-endian platforms anyways, so there's no risk of our enum variants having their values truncated.
10#![allow(clippy::enum_clike_unportable_variant)]
11
12use std::{
13 borrow::Borrow, fmt, hash, mem::ManuallyDrop, ops::Deref, ptr::NonNull, slice::from_raw_parts,
14 str::from_utf8_unchecked, sync::Arc,
15};
16
17use serde::{Deserialize, Serialize};
18
19mod clone;
20
21pub use self::clone::CheapMetaString;
22
23pub mod interning;
24use self::interning::{InternedString, InternedStringState, Interner};
25
26const ZERO_VALUE: usize = 0;
27const TOP_MOST_BIT: usize = usize::MAX & !(isize::MAX as usize);
28const INLINED_STR_DATA_BUF_LEN: usize = std::mem::size_of::<usize>() * 3;
29const INLINED_STR_MAX_LEN: usize = INLINED_STR_DATA_BUF_LEN - 1;
30const INLINED_STR_MAX_LEN_U8: u8 = INLINED_STR_MAX_LEN as u8;
31
32const UNION_TYPE_TAG_VALUE_STATIC: u8 = get_offset_tag_value(0);
33const UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE: u8 = get_offset_tag_value(1);
34const UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP: u8 = get_offset_tag_value(2);
35const UNION_TYPE_TAG_VALUE_SHARED: u8 = get_offset_tag_value(3);
36
37const fn get_offset_tag_value(tag: u8) -> u8 {
38 const UNION_TYPE_TAG_VALUE_BASE: u8 = INLINED_STR_MAX_LEN as u8 + 1;
39
40 if tag > (u8::MAX - INLINED_STR_MAX_LEN as u8) {
41 panic!("Union type tag value must be less than 232 to fit.");
42 }
43
44 tag + UNION_TYPE_TAG_VALUE_BASE
45}
46
47const fn get_scaled_union_tag(tag: u8) -> usize {
48 // We want to shift our tag value (single byte) to make it the top most byte in a `usize`.
49 const UNION_TYPE_TAG_VALUE_SHIFT: u32 = (std::mem::size_of::<usize>() as u32 - 1) * 8;
50
51 (tag as usize) << UNION_TYPE_TAG_VALUE_SHIFT
52}
53
54// High-level invariant checks to ensure `stringtheory` isn't being used on an unsupported platform.
55#[cfg(not(all(target_pointer_width = "64", target_endian = "little")))]
56const _INVARIANTS_CHECK: () = {
57 compile_error!("`stringtheory` is only supported on 64-bit little-endian platforms.");
58};
59
60const fn is_tagged(cap: u8) -> bool {
61 cap & 0b10000000 != 0
62}
63
64const fn tag_cap(cap: usize) -> usize {
65 cap | TOP_MOST_BIT
66}
67
68const fn untag_cap(cap: usize) -> usize {
69 cap & !(TOP_MOST_BIT)
70}
71
72#[derive(Clone, Copy, Eq, PartialEq)]
73#[repr(usize)]
74enum Zero {
75 Zero = ZERO_VALUE,
76}
77
78#[derive(Clone, Copy, Debug, Eq, PartialEq)]
79#[repr(usize)]
80enum Tag {
81 Static = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_STATIC),
82 InternedFixedSize = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE),
83 InternedGenericMap = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP),
84 Shared = get_scaled_union_tag(UNION_TYPE_TAG_VALUE_SHARED),
85}
86
87#[repr(C)]
88#[derive(Clone, Copy)]
89struct EmptyUnion {
90 ptr: Zero, // Field one.
91 len: Zero, // Field two.
92 cap: Zero, // Field three.
93}
94
95impl EmptyUnion {
96 const fn new() -> Self {
97 Self {
98 ptr: Zero::Zero,
99 len: Zero::Zero,
100 cap: Zero::Zero,
101 }
102 }
103}
104
105#[repr(C)]
106#[derive(Clone, Copy)]
107struct OwnedUnion {
108 ptr: *mut u8, // Field one
109 len: usize, // Field two.
110 cap: usize, // Field three.
111}
112
113impl OwnedUnion {
114 #[inline]
115 fn as_str(&self) -> &str {
116 // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
117 // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
118 unsafe { from_utf8_unchecked(from_raw_parts(self.ptr, self.len)) }
119 }
120
121 fn into_owned(self) -> String {
122 // SAFETY: We know our pointer is valid, and non-null, since it's derived from a valid `String`, and that the
123 // data it points to is valid UTF-8, again, by virtue of it being derived from a valid `String`.
124 unsafe { String::from_raw_parts(self.ptr, self.len, untag_cap(self.cap)) }
125 }
126}
127
128#[repr(C)]
129#[derive(Clone, Copy)]
130struct StaticUnion {
131 value: &'static str, // Fields one and two.
132 _cap: Tag, // Field three.
133}
134
135impl StaticUnion {
136 #[inline]
137 const fn as_str(&self) -> &str {
138 self.value
139 }
140}
141
142#[repr(C)]
143struct InternedUnion {
144 state: InternedStateUnion, // Fields one and two.
145 _cap: Tag, // Field three.
146}
147
148impl Drop for InternedUnion {
149 fn drop(&mut self) {
150 match self._cap {
151 Tag::InternedFixedSize => {
152 let state = unsafe { ManuallyDrop::take(&mut self.state.fixed_size) };
153 drop(state);
154 }
155 Tag::InternedGenericMap => {
156 let state = unsafe { ManuallyDrop::take(&mut self.state.generic_map) };
157 drop(state);
158 }
159 _ => unreachable!(),
160 }
161 }
162}
163
164impl Clone for InternedUnion {
165 fn clone(&self) -> Self {
166 // SAFETY: We use our tag (stored in `_cap`) to determine which field to access and clone, which
167 // ensures we only access fields which are initialized.
168 let state = unsafe {
169 match self._cap {
170 Tag::InternedFixedSize => InternedStateUnion {
171 fixed_size: self.state.fixed_size.clone(),
172 },
173 Tag::InternedGenericMap => InternedStateUnion {
174 generic_map: self.state.generic_map.clone(),
175 },
176 _ => unreachable!(),
177 }
178 };
179
180 Self { state, _cap: self._cap }
181 }
182}
183
184#[repr(C)]
185union InternedStateUnion {
186 fixed_size: ManuallyDrop<self::interning::fixed_size::StringState>,
187 generic_map: ManuallyDrop<self::interning::map::StringState>,
188}
189
190#[repr(C)]
191struct SharedUnion {
192 ptr: NonNull<str>, // Fields one and two
193 _cap: Tag, // Field three.
194}
195
196impl SharedUnion {
197 #[inline]
198 const fn as_str(&self) -> &str {
199 // SAFETY: The pointee is still live by virtue of being held in an `Arc`.
200 unsafe { self.ptr.as_ref() }
201 }
202}
203
204#[repr(C)]
205#[derive(Clone, Copy)]
206struct InlinedUnion {
207 // Data is arranged as 23 bytes for string data, and the remaining 1 byte for the string length.
208 data: [u8; INLINED_STR_DATA_BUF_LEN], // Fields one, two, and three.
209}
210
211impl InlinedUnion {
212 #[inline]
213 fn as_str(&self) -> &str {
214 let len = self.data[INLINED_STR_MAX_LEN] as usize;
215
216 // SAFETY: We know our data is valid UTF-8 since we only ever derive inlined strings from a valid string
217 // reference.
218 unsafe { from_utf8_unchecked(&self.data[0..len]) }
219 }
220}
221
222#[repr(C)]
223#[derive(Clone, Copy)]
224struct DiscriminantUnion {
225 unused: [u8; INLINED_STR_DATA_BUF_LEN - 1], // Fields one, two, and three, minus the last byte.
226 tag_byte: u8, // Last byte, overlapped with the "highest" byte of field three.
227}
228
229#[derive(Debug, Eq, PartialEq)]
230enum UnionType {
231 Empty,
232 Owned,
233 Static,
234 InternedFixedSize,
235 InternedGenericMap,
236 Inlined,
237 Shared,
238}
239
240impl UnionType {
241 #[inline]
242 const fn is_owned(&self) -> bool {
243 matches!(self, UnionType::Owned)
244 }
245}
246
247impl DiscriminantUnion {
248 #[inline]
249 const fn get_union_type(&self) -> UnionType {
250 // At a high level, we encode the type of the union into the last byte of the struct, which overlaps with the
251 // capacity of owned strings, the length of an inlined string, and the unused field of other string types.
252 //
253 // Our logic is simple here:
254 //
255 // - Allocations can only ever be as large as `isize::MAX`, which means that the top-most bit of the capacity
256 // value would never be used for a valid allocation.
257 // - In turn, the length of a string can also only ever be as large as `isize::MAX`, which means that the
258 // top-most bit of the length byte for any string type would never be used
259 // - Inlined strings can only ever be up to 23 bytes long, which means that the top-most bit of the length byte
260 // for an inlined string would never be used.
261 // - Static strings and interned strings only occupy the first two fields, which means their capacity should not
262 // be used.
263 //
264 // As such, we encode the possible string types as follows:
265 //
266 // - when all fields are zero, we have an empty string
267 // - when the last byte does have the top-bit set, we have an owned string
268 // - when the last byte does _not_ have the top-bit set, and the value is less than or equal to 23, we have an
269 // inlined string
270 // - when the last byte does _not_ have the top-bit set, and the value is greater than 23, we interpret the
271 // specific value of the last byte as a discriminant for the remaining string types (static, interned, etc)
272 //
273 // We abuse the layout of little endian integers here to ensure that the upper-most bits of the capacity
274 // field overlaps with the last byte of an inlined string, and the unused field of a static/interned string.
275 //
276 // In an inlined string, its length byte is situated as the very last byte (data[23]) in its layout. In an
277 // owned string, we instead have a pointer, length, and capacity, in that order. This means that the length
278 // byte of the inlined string and the capacity field of the owned string overlap, as seen below:
279 //
280 // ~ an inlined string, "hello, world", with a length of 12 (0C) ~
281 // ┌───────────────────────────────────────────────────────────────────────────────┐
282 // │ 68 65 6C 6C 6F 20 77 6F 72 6C 64 21 ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? 0C │
283 // └───────────────────────────────────────────────────────────────────────────────┘
284 // ▲
285 // ├──── overlapping
286 // ~ an owned string with a capacity of 64 ~ ▼ byte
287 // ┌─────────────────────────┐┌─────────────────────────┐┌─────────────────────────┐
288 // │ ?? ?? ?? ?? ?? ?? ?? ?? ││ ?? ?? ?? ?? ?? ?? ?? ?? ││ 40 00 00 00 00 00 00 80 │
289 // └─────────────────────────┘└─────────────────────────┘└─────────────────────────┘
290 // ▲
291 // original capacity: 64 (0x0000000000000040) │
292 // "tagged" capacity: 9223372036854775872 (0x8000000000000040) │
293 // ▲ │
294 // (tag bit) ────────────────────────────┘ │
295 // │
296 // │
297 // inlined last byte (0x0C) [0 0 0 0 1 1 0 0] ◀────────────────┤
298 // owned last byte (0x80) [1 0 0 0 0 0 0 0] ◀────────────────┤
299 // │
300 // inlined last byte │
301 // maximum of 23 (0x17) [0 0 0 1 0 1 1 1] ◀────────────────┤
302 // │
303 // "owned" discriminant │
304 // bitmask (any X bit) [X X X ? ? ? ? ?] ◀────────────────┘
305 //
306 // As such, the length byte of an inlined string overlaps with the capacity field of an owned string.
307 // Additionally, due to the integer fields being written in little-endian order, the "upper" byte of the
308 // capacity field -- the eight highest bits -- is actually written in the same location as the length byte
309 // of an inlined string.
310 //
311 // Given that we know an inlined string cannot be any longer than 23 bytes, we know that the top-most bit in
312 // the last byte (data[23]) can never be set, as it would imply a length of _at least_ 128. With that, we
313 // utilize invariant #3 of `Inner` -- allocations can never be larger than `isize::MAX` -- which lets us
314 // safely "tag" an owned string's capacity -- setting the upper most bit to 1 -- to indicate that it's an
315 // owned string.
316 //
317 // An inlined string couldn't possibly have a length that occupied that bit, and so we know if we're down
318 // here, and our pointer field isn't all zeroes and our length field isn't all zeroes, that we _have_ to be
319 // an owned string if our capacity is tagged.
320
321 // If the top bit is set, we know we're dealing with an owned string.
322 if is_tagged(self.tag_byte) {
323 return UnionType::Owned;
324 }
325
326 // The top-most bit has to be set for an owned string, but isn't set for any other type, so try differentiating
327 // at this point.
328 match self.tag_byte {
329 // Empty string. Easy.
330 0 => UnionType::Empty,
331
332 // Anything between 1 and INLINED_STR_MAX_LEN (23, inclusive) is an inlined string.
333 1..=INLINED_STR_MAX_LEN_U8 => UnionType::Inlined,
334
335 // These are fixed values between 24 and 128, so we just match them directly.
336 UNION_TYPE_TAG_VALUE_STATIC => UnionType::Static,
337 UNION_TYPE_TAG_VALUE_INTERNED_FIXED_SIZE => UnionType::InternedFixedSize,
338 UNION_TYPE_TAG_VALUE_INTERNED_GENERIC_MAP => UnionType::InternedGenericMap,
339 UNION_TYPE_TAG_VALUE_SHARED => UnionType::Shared,
340
341 // If we haven't matched any specific type tag value, then this is something else that we don't handle or
342 // know about... which we handle as just acting like we're an empty string for simplicity.
343 _ => UnionType::Empty,
344 }
345 }
346}
347
348/// The core data structure for holding all different string variants.
349///
350/// This union has six data fields -- one for each possible string variant -- and a discriminant field, used to
351/// determine which string variant is actually present. The discriminant field interrogates the bits in each machine
352/// word field (all variants are three machine words) to determine which bit patterns are valid or not for a given
353/// variant, allowing the string variant to be authoritatively determined.
354///
355/// # Invariants
356///
357/// This code depends on a number of invariants in order to work correctly:
358///
359/// 1. Only used on 64-bit little-endian platforms. (checked at compile-time via _INVARIANTS_CHECK)
360/// 2. The data pointers for `String` and `&'static str` cannot ever be null when the strings are non-empty.
361/// 3. Allocations can never be larger than `isize::MAX` (see [here][rust_isize_alloc_limit]), meaning that any
362/// length/capacity field for a string cannot ever be larger than `isize::MAX`, implying the 64th bit (top-most bit)
363/// for length/capacity should always be 0.
364/// 4. An inlined string can only hold up to 23 bytes of data, meaning that the length byte for that string can never
365/// have a value greater than 23. (_We_ have to provide this invariant, which is handled in `Inner::try_inlined`.)
366///
367/// [rust_isize_alloc_limit]: https://doc.rust-lang.org/stable/std/alloc/struct.Layout.html#method.from_size_align
368union Inner {
369 empty: EmptyUnion,
370 owned: OwnedUnion,
371 static_: StaticUnion,
372 interned: ManuallyDrop<InternedUnion>,
373 shared: ManuallyDrop<SharedUnion>,
374 inlined: InlinedUnion,
375 discriminant: DiscriminantUnion,
376}
377
378impl Inner {
379 const fn empty() -> Self {
380 Self {
381 empty: EmptyUnion::new(),
382 }
383 }
384
385 fn owned(value: String) -> Self {
386 match value.capacity() {
387 // The string we got is empty.
388 0 => Self::empty(),
389 cap => {
390 let mut value = value.into_bytes();
391
392 let ptr = value.as_mut_ptr();
393 let len = value.len();
394
395 // We're taking ownership of the underlying string allocation so we can't let it drop.
396 std::mem::forget(value);
397
398 Self {
399 owned: OwnedUnion {
400 ptr,
401 len,
402 cap: tag_cap(cap),
403 },
404 }
405 }
406 }
407 }
408
409 const fn static_str(value: &'static str) -> Self {
410 match value.len() {
411 0 => Self::empty(),
412 _ => Self {
413 static_: StaticUnion {
414 value,
415 _cap: Tag::Static,
416 },
417 },
418 }
419 }
420
421 fn interned(value: InternedString) -> Self {
422 match value.len() {
423 0 => Self::empty(),
424 _len => {
425 let (state, tag) = match value.into_state() {
426 InternedStringState::FixedSize(fixed_size) => (
427 InternedStateUnion {
428 fixed_size: ManuallyDrop::new(fixed_size),
429 },
430 Tag::InternedFixedSize,
431 ),
432 InternedStringState::GenericMap(generic_map) => (
433 InternedStateUnion {
434 generic_map: ManuallyDrop::new(generic_map),
435 },
436 Tag::InternedGenericMap,
437 ),
438 };
439
440 Self {
441 interned: ManuallyDrop::new(InternedUnion { state, _cap: tag }),
442 }
443 }
444 }
445 }
446
447 fn shared(value: Arc<str>) -> Self {
448 match value.len() {
449 0 => Self::empty(),
450 _len => Self {
451 shared: ManuallyDrop::new(SharedUnion {
452 // SAFETY: We know `ptr` is non-null because `Arc::into_raw` is called on a valid `Arc<str>`.
453 ptr: unsafe { NonNull::new_unchecked(Arc::into_raw(value).cast_mut()) },
454 _cap: Tag::Shared,
455 }),
456 },
457 }
458 }
459
460 fn try_inlined(value: &str) -> Option<Self> {
461 match value.len() {
462 0 => Some(Self::empty()),
463 len => {
464 if len > INLINED_STR_MAX_LEN {
465 return None;
466 }
467
468 let mut data = [0; INLINED_STR_DATA_BUF_LEN];
469
470 // SAFETY: We know it fits because we just checked that the string length is 23 or less.
471 data[INLINED_STR_MAX_LEN] = len as u8;
472
473 let buf = value.as_bytes();
474 data[0..len].copy_from_slice(buf);
475
476 Some(Self {
477 inlined: InlinedUnion { data },
478 })
479 }
480 }
481 }
482
483 #[inline]
484 fn as_str(&self) -> &str {
485 let union_type = unsafe { self.discriminant.get_union_type() };
486 match union_type {
487 UnionType::Empty => "",
488 UnionType::Owned => {
489 let owned = unsafe { &self.owned };
490 owned.as_str()
491 }
492 UnionType::Static => {
493 let static_ = unsafe { &self.static_ };
494 static_.as_str()
495 }
496 UnionType::InternedFixedSize => {
497 let interned = unsafe { &self.interned.state.fixed_size };
498 interned.as_str()
499 }
500 UnionType::InternedGenericMap => {
501 let interned = unsafe { &self.interned.state.generic_map };
502 interned.as_str()
503 }
504 UnionType::Shared => {
505 let shared = unsafe { &self.shared };
506 shared.as_str()
507 }
508 UnionType::Inlined => {
509 let inlined = unsafe { &self.inlined };
510 inlined.as_str()
511 }
512 }
513 }
514
515 #[inline]
516 const fn get_union_type(&self) -> UnionType {
517 unsafe { self.discriminant.get_union_type() }
518 }
519
520 fn into_owned(mut self) -> String {
521 let union_type = unsafe { self.discriminant.get_union_type() };
522 match union_type {
523 UnionType::Empty => String::new(),
524 UnionType::Owned => {
525 // We're (`Inner`) being consumed here, but we need to update our internal state to ensure that our drop
526 // logic doesn't try to double free the string allocation.
527 let owned = unsafe { self.owned };
528 self.empty = EmptyUnion::new();
529
530 owned.into_owned()
531 }
532 UnionType::Static => {
533 let static_ = unsafe { self.static_ };
534 static_.as_str().to_owned()
535 }
536 UnionType::InternedFixedSize => {
537 let interned = unsafe { &self.interned.state.fixed_size };
538 interned.as_str().to_owned()
539 }
540 UnionType::InternedGenericMap => {
541 let interned = unsafe { &self.interned.state.generic_map };
542 interned.as_str().to_owned()
543 }
544 UnionType::Shared => {
545 let shared = unsafe { &self.shared };
546 shared.as_str().to_owned()
547 }
548 UnionType::Inlined => {
549 let inlined = unsafe { self.inlined };
550 inlined.as_str().to_owned()
551 }
552 }
553 }
554}
555
556impl Drop for Inner {
557 fn drop(&mut self) {
558 let union_type = unsafe { self.discriminant.get_union_type() };
559 match union_type {
560 UnionType::Owned => {
561 let owned = unsafe { &mut self.owned };
562
563 let ptr = owned.ptr;
564 let len = owned.len;
565 let cap = untag_cap(owned.cap);
566
567 // SAFETY: The pointer has to be non-null, because we only ever construct an owned variant when the
568 // `String` has a non-zero capacity, which implies a valid allocation, and thus a valid, non-null
569 // pointer.
570 let data = unsafe { Vec::<u8>::from_raw_parts(ptr, len, cap) };
571 drop(data);
572 }
573 UnionType::InternedFixedSize | UnionType::InternedGenericMap => {
574 let interned = unsafe { &mut self.interned };
575
576 // SAFETY: We're already dropping `Inner`, so nothing else can use the `InternedString` value after we
577 // consume it here.
578 let data = unsafe { ManuallyDrop::take(interned) };
579 drop(data);
580 }
581 UnionType::Shared => {
582 let shared = unsafe { &mut self.shared };
583
584 // Decrement the strong count before we drop, ensuring the `Arc` has a chance to clean itself up if this
585 // is the less strong reference.
586 //
587 // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`, so it's valid to decrement on. We
588 // also know the backing storage is still live because it has to be by virtue of us being here.
589 unsafe {
590 Arc::decrement_strong_count(shared.ptr.as_ptr().cast_const());
591 }
592 }
593 _ => {}
594 }
595 }
596}
597
598impl Clone for Inner {
599 fn clone(&self) -> Self {
600 let union_type = unsafe { self.discriminant.get_union_type() };
601 match union_type {
602 UnionType::Empty => Self::empty(),
603 UnionType::Owned => {
604 let owned = unsafe { self.owned };
605 let s = owned.as_str();
606
607 // We specifically try to inline here.
608 //
609 // At a high-level, when we're _given_ an owned string, we avoid inlining because we don't want to trash
610 // the underlying allocation since we may be asked to give it back later (via `MetaString::into_owned`).
611 // However, when we're _cloning_, we'd rather avoiding allocating if we can help it.
612 Self::try_inlined(s).unwrap_or_else(|| Self::owned(s.to_owned()))
613 }
614 UnionType::Static => Self {
615 static_: unsafe { self.static_ },
616 },
617 UnionType::InternedFixedSize | UnionType::InternedGenericMap => Self {
618 interned: unsafe { self.interned.clone() },
619 },
620 UnionType::Shared => {
621 let shared = unsafe { &self.shared };
622
623 // We have to increment the strong count before cloning.
624 //
625 // SAFETY: We know `shared.ptr` was obtained from `Arc::into_raw`. We also know that if we're cloning
626 // this value, that the underlying `Arc` must still be live, since we're holding a reference to it.
627 unsafe {
628 Arc::increment_strong_count(shared.ptr.as_ptr().cast_const());
629 }
630
631 Self {
632 shared: ManuallyDrop::new(SharedUnion {
633 ptr: shared.ptr,
634 _cap: Tag::Shared,
635 }),
636 }
637 }
638 UnionType::Inlined => Self {
639 inlined: unsafe { self.inlined },
640 },
641 }
642 }
643}
644
645// SAFETY: None of our union variants are tied to the original thread they were created on.
646unsafe impl Send for Inner {}
647
648// SAFETY: None of our union variants use any form of interior mutability and are thus safe to be shared between
649// threads.
650unsafe impl Sync for Inner {}
651
652/// An immutable string type that abstracts over various forms of string storage.
653///
654/// Normally, developers will work with either `String` (owned) or `&str` (borrowed) when dealing with strings. In some
655/// cases, though, it can be useful to work with strings that use alternative storage, such as those that are atomically
656/// shared (e.g. `InternedString`). While using those string types themselves isn't complex, using them and _also_
657/// supporting normal string types can be complex.
658///
659/// `MetaString` is an opinionated string type that abstracts over the normal string types like `String` and `&str`
660/// while also supporting alternative storage, such as interned strings from `FixedSizeInterner`, unifying these
661/// different variants behind a single concrete type.
662///
663/// ## Supported types
664///
665/// `MetaString` supports the following "variants":
666///
667/// - owned (`String` and non-inlineable `&str`)
668/// - static (`&'static str`)
669/// - interned (`InternedString`)
670/// - shared (`Arc<str>`)
671/// - inlined (up to 23 bytes)
672///
673/// ### Owned and borrowed strings
674///
675/// `MetaString` can be created from `String` and `&str` directly. For owned scenarios (`String`), the string value is
676/// simply wrapped. For borrowed strings, we attempt to inline them (see more below) or, if they cannot be inlined, they
677/// are copied into a new `String`.
678///
679/// ### Static strings
680///
681/// `MetaString` can be created from `&'static str` directly. This is useful for string literals and other static
682/// strings that are too large to be inlined.
683///
684/// ### Interned strings
685///
686/// `MetaString` can also be created from `InternedString`, which is a string that has been interned (using an interner
687/// like [`FixedSizeInterner`][crate::interning::FixedSizeInterner] or
688/// [`GenericMapInterner`][crate::interning::GenericMapInterner]). Interned strings are essentially a combination of the
689/// properties of `Arc<T>` -- owned wrappers around an atomically reference counted piece of data -- and a fixed-size
690/// buffer, where we allocate one large buffer, and write many small strings into it, and provide references to those
691/// strings through `InternedString`.
692///
693/// ### Shared strings
694///
695/// `MetaString` can be created from `Arc<str>`, which is a string slice that can be atomically shared between threads.
696/// This is a simpler version of interned strings where strict memory control and re-use is not required.
697///
698/// ### Inlined strings
699///
700/// Finally, `MetaString` can also be created by inlining small strings into `MetaString` itself, avoiding the need for
701/// any backing allocation. "Small string optimization" is a common optimization for string types where small strings
702/// can be stored directly in a string type itself by utilizing a "union"-style layout.
703///
704/// As `MetaString` utilizes such a layout, we can provide a small string optimization that allows for strings up to 23
705/// bytes in length.
706///
707/// ## Conversion methods
708///
709/// Implementations of `From<T>` exist for all of the aforementioned types to allow for easily converting to
710/// `MetaString`. Once a caller has a `MetaString` value, they are generally expected to interact with the string in a
711/// read-only way, as `MetaString` can be dereferenced directly to `&str`.
712///
713/// If a caller needs to be able to modify the string data, they can call `into_owned` to get an owned version of the
714/// string, make their modifications to the owned version, and then convert that back to `MetaString`.
715#[derive(Clone)]
716pub struct MetaString {
717 inner: Inner,
718}
719
720impl MetaString {
721 /// Creates an empty `MetaString`.
722 ///
723 /// This does not allocate.
724 pub const fn empty() -> Self {
725 Self { inner: Inner::empty() }
726 }
727
728 /// Creates a new `MetaString` from the given static string.
729 ///
730 /// This does not allocate.
731 pub const fn from_static(s: &'static str) -> Self {
732 Self {
733 inner: Inner::static_str(s),
734 }
735 }
736
737 /// Attempts to create a new `MetaString` from the given string if it can be inlined.
738 pub fn try_inline(s: &str) -> Option<Self> {
739 Inner::try_inlined(s).map(|inner| Self { inner })
740 }
741
742 /// Creates a new `MetaString` from the given string, using the provided interner.
743 ///
744 /// The string is inlined if possible. If it cannot be inlined, the interner is tried. If interning fails, an owned
745 /// string is allocated.
746 pub fn from_interner<I>(s: &str, interner: &I) -> Self
747 where
748 I: Interner,
749 {
750 if let Some(inlined) = Self::try_inline(s) {
751 return inlined;
752 }
753 if let Some(interned) = interner.try_intern(s) {
754 return Self::from(interned);
755 }
756 Self::from(s.to_owned())
757 }
758
759 /// Returns `true` if `self` has a length of zero bytes.
760 pub fn is_empty(&self) -> bool {
761 self.deref().is_empty()
762 }
763
764 /// Returns `true` if `self` can be cheaply cloned.
765 pub const fn is_cheaply_cloneable(&self) -> bool {
766 // If we're wrapping an owned string, cloning means cloning that allocation. All other types are cheap to clone.
767 !self.inner.get_union_type().is_owned()
768 }
769
770 /// Consumes `self` and returns an owned `String`.
771 ///
772 /// If the `MetaString` is already owned, this will simply return the inner `String` directly. Otherwise, this will
773 /// allocate an owned version (`String`) of the string data.
774 pub fn into_owned(self) -> String {
775 self.inner.into_owned()
776 }
777}
778
779impl Default for MetaString {
780 fn default() -> Self {
781 Self::empty()
782 }
783}
784
785impl hash::Hash for MetaString {
786 fn hash<H: hash::Hasher>(&self, state: &mut H) {
787 self.deref().hash(state)
788 }
789}
790
791impl PartialEq<str> for MetaString {
792 fn eq(&self, other: &str) -> bool {
793 self.deref() == other
794 }
795}
796
797impl PartialEq<&str> for MetaString {
798 fn eq(&self, other: &&str) -> bool {
799 self.deref() == *other
800 }
801}
802
803impl PartialEq<MetaString> for MetaString {
804 fn eq(&self, other: &MetaString) -> bool {
805 self.deref() == other.deref()
806 }
807}
808
809impl Eq for MetaString {}
810
811impl PartialOrd for MetaString {
812 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
813 Some(self.cmp(other))
814 }
815}
816
817impl Ord for MetaString {
818 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
819 self.deref().cmp(other.deref())
820 }
821}
822
823impl Borrow<str> for MetaString {
824 fn borrow(&self) -> &str {
825 self.deref()
826 }
827}
828
829impl Serialize for MetaString {
830 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
831 where
832 S: serde::Serializer,
833 {
834 serializer.serialize_str(self.deref())
835 }
836}
837
838impl<'de> Deserialize<'de> for MetaString {
839 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
840 where
841 D: serde::Deserializer<'de>,
842 {
843 struct MetaStringVisitor;
844
845 impl<'de> serde::de::Visitor<'de> for MetaStringVisitor {
846 type Value = MetaString;
847
848 fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
849 formatter.write_str("a string")
850 }
851
852 fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
853 where
854 E: serde::de::Error,
855 {
856 Ok(MetaString::from(v))
857 }
858 }
859
860 deserializer.deserialize_str(MetaStringVisitor)
861 }
862}
863
864impl From<String> for MetaString {
865 fn from(s: String) -> Self {
866 Self { inner: Inner::owned(s) }
867 }
868}
869
870impl From<&str> for MetaString {
871 fn from(s: &str) -> Self {
872 Self::try_inline(s).unwrap_or_else(|| Self::from(s.to_owned()))
873 }
874}
875
876impl From<InternedString> for MetaString {
877 fn from(s: InternedString) -> Self {
878 Self {
879 inner: Inner::interned(s),
880 }
881 }
882}
883
884impl From<Arc<str>> for MetaString {
885 fn from(s: Arc<str>) -> Self {
886 Self {
887 inner: Inner::shared(s),
888 }
889 }
890}
891
892impl From<MetaString> for protobuf::Chars {
893 fn from(value: MetaString) -> Self {
894 value.into_owned().into()
895 }
896}
897
898impl<'a> From<&'a MetaString> for protobuf::Chars {
899 fn from(value: &'a MetaString) -> Self {
900 // TODO: We're foregoing additional code/complexity to recover a static reference, when our string storage is
901 // static, in order to optimize the conversion to `Bytes` and then `Chars`.
902 //
903 // Static strings being written to Protocol Buffers should be decently rare across the codebase, so no biggie
904 // for now.
905 value.deref().into()
906 }
907}
908
909impl Deref for MetaString {
910 type Target = str;
911
912 fn deref(&self) -> &str {
913 self.inner.as_str()
914 }
915}
916
917impl AsRef<str> for MetaString {
918 fn as_ref(&self) -> &str {
919 self.deref()
920 }
921}
922
923impl fmt::Debug for MetaString {
924 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
925 self.deref().fmt(f)
926 }
927}
928
929impl fmt::Display for MetaString {
930 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
931 self.deref().fmt(f)
932 }
933}
934
935#[cfg(test)]
936mod tests {
937 use std::{num::NonZeroUsize, sync::Arc};
938
939 use proptest::{prelude::*, proptest};
940
941 use super::{interning::GenericMapInterner, InlinedUnion, Inner, MetaString, UnionType};
942 use crate::interning::{FixedSizeInterner, Interner as _};
943
944 #[test]
945 fn struct_sizes() {
946 // Overall, we expect `MetaString`, and thus `Inner`, to always be three machine words.
947 assert_eq!(std::mem::size_of::<MetaString>(), std::mem::size_of::<usize>() * 3);
948 assert_eq!(std::mem::size_of::<Inner>(), std::mem::size_of::<usize>() * 3);
949
950 // We also expect all of inlined union variant to be the exact size of `Inner`, which means we're properly
951 // maximizing the available space for inlining.
952 assert_eq!(std::mem::size_of::<InlinedUnion>(), std::mem::size_of::<Inner>());
953 }
954
955 #[test]
956 fn static_str_inlineable() {
957 // We always use the static variant, even if the string is inlineable, because this lets us make `from_static` const.
958 let s = "hello";
959 let meta = MetaString::from_static(s);
960
961 assert_eq!(meta.inner.get_union_type(), UnionType::Static);
962 assert_eq!(s, &*meta);
963 assert_eq!(s, meta.into_owned());
964 }
965
966 #[test]
967 fn static_str_not_inlineable() {
968 let s = "hello there, world! it's me, margaret!";
969 let meta = MetaString::from_static(s);
970
971 assert_eq!(s, &*meta);
972 assert_eq!(meta.inner.get_union_type(), UnionType::Static);
973 assert_eq!(s, meta.into_owned());
974 }
975
976 #[test]
977 fn owned_string() {
978 let s_orig = "hello";
979 let s = String::from(s_orig);
980 let meta = MetaString::from(s);
981
982 assert_eq!(s_orig, &*meta);
983 assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
984 assert_eq!(s_orig, meta.into_owned());
985 }
986
987 #[test]
988 fn inlined_string() {
989 let s = "hello";
990 let meta = MetaString::from(s);
991
992 assert_eq!(s, &*meta);
993 assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
994 assert_eq!(s, meta.into_owned());
995 }
996
997 #[test]
998 fn interned_string() {
999 let intern_str = "hello interned str!";
1000
1001 let fs_interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(1024).unwrap());
1002 let s = fs_interner.try_intern(intern_str).unwrap();
1003 assert_eq!(intern_str, &*s);
1004
1005 let meta = MetaString::from(s);
1006 assert_eq!(intern_str, &*meta);
1007 assert_eq!(meta.inner.get_union_type(), UnionType::InternedFixedSize);
1008 assert_eq!(intern_str, meta.into_owned());
1009
1010 let gm_interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1011 let s = gm_interner.try_intern(intern_str).unwrap();
1012 assert_eq!(intern_str, &*s);
1013
1014 let meta = MetaString::from(s);
1015 assert_eq!(intern_str, &*meta);
1016 assert_eq!(meta.inner.get_union_type(), UnionType::InternedGenericMap);
1017 assert_eq!(intern_str, meta.into_owned());
1018 }
1019
1020 #[test]
1021 fn shared_string() {
1022 let shared_str = "hello shared str!";
1023
1024 let s = Arc::<str>::from(shared_str);
1025 assert_eq!(shared_str, &*s);
1026
1027 let meta = MetaString::from(s);
1028 assert_eq!(shared_str, &*meta);
1029 assert_eq!(meta.inner.get_union_type(), UnionType::Shared);
1030 assert_eq!(shared_str, meta.into_owned());
1031 }
1032
1033 #[test]
1034 fn shared_string_clone() {
1035 let shared_str = "hello shared str!";
1036 let s = Arc::<str>::from(shared_str);
1037 let meta = MetaString::from(s);
1038
1039 // Clone the `MetaString` to make sure we can still access the original string.
1040 let meta2 = meta.clone();
1041 assert_eq!(shared_str, &*meta2);
1042
1043 // Drop the original `MetaString` to ensure we can still access the string from our clone after going through
1044 // the drop logic for the shared variant.
1045 drop(meta);
1046 assert_eq!(shared_str, &*meta2);
1047 }
1048
1049 #[test]
1050 fn empty_string_shared() {
1051 let shared_str = "";
1052
1053 let s = Arc::<str>::from(shared_str);
1054 assert_eq!(shared_str, &*s);
1055
1056 let meta = MetaString::from(s);
1057 assert_eq!(shared_str, &*meta);
1058 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1059 assert_eq!(shared_str, meta.into_owned());
1060 }
1061
1062 #[test]
1063 fn empty_string_interned() {
1064 let intern_str = "";
1065
1066 let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1067 let s = interner.try_intern(intern_str).unwrap();
1068 assert_eq!(intern_str, &*s);
1069
1070 let meta = MetaString::from(s);
1071 assert_eq!(intern_str, &*meta);
1072 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1073 assert_eq!(intern_str, meta.into_owned());
1074 }
1075
1076 #[test]
1077 fn empty_string_static() {
1078 let s = "";
1079
1080 let meta = MetaString::from_static(s);
1081 assert_eq!(s, &*meta);
1082 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1083 assert_eq!(s, meta.into_owned());
1084 }
1085
1086 #[test]
1087 fn empty_string_inlined() {
1088 let s = "";
1089
1090 let meta = MetaString::try_inline(s).expect("empty string definitely 'fits'");
1091 assert_eq!(s, &*meta);
1092 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1093 assert_eq!(s, meta.into_owned());
1094 }
1095
1096 #[test]
1097 fn empty_string_owned() {
1098 // When a string has capacity, we don't care if it's actually empty or not, because we want to preserve the
1099 // allocation... so our string here is empty but _does_ have capacity.
1100 let s = String::with_capacity(32);
1101 let actual_cap = s.capacity();
1102
1103 let meta = MetaString::from(s);
1104 assert_eq!("", &*meta);
1105 assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1106
1107 let owned = meta.into_owned();
1108 assert_eq!(owned.capacity(), actual_cap);
1109 assert_eq!("", owned);
1110 }
1111
1112 #[test]
1113 fn empty_string_owned_zero_capacity() {
1114 // When a string has _no_ capacity, it's effectively empty, and we treat it that way.
1115 let s = String::new();
1116
1117 let meta = MetaString::from(s);
1118 assert_eq!("", &*meta);
1119 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1120 assert_eq!("", meta.into_owned());
1121 }
1122
1123 #[test]
1124 fn test_is_cheaply_cloneable() {
1125 // Owned strings are never cheap to clone.
1126 let s =
1127 String::from("big ol' stringy string that can't be inlined and lives out its bleak existence in the heap");
1128 let ms = MetaString::from(s);
1129 assert!(!ms.is_cheaply_cloneable());
1130
1131 // Empty strings are always cheap to clone.
1132 let ms = MetaString::empty();
1133 assert!(ms.is_cheaply_cloneable());
1134
1135 // Inlined strings are always cheap to clone.
1136 let s = "hello";
1137 let ms = MetaString::try_inline(s).expect("inlined string should fit");
1138 assert!(ms.is_cheaply_cloneable());
1139
1140 // Static strings are always cheap to clone.
1141 let s = "hello there, world! it's me, margaret!";
1142 let ms = MetaString::from_static(s);
1143 assert!(ms.is_cheaply_cloneable());
1144
1145 // Interned strings are always cheap to clone.
1146 let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
1147 let s = interner.try_intern("hello interned str!").unwrap();
1148 let ms = MetaString::from(s);
1149 assert!(ms.is_cheaply_cloneable());
1150
1151 // Shared strings are always cheap to clone.
1152 let s = Arc::from("hello shared str!");
1153 let ms = MetaString::from(s);
1154 assert!(ms.is_cheaply_cloneable());
1155 }
1156
1157 fn arb_unicode_str_max_len(max_len: usize) -> impl Strategy<Value = String> {
1158 ".{0,23}".prop_filter("resulting string is too longer", move |s| s.len() <= max_len)
1159 }
1160
1161 proptest! {
1162 #![proptest_config(ProptestConfig::with_cases(10000))]
1163
1164 #[test]
1165 #[cfg_attr(miri, ignore)]
1166 fn property_test_inlined_string(
1167 input in arb_unicode_str_max_len(23),
1168 ) {
1169 assert!(input.len() <= 23, "input should be 23 bytes or less");
1170 let meta = MetaString::try_inline(&input).expect("input should fit");
1171
1172 if input.is_empty() {
1173 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1174 } else {
1175 assert_eq!(input, &*meta);
1176 assert_eq!(meta.inner.get_union_type(), UnionType::Inlined);
1177 }
1178 }
1179
1180 #[test]
1181 #[cfg_attr(miri, ignore)]
1182 fn property_test_owned_string(
1183 input in ".*",
1184 ) {
1185 let is_empty = input.is_empty();
1186 let meta = MetaString::from(input);
1187
1188 if is_empty {
1189 assert_eq!(meta.inner.get_union_type(), UnionType::Empty);
1190 } else {
1191 assert_eq!(meta.inner.get_union_type(), UnionType::Owned);
1192 }
1193 }
1194 }
1195}