stringtheory/interning/
fixed_size.rs

#[cfg(not(feature = "loom"))]
use std::sync::{atomic::AtomicUsize, Arc, Mutex};
use std::{
    num::NonZeroUsize,
    ptr::NonNull,
    sync::atomic::Ordering::{AcqRel, Acquire},
};

#[cfg(feature = "loom")]
use loom::sync::{atomic::AtomicUsize, Arc, Mutex};

use super::{
    helpers::{aligned, aligned_string, hash_string, layout_for_data, PackedLengthCapacity},
    InternedString, Interner,
};
use crate::interning::helpers::{ReclaimedEntries, ReclaimedEntry};

const HEADER_LEN: usize = std::mem::size_of::<EntryHeader>();
const HEADER_ALIGN: usize = std::mem::align_of::<EntryHeader>();

/// The minimum possible length of an entry.
///
/// For any entry in the interner, there is already an `EntryHeader` followed by the string data itself. In order to
/// ensure that entries can be written contiguously, we additionally ensure that the number of bytes we utilize for the
/// string data is aligned at least as much as `EntryHeader` itself.
///
/// This means that the minimum possible length of an entry, or the minimum number of bytes a valid entry could consume,
/// is the length of the header plus the alignment of the header.
const MINIMUM_ENTRY_LEN: usize = HEADER_LEN + HEADER_ALIGN;
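// For reference: with the 24-byte `EntryHeader` and 8-byte alignment described in the entry layout documentation on
// `FixedSizeInterner` (typical 64-bit targets), this works out to 32 bytes; the exact value always follows from the
// `size_of`/`align_of` constants above.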

#[derive(Debug)]
pub(crate) struct StringState {
    interner: Arc<Mutex<InternerShardState>>,
    header: NonNull<EntryHeader>,
}

impl StringState {
    #[inline]
    pub const fn as_str(&self) -> &str {
        // SAFETY: We ensure `self.header` is well-aligned and points to an initialized `EntryHeader` value when creating `StringState`.
        unsafe { get_entry_string(self.header) }
    }
}

impl PartialEq for StringState {
    fn eq(&self, other: &Self) -> bool {
        self.header == other.header
    }
}

impl Clone for StringState {
    fn clone(&self) -> Self {
        // SAFETY: The caller that creates `StringState` is responsible for ensuring that `self.header` is well-aligned
        // and points to an initialized `EntryHeader` value.
        let header = unsafe { self.header.as_ref() };
        header.increment_active_refs();

        Self {
            interner: self.interner.clone(),
            header: self.header,
        }
    }
}

impl Drop for StringState {
    fn drop(&mut self) {
        // SAFETY: The caller that creates `StringState` is responsible for ensuring that `self.header` is well-aligned
        // and points to an initialized `EntryHeader` value.
        let header = unsafe { self.header.as_ref() };
        if header.decrement_active_refs() {
            // We decremented the reference count to zero, so try to mark this entry for reclamation.
            let mut interner = self.interner.lock().unwrap();
            interner.mark_for_reclamation(self.header);
        }
    }
}

// SAFETY: We don't take references to the entry header pointer that outlast `StringState`, and the only modification we
// do to the entry header is through atomic operations, so it's safe to both send and share `StringState` between
// threads.
unsafe impl Send for StringState {}
unsafe impl Sync for StringState {}

/// Metadata about an interner entry.
///
/// `EntryHeader` represents the smallest amount of information about an interned entry that is needed to support both
/// lookup of existing interned strings, as well as the ability to reclaim space in the interner when an entry is no
/// longer in use.
struct EntryHeader {
    /// The hash of the string that this entry represents.
    hash: u64,

    /// The number of active references to this entry.
    ///
    /// Only incremented by the interner itself, and decremented by `InternedString` when it is dropped.
    refs: AtomicUsize,

    /// Combined length/capacity of the entry, in terms of the string itself.
    ///
    /// Notably, this does _not_ include the length of the header itself. For example, an entry holding the string
    /// "hello, world!" has a string length of 13 bytes, but since we have to pad out to meet our alignment requirements
    /// for `EntryHeader`, we would end up with a capacity of 16 bytes. As such, `EntryHeader::len` would report `13`,
    /// while `EntryHeader::capacity` would report `16`. Likewise, `EntryHeader::entry_len` would report `40`,
    /// accounting for the string capacity (16) as well as the header length itself (24).
    ///
    /// As explained in the description of `PackedLengthCapacity`, this does mean strings can't be larger than ~4GB on
    /// 64-bit platforms, which is not a problem we have.
    len_cap: PackedLengthCapacity,
}

impl EntryHeader {
    /// Creates a tombstone entry with the given capacity.
    ///
    /// This is to allow for updating a region in the data buffer, which has been reclaimed, such that it is
    /// identifiable as being unused.
    fn tombstone(entry: ReclaimedEntry) -> Self {
        // The usable capacity for a reclaimed entry is the full capacity minus the size of `EntryHeader` itself, as
        // reclaimed entries represent the _entire_ region in the data buffer, but `EntryHeader` only cares about the
        // string portion itself.
        let cap = Self::usable_from_reclaimed(entry);

        Self {
            hash: 0,
            refs: AtomicUsize::new(0),
            len_cap: PackedLengthCapacity::new(cap, 0),
        }
    }

    /// Creates a new entry for the given string.
    fn from_string(hash: u64, s: &str) -> Self {
        // We're dictating the necessary capacity here, which is the length of the string rounded up to the next
        // multiple of the alignment of `EntryHeader`, which ensures that any subsequent entry will be properly aligned.
        let cap = aligned_string::<Self>(s);

        Self {
            hash,
            refs: AtomicUsize::new(1),
            len_cap: PackedLengthCapacity::new(cap, s.len()),
        }
    }

    /// Creates a new entry for the given string, based on the given reclaimed entry.
    ///
    /// This is used when a reclaimed entry is reused to intern a new string. If the reclaimed entry has enough excess
    /// capacity beyond what the string needs to hold another minimum-sized entry, that excess is split off and returned
    /// so the caller can keep it in the reclaimed entries list. Otherwise, the entry header absorbs the full capacity
    /// of the reclaimed entry.
    fn from_reclaimed_entry(mut entry: ReclaimedEntry, hash: u64, s: &str) -> (Self, Option<ReclaimedEntry>) {
        // The usable capacity for a reclaimed entry is the full capacity minus the size of `EntryHeader` itself, as
        // reclaimed entries represent the _entire_ region in the data buffer, but `EntryHeader` only cares about the
        // string portion itself.
        let entry_cap = EntryHeader::usable_from_reclaimed(entry);
        let required_cap = aligned_string::<Self>(s);

        // If the reclaimed entry has enough additional space beyond what we need for the string, we'll split it off and
        // return it for the caller to keep around in the reclaimed entries list.
        let remainder = entry_cap - required_cap;
        let (adjusted_cap, maybe_split_entry) = if remainder >= MINIMUM_ENTRY_LEN {
            let entry_len = EntryHeader::len_for(s);
            let split_entry = entry.split_off(entry_len);

            (entry_len - HEADER_LEN, Some(split_entry))
        } else {
            (entry_cap, None)
        };

        let header = Self {
            hash,
            refs: AtomicUsize::new(1),
            len_cap: PackedLengthCapacity::new(adjusted_cap, s.len()),
        };

        (header, maybe_split_entry)
    }

    /// Returns the computed length of a complete entry, in bytes, for the given string.
    ///
    /// This includes the size of the entry header itself and the string data, when padded for alignment, and represents
    /// the number of bytes that would be consumed in the data buffer.
    const fn len_for(s: &str) -> usize {
        HEADER_LEN + aligned_string::<Self>(s)
    }

    /// Returns the usable capacity of a reclaimed entry, in bytes.
    ///
    /// Usable refers to the number of bytes in a reclaimed entry that could be used for string data, after accounting
    /// for the size of `EntryHeader` itself.
    const fn usable_from_reclaimed(entry: ReclaimedEntry) -> usize {
        entry.capacity() - HEADER_LEN
    }

    /// Returns the total number of bytes that this entry takes up in the data buffer.
    const fn entry_len(&self) -> usize {
        HEADER_LEN + self.capacity()
    }

    /// Returns the size of the string, in bytes, that this entry can hold.
    const fn capacity(&self) -> usize {
        self.len_cap.capacity()
    }

    /// Returns the size of the string, in bytes, that this entry _actually_ holds.
    const fn len(&self) -> usize {
        self.len_cap.len()
    }

    /// Returns `true` if this entry is currently referenced.
    fn is_active(&self) -> bool {
        self.refs.load(Acquire) != 0
    }

    /// Increments the active reference count by one.
    fn increment_active_refs(&self) {
        self.refs.fetch_add(1, AcqRel);
    }

    /// Decrements the active reference count by one.
    ///
    /// Returns `true` if the active reference count is zero _after_ calling this method.
    fn decrement_active_refs(&self) -> bool {
        self.refs.fetch_sub(1, AcqRel) == 1
    }
}

#[derive(Debug)]
struct InternerShardState {
    // Direct pieces of our buffer allocation.
    ptr: NonNull<u8>,
    offset: usize,
    capacity: NonZeroUsize,

    // Number of active entries (strings) in the interner.
    entries: usize,

    // Length of all active entries, in bytes.
    //
    // This is equivalent to `self.offset` minus the total size of all reclaimed entries.
    len: usize,

    // Markers for entries that can be reused.
    reclaimed: ReclaimedEntries,
}

impl InternerShardState {
    /// Creates a new `InternerShardState` with a pre-allocated buffer that has the given capacity.
    pub fn with_capacity(capacity: NonZeroUsize) -> Self {
        assert!(
            aligned::<EntryHeader>(capacity.get()) <= isize::MAX as usize,
            "capacity would overflow isize::MAX, which violates layout constraints"
        );

        // Allocate our data buffer. This is the main backing allocation for all interned strings, and is well-aligned
        // for `EntryHeader`.
        //
        // SAFETY: `layout_for_data` ensures the layout is non-zero.
        let data_layout = layout_for_data::<EntryHeader>(capacity);
        let data_ptr = unsafe { std::alloc::alloc(data_layout) };
        let ptr = match NonNull::new(data_ptr) {
            Some(ptr) => ptr,
            None => std::alloc::handle_alloc_error(data_layout),
        };

        Self {
            ptr,
            offset: 0,
            capacity,
            entries: 0,
            len: 0,
            reclaimed: ReclaimedEntries::new(),
        }
    }

    /// Returns the total number of unused bytes that are available for interning.
    fn available(&self) -> usize {
        self.capacity.get() - self.offset
    }

    fn get_entry_ptr(&self, offset: usize) -> NonNull<EntryHeader> {
        debug_assert!(
            offset + MINIMUM_ENTRY_LEN <= self.capacity.get(),
            "offset would point to entry that cannot possibly avoid extending past end of data buffer"
        );

        // SAFETY: The caller is responsible for ensuring that `offset` is within the bounds of the data buffer, and
        // that `offset` is well-aligned for `EntryHeader`.
        let entry_ptr = unsafe { self.ptr.as_ptr().add(offset).cast::<EntryHeader>() };
        debug_assert!(entry_ptr.is_aligned(), "entry header pointer must be well-aligned");

        // SAFETY: `entry_ptr` is derived from `self.ptr`, which itself is `NonNull<u8>`, and the caller is responsible
        // for ensuring that `offset` is within the bounds of the data buffer, so we know `entry_ptr` is non-null.
        unsafe { NonNull::new_unchecked(entry_ptr) }
    }

    fn find_entry(&self, hash: u64, s: &str) -> Option<NonNull<EntryHeader>> {
        let mut offset = 0;

        while offset < self.offset {
            // Construct a pointer to the entry at `offset`, and get a reference to the header value.
            let header_ptr = self.get_entry_ptr(offset);
            let header = unsafe { header_ptr.as_ref() };

            // See if this entry is active or not. If it's active, then we'll quickly check the hash/length of the
            // string to see if this is likely to be a match for `s`.
            if header.is_active() && header.hash == hash && header.len() == s.len() {
                // As a final check, we make sure that the entry string and `s` are equal. If they are, then we
                // have an exact match and will return the entry.
                //
                // SAFETY: We know that our header is valid and initialized.
                let s_entry = unsafe { get_entry_string(header_ptr) };
                if s_entry == s {
                    // Increment the reference count for this entry so it's not prematurely reclaimed.
                    header.increment_active_refs();

                    return Some(header_ptr);
                }
            }

            // Either this was a reclaimed entry or we didn't have a match, so we move on to the next entry.
            offset += header.entry_len();
        }

        None
    }

    fn write_entry(&mut self, offset: usize, entry_header: EntryHeader, s: &str) -> NonNull<EntryHeader> {
        debug_assert_eq!(
            entry_header.len(),
            s.len(),
            "entry header length must match string length"
        );

        let entry_ptr = self.get_entry_ptr(offset);
        let entry_len = entry_header.entry_len();

        // Write the entry header.
        unsafe { entry_ptr.as_ptr().write(entry_header) };

        let s_buf = s.as_bytes();

        // Write the string.
        let entry_s_buf = unsafe {
            // Take the entry pointer and add 1, which sets our pointer to right _after_ the header.
            let entry_s_ptr = entry_ptr.as_ptr().add(1).cast::<u8>();
            std::slice::from_raw_parts_mut(entry_s_ptr, s_buf.len())
        };
        entry_s_buf.copy_from_slice(s_buf);

        // Update our internal statistics.
        self.entries += 1;
        self.len += entry_len;

        entry_ptr
    }

    fn write_to_unoccupied(&mut self, s_hash: u64, s: &str) -> NonNull<EntryHeader> {
        let entry_header = EntryHeader::from_string(s_hash, s);

        // Write the entry to the end of the data buffer.
        let entry_offset = self.offset;
        self.offset += entry_header.entry_len();

        self.write_entry(entry_offset, entry_header, s)
    }

    fn write_to_reclaimed_entry(&mut self, entry: ReclaimedEntry, s_hash: u64, s: &str) -> NonNull<EntryHeader> {
        let entry_offset = entry.offset();
        let (entry_header, maybe_split_entry) = EntryHeader::from_reclaimed_entry(entry, s_hash, s);

        // If we had enough capacity in the reclaimed entry to hold this string _and_ potentially hold another entry, we
        // split it off and store that remainder entry.
        if let Some(split_entry) = maybe_split_entry {
            self.add_reclaimed(split_entry);
        }

        // Write the entry in place of the reclaimed entry.
        self.write_entry(entry_offset, entry_header, s)
    }

    fn add_reclaimed_from_header(&mut self, header_ptr: NonNull<EntryHeader>) {
        // Get the offset of the header within the data buffer.
        //
        // SAFETY: The caller is responsible for ensuring the entry header reference belongs to this interner. If that
        // is upheld, then we know that entry header belongs to our data buffer, and that the pointer to the entry
        // header is not less than the base pointer of the data buffer, ensuring the offset is non-negative.
        let entry_offset = unsafe {
            header_ptr
                .cast::<u8>()
                .as_ptr()
                .offset_from(self.ptr.as_ptr().cast_const())
        };
        debug_assert!(entry_offset >= 0, "entry offset must be non-negative");

        let header = unsafe { header_ptr.as_ref() };

        let entry = ReclaimedEntry::new(entry_offset as usize, header.entry_len());
        self.len -= entry.capacity();
        self.add_reclaimed(entry);
    }

    fn add_reclaimed(&mut self, entry: ReclaimedEntry) {
        // Reclamation is a two-step process: first, we have to actually keep track of the reclaimed entry, which
        // potentially involves merging adjacent reclaimed entries, and then once all of that has happened, we tombstone
        // the entry (whether merged or not).
        //
        // However, if the merged reclaimed entry immediately precedes any available capacity, we can skip tombstoning
        // it, since we can just wind back `offset` to reclaim the space.
        let merged_entry = self.reclaimed.insert(entry);
        if merged_entry.offset() + merged_entry.capacity() == self.offset {
            self.offset -= merged_entry.capacity();
            self.reclaimed.remove(&merged_entry);
            return;
        }

        self.clear_reclaimed_entry(merged_entry);
    }

    fn mark_for_reclamation(&mut self, header_ptr: NonNull<EntryHeader>) {
        // See if the reference count is zero.
        //
        // Only interned string values (the frontend handle that wraps the pointer to a specific entry) can decrement
        // the reference count for their specific entry when dropped, and only `InternerShardState` -- with its access
        // mediated through a mutex -- can increment the reference count for entries. This means that if the reference
        // count is zero, then we know that nobody else is holding a reference to this entry, and no concurrent call to
        // `try_intern` could be updating the reference count, either... so it's safe to be marked as reclaimed.
        //
        // SAFETY: The caller is responsible for ensuring that `header_ptr` is well-aligned and points to an initialized
        // `EntryHeader` value.
        let header = unsafe { header_ptr.as_ref() };
        if !header.is_active() {
            self.entries -= 1;
            self.add_reclaimed_from_header(header_ptr);
        }
    }

    fn clear_reclaimed_entry(&mut self, entry: ReclaimedEntry) {
        let entry_ptr = self.get_entry_ptr(entry.offset());

        // Write the entry tombstone itself, which clears out the hash and sets the reference count to zero.
        //
        // SAFETY: We know that `entry_ptr` is valid for writes (reclaimed entries are, by definition, inactive regions
        // in the data buffer) and is well-aligned for `EntryHeader`.
        let tombstone = EntryHeader::tombstone(entry);
        let str_cap = tombstone.capacity();

        unsafe {
            entry_ptr.as_ptr().write(tombstone);
        }

        // Write a magic value to the entire string capacity for the entry. This ensures that there's a known repeating
        // value which, in the case of debugging issues, can be a signal that offsets/reclaimed entries are incorrect
        // and overlapping with active entries.
        //
        // SAFETY: Like above, the caller is responsible for ensuring that `offset` is within the bounds of the data
        // buffer, and that `offset + capacity` does not extend past the bounds of the data buffer.
        unsafe {
            // Take the entry pointer and add 1, which sets our pointer to right _after_ the header.
            let str_ptr = entry_ptr.as_ptr().add(1).cast::<u8>();
            let str_buf = std::slice::from_raw_parts_mut(str_ptr, str_cap);
            str_buf.fill(0xAA);
        }
    }

    fn try_intern(&mut self, s_hash: u64, s: &str) -> Option<NonNull<EntryHeader>> {
        // We can only intern strings with a size that fits within a packed length/capacity value, so if `s` is larger
        // than that, we can't intern it, and there can't be an existing entry for it either.
        if s.len() > PackedLengthCapacity::maximum_value() {
            return None;
        }

        // Try and find an existing entry for this string.
        if self.entries != 0 {
            if let Some(existing_entry) = self.find_entry(s_hash, s) {
                return Some(existing_entry);
            }
        }

        let required_cap = EntryHeader::len_for(s);

        // We didn't find an existing entry, so we're going to intern it.
        //
        // First, try and see if we have a reclaimed entry that can fit this string. If nothing suitable is found, or we
        // have no reclaimed entries, then we'll just try to fit it in the remaining capacity of our data buffer.
        if !self.reclaimed.is_empty() {
            let maybe_reclaimed_entry = self.reclaimed.take_if(|entry| entry.capacity() >= required_cap);
            if let Some(reclaimed_entry) = maybe_reclaimed_entry {
                return Some(self.write_to_reclaimed_entry(reclaimed_entry, s_hash, s));
            }
        }

        if required_cap <= self.available() {
            Some(self.write_to_unoccupied(s_hash, s))
        } else {
            None
        }
    }
}

impl Drop for InternerShardState {
    fn drop(&mut self) {
        // SAFETY: We allocated this buffer with the global allocator, and we're generating the same layout that was
        // used to allocate it in the first place.
        unsafe {
            std::alloc::dealloc(self.ptr.as_ptr(), layout_for_data::<EntryHeader>(self.capacity));
        }
    }
}

// SAFETY: We don't take references to the data buffer pointer that outlast `InternerShardState`, and all access to
// `InternerShardState` itself is mediated through a mutex, so we're safe to send it around and share it between
// threads.
unsafe impl Send for InternerShardState {}
unsafe impl Sync for InternerShardState {}

#[derive(Debug)]
struct InternerState<const SHARD_FACTOR: usize> {
    shards: [Arc<Mutex<InternerShardState>>; SHARD_FACTOR],
    capacity: usize,
}

impl<const SHARD_FACTOR: usize> InternerState<SHARD_FACTOR> {
    // Ensure our shard factor is a power of two at compile time, so that we can just use a mask to get the shard index.
    //
    // Note that this associated constant is only evaluated when it is referenced, which `with_capacity` does below.
    const _POWER_OF_TWO_SHARD_FACTOR: () = {
        if !SHARD_FACTOR.is_power_of_two() {
            panic!("shard factor must be a power of two")
        }
    };

    pub fn with_capacity(capacity: NonZeroUsize) -> Self {
        // Reference the compile-time check so it's actually evaluated when this type is instantiated.
        let () = Self::_POWER_OF_TWO_SHARD_FACTOR;

        let shard_capacity = match NonZeroUsize::new(capacity.get() / SHARD_FACTOR) {
            Some(shard_capacity) => shard_capacity,
            // If we can't divide the capacity evenly, we just specify a capacity of one which will force every shard to
            // upsize the capacity so that a single entry can fit, and satisfies the need for `NonZeroUsize`.
            //
            // SAFETY: One is obviously not zero.
            None => unsafe { NonZeroUsize::new_unchecked(1) },
        };

        let shards = std::iter::repeat_with(|| Arc::new(Mutex::new(InternerShardState::with_capacity(shard_capacity))))
            .take(SHARD_FACTOR)
            .collect::<Vec<_>>();
        let capacity = shards
            .iter()
            .map(|shard| {
                let shard = shard.lock().unwrap();
                shard.capacity.get()
            })
            .sum();
        let shards: [Arc<Mutex<InternerShardState>>; SHARD_FACTOR] =
            shards.try_into().expect("should not fail to convert to array");

        Self { shards, capacity }
    }

    fn is_empty(&self) -> bool {
        self.shards.iter().all(|shard| shard.lock().unwrap().entries == 0)
    }

    fn len(&self) -> usize {
        self.shards.iter().map(|shard| shard.lock().unwrap().entries).sum()
    }

    fn len_bytes(&self) -> usize {
        self.shards.iter().map(|shard| shard.lock().unwrap().offset).sum()
    }

    fn capacity_bytes(&self) -> usize {
        self.capacity
    }

    fn try_intern(&self, s: &str) -> Option<InternedString> {
        let hash = hash_string(s);
        let shard_idx = (hash as usize) & (SHARD_FACTOR - 1);

        let shard = &self.shards[shard_idx];
        intern_with_shard_and_hash(shard, hash, s)
    }
}

/// A string interner based on a single, fixed-size backing buffer with support for reclamation.
///
/// ## Overview
///
/// This interner uses a single, fixed-size backing buffer where interned strings are stored contiguously. This provides
/// bounded memory usage: the interner never allocates additional memory for new strings beyond the initial buffer, and
/// interning simply fails once the buffer is full. Since interned strings are not likely to need to live for the life of
/// the program, the interner supports reclamation. Once all references to an interned string have been dropped, the
/// storage for that string is reclaimed and can be used to hold new strings.
///
/// ## Storage layout
///
/// The backing buffer stores strings contiguously, with an entry "header" prepended to each string. The header contains
/// relevant data -- hash of the string, reference count, and length of the string -- needed to work with the entry
/// either when searching for existing entries or when using the entry itself.
///
/// The layout of an entry is as follows:
///
/// ```text
/// ┌───────────────────────── entry #1 ──────────────────────────┐ ┌─ entry #2 ─┐ ┌─ entry .. ─┐
/// ▼                                                             ▼ ▼            ▼ ▼            ▼
/// ┏━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━┓
/// ┃ str hash  │  ref cnt  │  str len  │ str data  │   padding   ┃ ┃   header   ┃ ┃   header   ┃
/// ┃ (8 bytes) │ (8 bytes) │ (8 bytes) │ (N bytes) │ (1-7 bytes) ┃ ┃  & string  ┃ ┃  & string  ┃
/// ┗━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━┛
/// ▲                                   ▲                           ▲
/// └────────── `EntryHeader` ──────────┘                           └── aligned for `EntryHeader`
///          (8 byte alignment)                                         via trailing padding
/// ```
///
/// The backing buffer is always aligned properly for `EntryHeader`, so that the first entry can be referenced
/// correctly. However, when appending additional entries to the buffer, we need to ensure that those entries also have
/// an aligned start for accessing the header. This is complicated due to the variable number of bytes for the string
/// data.
///
/// Alignment padding is added to the end of the entry to ensure that when appending the next entry, the start of the
/// entry is properly aligned for `EntryHeader`. In the worst case, up to 7 bytes could be added (and thus wasted) on
/// this alignment padding.
///
/// ## `InternedString`
///
/// The `InternedString` type is a handle to the entry header, and thus the string data, for an interned string. It is
/// designed to be small -- 8 bytes! -- and cheap to clone, as it contains an atomic reference to the entry header and a
/// reference to the interner that owns the string. It dereferences to the underlying string with relatively low
/// overhead: two pointer indirections.
///
/// When an `InternedString` is dropped, it decrements the reference count for the entry it points to. If the reference
/// count drops to zero, it will attempt to mark the entry for reclamation.
///
/// ## Reclamation
///
/// As we want to bound the memory used by the interner, but also not allow it to be filled up with strings that
/// eventually end up going entirely unused, we need a way to remove those unused strings so their underlying storage
/// can be used for new strings. This is where reclamation comes in.
///
/// When a string is interned, the entry header tracks how many active references there are to it. When that reference
/// count drops to zero, the last reference to the string attempts to mark the entry for reclamation. Assuming no other
/// reference has been taken out on the entry in the meantime, the entry gets added to a list of "reclaimed" entries.
///
/// Reclaimed entries are simple markers -- start and end position in the data buffer -- which are stored in a freelist.
/// When attempting to intern a new string, this freelist is searched to see if there's an entry large enough to fit the
/// new string, and if so, it is used.
///
/// Additionally, when entries are reclaimed, adjacent entries are merged together where possible. This helps to avoid
/// unnecessary fragmentation over time, although not as effectively as reconstructing the data buffer to re-pack
/// entries.
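///
/// ## Example
///
/// A minimal usage sketch, mirroring the unit tests below (marked `ignore` since the exact import paths for
/// `FixedSizeInterner` and the `Interner` trait depend on how this module is re-exported):
///
/// ```ignore
/// use std::num::NonZeroUsize;
///
/// // A single-shard interner backed by a 1 KiB buffer.
/// let interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(1024).unwrap());
///
/// // Interning the same string twice hands back handles pointing at the same entry.
/// let s1 = interner.try_intern("hello").expect("buffer has room");
/// let s2 = interner.try_intern("hello").expect("buffer has room");
/// assert_eq!(&*s1, "hello");
/// assert!(std::ptr::eq(&*s1 as *const str, &*s2 as *const str));
///
/// // Dropping every handle makes the entry's storage eligible for reclamation.
/// drop(s1);
/// drop(s2);
/// ```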
#[derive(Clone, Debug)]
pub struct FixedSizeInterner<const SHARD_FACTOR: usize> {
    state: Arc<InternerState<SHARD_FACTOR>>,
}

impl<const SHARD_FACTOR: usize> FixedSizeInterner<SHARD_FACTOR> {
    /// Creates a new `FixedSizeInterner` with the given capacity.
    ///
    /// The given capacity will potentially be rounded up by a small number of bytes (up to 7) in order to ensure the
    /// backing buffer is properly aligned.
    pub fn new(capacity: NonZeroUsize) -> Self {
        Self {
            state: Arc::new(InternerState::with_capacity(capacity)),
        }
    }
}

impl<const SHARD_FACTOR: usize> Interner for FixedSizeInterner<SHARD_FACTOR> {
    fn is_empty(&self) -> bool {
        self.state.is_empty()
    }

    fn len(&self) -> usize {
        self.state.len()
    }

    fn len_bytes(&self) -> usize {
        self.state.len_bytes()
    }

    fn capacity_bytes(&self) -> usize {
        self.state.capacity_bytes()
    }

    fn try_intern(&self, s: &str) -> Option<InternedString> {
        self.state.try_intern(s)
    }
}

#[inline]
const unsafe fn get_entry_string_parts(header_ptr: NonNull<EntryHeader>) -> (NonNull<u8>, usize) {
    // SAFETY: The caller is responsible for ensuring that `header_ptr` is well-aligned and points to an initialized
    // `EntryHeader` value.
    let header = header_ptr.as_ref();

    // Advance past the header and get the pointer to the string.
    //
    // SAFETY: We know that we're simply skipping over the header by advancing the pointer by one when it's still typed
    // as `*mut EntryHeader`.
    let s_ptr = header_ptr.add(1).cast::<u8>();
    (s_ptr, header.len())
}

#[inline]
const unsafe fn get_entry_string<'a>(header_ptr: NonNull<EntryHeader>) -> &'a str {
    let (s_ptr, s_len) = get_entry_string_parts(header_ptr);

    // SAFETY: We depend on `get_entry_string_parts` to give us a valid pointer and length for the string.
    std::str::from_utf8_unchecked(std::slice::from_raw_parts(s_ptr.as_ptr() as *const _, s_len))
}

fn intern_with_shard_and_hash(shard: &Arc<Mutex<InternerShardState>>, hash: u64, s: &str) -> Option<InternedString> {
    let header = {
        let mut shard = shard.lock().unwrap();
        shard.try_intern(hash, s)?
    };

    Some(InternedString::from(StringState {
        interner: Arc::clone(shard),
        header,
    }))
}

#[cfg(test)]
mod tests {
    use std::{
        collections::HashSet,
        ops::{Deref as _, RangeInclusive},
    };

    use prop::sample::Index;
    use proptest::{
        collection::{hash_set, vec as arb_vec},
        prelude::*,
    };

    use super::*;
    use crate::interning::InternedStringState;

    fn create_shard(capacity: NonZeroUsize) -> Arc<Mutex<InternerShardState>> {
        Arc::new(Mutex::new(InternerShardState::with_capacity(capacity)))
    }

    fn intern_for_shard(shard: &Arc<Mutex<InternerShardState>>, s: &str) -> Option<InternedString> {
        let hash = hash_string(s);
        intern_with_shard_and_hash(shard, hash, s)
    }

    fn shard_capacity(shard: &Arc<Mutex<InternerShardState>>) -> usize {
        shard.lock().unwrap().capacity.get()
    }

    fn shard_available(shard: &Arc<Mutex<InternerShardState>>) -> usize {
        shard.lock().unwrap().available()
    }

    fn shard_entries(shard: &Arc<Mutex<InternerShardState>>) -> usize {
        shard.lock().unwrap().entries
    }

    fn shard_reclaimed_len(shard: &Arc<Mutex<InternerShardState>>) -> usize {
        shard.lock().unwrap().reclaimed.len()
    }

    fn shard_first_reclaimed_entry(shard: &Arc<Mutex<InternerShardState>>) -> ReclaimedEntry {
        shard.lock().unwrap().reclaimed.first().unwrap()
    }

    fn get_reclaimed_entry_for_string(s: &InternedString) -> ReclaimedEntry {
        let state = match &s.state {
            InternedStringState::FixedSize(state) => state,
            _ => panic!("unexpected string state"),
        };

        let ptr = state.interner.lock().unwrap().ptr.as_ptr();
        let header = unsafe { state.header.as_ref() };
        let offset = unsafe { state.header.as_ptr().cast::<u8>().offset_from(ptr) as usize };
        ReclaimedEntry::new(offset, header.entry_len())
    }

    fn entry_len(s: &str) -> usize {
        EntryHeader::len_for(s)
    }

    fn arb_alphanum_strings(
        str_len: RangeInclusive<usize>, unique_strs: RangeInclusive<usize>,
    ) -> impl Strategy<Value = Vec<String>> {
        // Create characters between 0x20 (32) and 0x7E (126), which are all printable ASCII characters.
        let char_gen = any::<u8>().prop_map(|c| std::cmp::max(c % 127, 32));

        let str_gen = any::<usize>()
            .prop_map(move |n| std::cmp::max(n % *str_len.end(), *str_len.start()))
            .prop_flat_map(move |len| arb_vec(char_gen.clone(), len))
            // SAFETY: We know our characters are all valid UTF-8 because they're in the ASCII range.
            .prop_map(|xs| unsafe { String::from_utf8_unchecked(xs) });

        // Create a hash set, which handles the deduplication aspect for us, ensuring we have N unique strings where N
        // is within the `unique_strs` range... and then convert it to `Vec<String>` for easier consumption.
        hash_set(str_gen, unique_strs).prop_map(|unique_strs| unique_strs.into_iter().collect::<Vec<_>>())
    }

    #[test]
    fn basic() {
        let interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(1024).unwrap());

        let s1 = interner.try_intern("hello").unwrap();
        let s2 = interner.try_intern("world").unwrap();
        let s3 = interner.try_intern("hello").unwrap();

        assert_eq!(s1.deref(), "hello");
        assert_eq!(s2.deref(), "world");
        assert_eq!(s3.deref(), "hello");

        // The pointers from the interned strings should be the same, but not between the interned string and a pointer
        // to an equivalent (but not interned) string:
        assert!(std::ptr::eq(s1.deref() as *const _, s3.deref() as *const _));

        let local_hello = "hello";
        assert!(!std::ptr::eq(s1.deref() as *const _, local_hello as *const _));
    }

    #[test]
    fn try_intern_without_capacity() {
        // Big enough to fit a single "hello world!" string, but not big enough to fit two.
        let interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(64).unwrap());

        let s1 = interner.try_intern("hello world!");
        assert!(s1.is_some());

        let s2 = interner.try_intern("hello, world");
        assert!(s2.is_none());
    }

    #[test]
    fn reclaim_after_dropped() {
        let shard = create_shard(NonZeroUsize::new(1024).unwrap());

        let s1 = intern_for_shard(&shard, "hello").expect("should not fail to intern");
        let s1_entry_len = entry_len(&s1);

        assert_eq!(shard_entries(&shard), 1);
        assert_eq!(shard_available(&shard), 1024 - s1_entry_len);
        assert_eq!(shard_reclaimed_len(&shard), 0);

        // Drop the interned string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(shard_entries(&shard), 0);
        assert_eq!(shard_available(&shard), 1024);
        assert_eq!(shard_reclaimed_len(&shard), 0);
    }
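
    // A small lifecycle sanity check written against the public `Interner` trait rather than shard internals: the
    // interner should report as empty before anything is interned, non-empty while a handle is alive, and empty again
    // once the last handle is dropped and the entry is reclaimed.
    #[test]
    fn is_empty_and_len_lifecycle() {
        let interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(1024).unwrap());
        assert!(interner.is_empty());
        assert_eq!(interner.len(), 0);

        let s1 = interner.try_intern("hello").expect("should not fail to intern");
        assert!(!interner.is_empty());
        assert_eq!(interner.len(), 1);

        // Dropping the last handle decrements the reference count to zero, which reclaims the entry.
        drop(s1);
        assert!(interner.is_empty());
        assert_eq!(interner.len(), 0);
    }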

    #[test]
    fn interns_to_reclaimed_entry_with_leftover() {
        // We want to intern a string initially, which takes up almost all of the capacity, and then drop it so it gets
        // reclaimed. After that, we'll intern a much smaller string which should lead to utilizing that reclaimed
        // entry, but only a part of it. Finally, we'll intern another new string.
        //
        // The point is to demonstrate that our reclamation logic is sound in terms of allowing reclaimed entries to be
        // split while the search/insertion logic is operating.
        let shard_capacity = NonZeroUsize::new(256).unwrap();
        let shard = create_shard(shard_capacity);

        // We craft four strings such that the first two (`s_large` and `s_medium1`) will take up enough capacity that
        // `s_small` can't possibly be interned in the available capacity. We'll also craft `s_medium2` so it can fit
        // within the reclaimed entry for `s_large` but takes enough capacity that `s_small` cannot fit in the leftover
        // reclaimed entry that we split off.
        let s_large = "99 bottles of beer on the wall, 99 bottles of beer! take one down, pass it around, 98 bottles of beer on the wall!";
        let s_medium1 = "no act of kindness, no matter how small, is ever wasted";
        let s_medium2 = "if you want to go fast, go alone; if you want to go far, go together";
        let s_small = "are you there god? it's me, margaret";

        let phase1_available_capacity = shard_capacity.get() - entry_len(s_large) - entry_len(s_medium1);
        assert!(phase1_available_capacity < entry_len(s_small));
        assert!((entry_len(s_large) - entry_len(s_medium2)) < entry_len(s_small));
        assert!(entry_len(s_medium2) < entry_len(s_large));

        // Phase 1: intern our two larger strings.
        let s1 = intern_for_shard(&shard, s_large).expect("should not fail to intern");
        let s2 = intern_for_shard(&shard, s_medium1).expect("should not fail to intern");

        assert_eq!(shard_entries(&shard), 2);
        assert_eq!(shard_available(&shard), phase1_available_capacity);
        assert_eq!(shard_reclaimed_len(&shard), 0);

        // Phase 2: drop `s_large` so it gets reclaimed.
        drop(s1);
        assert_eq!(shard_entries(&shard), 1);
        assert_eq!(shard_reclaimed_len(&shard), 1);

        // Phase 3: intern `s_medium2`, which should fit in the reclaimed entry for `s_large`. This should leave a
        // small, split off reclaimed entry.
        let s3 = intern_for_shard(&shard, s_medium2).expect("should not fail to intern");

        assert_eq!(shard_entries(&shard), 2);
        assert_eq!(shard_reclaimed_len(&shard), 1);

        // Phase 4: intern `s_small`, which should not fit in the leftover reclaimed entry from `s_large` _or_ the
        // available capacity.
        let s4 = intern_for_shard(&shard, s_small);
        assert_eq!(s4, None);

        assert_eq!(shard_entries(&shard), 2);
        assert_eq!(shard_reclaimed_len(&shard), 1);

        // And make sure we can still dereference the interned strings we _do_ have left:
        assert_eq!(s2.deref(), s_medium1);
        assert_eq!(s3.deref(), s_medium2);
    }

    #[test]
    fn has_reclaimed_entries_string_fits_exactly() {
        // The shard is large enough to fit two of the identically-sized strings, but not all three. We show that when we drop
        // an entry and it is reclaimed, a string of identical size should always be able to reuse that reclaimed entry.
        const S1_VALUE: &str = "hello world!";
        const S2_VALUE: &str = "hello, world";
        const S3_VALUE: &str = "hello--world";

        let shard = create_shard(NonZeroUsize::new(80).unwrap());

        // Intern the first two strings, which should fit without issue.
        let s1 = intern_for_shard(&shard, S1_VALUE).expect("should not fail to intern");
        let s1_reclaimed_expected = get_reclaimed_entry_for_string(&s1);
        let _s2 = intern_for_shard(&shard, S2_VALUE).expect("should not fail to intern");

        assert_eq!(shard_entries(&shard), 2);
        assert_eq!(shard_reclaimed_len(&shard), 0);

        // Try to intern the third string, which should fail as we don't have the space.
        let s3 = intern_for_shard(&shard, S3_VALUE);
        assert_eq!(s3, None);

        // Drop the first string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(shard_entries(&shard), 1);
        assert_eq!(shard_reclaimed_len(&shard), 1);

        let s1_reclaimed = shard_first_reclaimed_entry(&shard);
        assert_eq!(s1_reclaimed_expected, s1_reclaimed);

        // Try again to intern the third string, which should now succeed and take over the reclaimed entry entirely
        // as the strings are identical in length.
        let _s3 = intern_for_shard(&shard, S3_VALUE).expect("should not fail to intern");

        assert_eq!(shard_entries(&shard), 2);
        assert_eq!(shard_reclaimed_len(&shard), 0);
    }

    #[test]
    fn reclaimed_entry_reuse_split_too_small() {
        // This situation is slightly contrived, but: we want to test that when there's a reclaimed entry of a certain
        // size, reusing that reclaimed entry won't lead to it being split if the resulting split entry would be too
        // "small": unable to hold another minimum-sized entry.
        //
        // We have to intern three strings to do this because we only track reclaimed entries when they're followed by
        // in-use entries, and the string we drop to create a reclaimed entry has to be big enough, but not _too_ big,
        // to hold the string we want to intern after it.
        let shard = create_shard(NonZeroUsize::new(128).unwrap());

        // Declare our strings to intern and just check some preconditions by hand.
        let s_one = "a horse, a horse, my kingdom for a horse!";
        let s_one_entry_len = entry_len(s_one);
        let s_two = "why hello there, beautiful";
        let s_two_entry_len = entry_len(s_two);
        let s_three = "real gs move in silence like lasagna";
        let s_three_entry_len = entry_len(s_three);

        assert!(s_one_entry_len <= shard_capacity(&shard));
        assert!(s_two_entry_len <= shard_capacity(&shard));
        assert!(s_one_entry_len + s_two_entry_len + s_three_entry_len > shard_capacity(&shard));
        assert!(s_one_entry_len > s_two_entry_len);
        assert!(s_three_entry_len > s_two_entry_len);
        assert!((s_one_entry_len - s_three_entry_len) < MINIMUM_ENTRY_LEN);

        // Intern the first two strings, which should fit without issue.
        let s1 = intern_for_shard(&shard, s_one).expect("should not fail to intern");
        let s1_reclaimed_expected = get_reclaimed_entry_for_string(&s1);
        let _s2 = intern_for_shard(&shard, s_two).expect("should not fail to intern");

        assert_eq!(shard_entries(&shard), 2);
        assert_eq!(shard_reclaimed_len(&shard), 0);

        // Try to intern the third string, which should fail as we don't have the space.
        let s3 = intern_for_shard(&shard, s_three);
        assert_eq!(s3, None);

        // Drop the first string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(shard_entries(&shard), 1);
        assert_eq!(shard_reclaimed_len(&shard), 1);

        let s1_reclaimed = shard_first_reclaimed_entry(&shard);
        assert_eq!(s1_reclaimed_expected, s1_reclaimed);

        // Try again to intern the third string, which should now succeed and take over the reclaimed entry, but since
        // the remainder of the reclaimed entry after taking the necessary capacity for `s_three` is not large enough
        // (`MINIMUM_ENTRY_LEN`), we shouldn't end up splitting the reclaimed entry, and instead, `s3` should consume
        // the entire reclaimed entry.
        let s3 = intern_for_shard(&shard, s_three).expect("should not fail to intern");
        let s3_reclaimed_expected = get_reclaimed_entry_for_string(&s3);

        assert_eq!(shard_entries(&shard), 2);
        assert_eq!(shard_reclaimed_len(&shard), 0);
        assert_eq!(s1_reclaimed_expected, s3_reclaimed_expected);
    }

    #[test]
    fn reclaimed_entry_adjacent_to_spare_capacity() {
        let shard = create_shard(NonZeroUsize::new(128).unwrap());

        // Intern two smallish strings that fit without issue.
        let s1 = intern_for_shard(&shard, "hello, world!").expect("should not fail to intern");
        let s2 = intern_for_shard(&shard, "cheeeeeehooooo!").expect("should not fail to intern");
        let s1_entry_len = entry_len(&s1);
        let s2_entry_len = entry_len(&s2);

        assert_eq!(shard_reclaimed_len(&shard), 0);
        assert_eq!(shard_available(&shard), 128 - s1_entry_len - s2_entry_len);

        drop(s2);
        assert_eq!(shard_reclaimed_len(&shard), 0);
        assert_eq!(shard_available(&shard), 128 - s1_entry_len);

        drop(s1);
        assert_eq!(shard_reclaimed_len(&shard), 0);
        assert_eq!(shard_available(&shard), 128);
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]
        fn property_test_entry_count_accurate(
            strs in arb_alphanum_strings(1..=128, 16..=512),
            indices in arb_vec(any::<Index>(), 1..=1000),
        ) {
            // We ask `proptest` to generate a bunch of unique strings of varying lengths (1-128 bytes, 16-512 unique
            // strings), from which we then randomly select the strings we want to intern. The goal here is to
            // potentially select the same string multiple times, to exercise the actual interning logic... but
            // practically, to ensure that when we intern a string that has already been interned, we're not
            // incrementing the entries count again.

            // Create an interner with enough capacity to hold all of the strings we intern. This is the maximum
            // string size multiplied by the number of intern attempts... plus a little constant factor per string to
            // account for the entry header.
            const ENTRY_SIZE: usize = 128 + HEADER_LEN;
            let interner = FixedSizeInterner::<1>::new(NonZeroUsize::new(ENTRY_SIZE * indices.len()).unwrap());

            // For each index, pull out the string and both track it in `unique_strs` and intern it. We hold on to the
            // interned string handle to make sure the interned string is actually kept alive, keeping the entry count
            // stable.
            let mut interned = Vec::new();
            let mut unique_strs = HashSet::new();
            for index in &indices {
                let s = index.get(&strs);
                unique_strs.insert(s);

                let s_interned = interner.try_intern(s).expect("should never fail to intern");
                interned.push(s_interned);
            }

            assert_eq!(unique_strs.len(), interner.len());
        }
    }

    #[cfg(feature = "loom")]
    #[test]
    fn concurrent_drop_and_intern() {
        fn shard_reclaimed_entries(shard: &Arc<Mutex<InternerShardState>>) -> Vec<ReclaimedEntry> {
            shard.lock().unwrap().reclaimed.iter().copied().collect()
        }

        fn do_reclaimed_entries_overlap(a: ReclaimedEntry, b: ReclaimedEntry) -> bool {
            let a_start = a.offset();
            let a_end = a.offset() + a.capacity() - 1;

            let b_start = b.offset();
            let b_end = b.offset() + b.capacity() - 1;

            // Two closed ranges overlap if each one starts no later than the other one ends.
            a_start <= b_end && b_start <= a_end
        }

        const STRING_TO_INTERN: &str = "hello, world!";

        // This test is meant to explore the thread orderings when one thread is trying to drop (and thus reclaim) the
        // last active reference to an interned string, and another thread is trying to intern that very same string.
        //
        // We accept, as a caveat, that a possible outcome is that we intern the "new" string again, even though an
        // existing entry to that string may have existed in an alternative ordering.
        loom::model(|| {
            let shard = create_shard(NonZeroUsize::new(1024).unwrap());
            let t2_shard = Arc::clone(&shard);

            // Intern the string from thread T1.
            let t1_interned_s = intern_for_shard(&shard, STRING_TO_INTERN).expect("should not fail to intern");
            assert_eq!(t1_interned_s.deref(), STRING_TO_INTERN);
            let t1_reclaimed_entry = get_reclaimed_entry_for_string(&t1_interned_s);

            // Spawn thread T2, which tries to intern the same string and returns the handle to it.
            let t2_result = loom::thread::spawn(move || {
                let interned_s = intern_for_shard(&t2_shard, STRING_TO_INTERN).expect("should not fail to intern");
                let reclaimed_entry = get_reclaimed_entry_for_string(&interned_s);

                (interned_s, reclaimed_entry)
            });

            drop(t1_interned_s);

            let (t2_interned_s, t2_reclaimed_entry) = t2_result.join().expect("should not fail to join T2");
            assert_eq!(t2_interned_s.deref(), STRING_TO_INTERN);

            // What we're checking for here is that either:
            // - there's no reclaimed entries (T2 found the existing entry for the string before T1 dropped it)
            // - there's a reclaimed entry (T2 didn't find the existing entry for the string before T1 marked it as
            //   inactive) but the reclaimed entry does _not_ overlap with the interned string from T2, meaning we
            //   didn't get confused and allow T2 to use an existing entry that T1 then later marked as reclaimed
            let reclaimed_entries = shard_reclaimed_entries(&shard);
            assert!(reclaimed_entries.len() <= 1, "should have at most one reclaimed entry");

            if !reclaimed_entries.is_empty() {
                // If we do have a reclaimed entry, it needs to match exactly with only one of the interned strings.
                let is_t1_entry = reclaimed_entries.first().unwrap() == &t1_reclaimed_entry;
                let is_t2_entry = reclaimed_entries.first().unwrap() == &t2_reclaimed_entry;

                assert!(
                    (is_t1_entry || is_t2_entry) && !(is_t1_entry && is_t2_entry),
                    "should only match one interned string"
                );

                // Additionally, we ensure that the reclaimed entry does not overlap with the other interned string.
                assert!(
                    !do_reclaimed_entries_overlap(t1_reclaimed_entry, t2_reclaimed_entry),
                    "reclaimed entry should not overlap with remaining interned string"
                );
            }
        });
    }
}