stringtheory/interning/map.rs

#[cfg(not(feature = "loom"))]
use std::sync::{
    atomic::{
        AtomicUsize,
        Ordering::{AcqRel, Acquire},
    },
    Arc, Mutex,
};
use std::{collections::HashMap, num::NonZeroUsize, ptr::NonNull};

#[cfg(feature = "loom")]
use loom::sync::{
    atomic::{
        AtomicUsize,
        Ordering::{AcqRel, Acquire},
    },
    Arc, Mutex,
};

use super::{
    helpers::{layout_for_data, PackedLengthCapacity},
    InternedString, Interner,
};
use crate::interning::helpers::{aligned_string, ReclaimedEntries, ReclaimedEntry};

const HEADER_LEN: usize = std::mem::size_of::<EntryHeader>();
const HEADER_ALIGN: usize = std::mem::align_of::<EntryHeader>();

/// The minimum possible length of an entry.
///
/// For any entry in the interner, there is already an `EntryHeader` followed by the string data itself. In order to
/// ensure that entries can be written contiguously, we additionally ensure that the number of bytes we utilize for the
/// string data is aligned at least as much as `EntryHeader` itself.
///
/// This means that the minimum possible length of an entry, or the minimum number of bytes a valid entry could consume,
/// is the length of the header plus the alignment of the header.
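///
/// As a concrete sketch (assuming `EntryHeader` is 16 bytes with 8-byte alignment, per its two `usize`-sized fields
/// below), the minimum entry length works out to 24 bytes: a 16-byte header plus an 8-byte aligned string region.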
const MINIMUM_ENTRY_LEN: usize = HEADER_LEN + HEADER_ALIGN;

#[derive(Debug)]
pub(crate) struct StringState {
    interner: Arc<Mutex<InternerState>>,
    header: NonNull<EntryHeader>,
}

impl StringState {
    #[inline]
    pub const fn as_str(&self) -> &str {
        // SAFETY: We ensure `self.header` is well-aligned and points to an initialized `EntryHeader` value when creating `StringState`.
        unsafe { get_entry_string(self.header) }
    }
}

impl PartialEq for StringState {
    fn eq(&self, other: &Self) -> bool {
        self.header == other.header
    }
}

impl Clone for StringState {
    fn clone(&self) -> Self {
        // SAFETY: The caller that creates `StringState` is responsible for ensuring that `self.header` is well-aligned
        // and points to an initialized `EntryHeader` value.
        let header = unsafe { self.header.as_ref() };
        header.increment_active_refs();

        Self {
            interner: self.interner.clone(),
            header: self.header,
        }
    }
}

impl Drop for StringState {
    fn drop(&mut self) {
        // SAFETY: The caller that creates `StringState` is responsible for ensuring that `self.header` is well-aligned
        // and points to an initialized `EntryHeader` value.
        let header = unsafe { self.header.as_ref() };
        if header.decrement_active_refs() {
            // We decremented the reference count to zero, so try to mark this entry for reclamation.
            let mut interner = self.interner.lock().unwrap();
            interner.mark_for_reclamation(self.header);
        }
    }
}

/// Metadata about an interner entry.
///
/// `EntryHeader` represents the smallest amount of information about an interned entry that is needed to support both
/// lookup of existing interned strings, as well as the ability to reclaim space in the interner when an entry is no
/// longer in use.
struct EntryHeader {
    /// The number of active references to this entry.
    ///
    /// Incremented whenever a new reference is handed out -- by the interner itself and by cloning `StringState` --
    /// and decremented when a `StringState` is dropped.
    refs: AtomicUsize,

    /// Combined length/capacity of the entry, in terms of the string itself.
    ///
    /// Notably, this does _not_ include the length of the header itself. For example, an entry holding the string
    /// "hello, world!" has a string length of 13 bytes, but since we have to pad out to meet our alignment requirements
    /// for `EntryHeader`, we would end up with a capacity of 16 bytes. As such, `EntryHeader::len` would report `13`,
    /// while `EntryHeader::capacity` would report `16`. Likewise, `EntryHeader::entry_len` would report `32`,
    /// accounting for the string capacity (16) as well as the header length itself (16).
    ///
    /// As explained in the description of `PackedLengthCapacity`, this does mean strings can't be larger than ~4GB on
    /// 64-bit platforms, which is not a problem we have.
    len_cap: PackedLengthCapacity,
}

impl EntryHeader {
    /// Creates a tombstone entry with the given capacity.
    ///
    /// This is to allow for updating a region in the data buffer, which has been reclaimed, such that it is
    /// identifiable as being unused.
    fn tombstone(entry: ReclaimedEntry) -> Self {
        // The usable capacity for a reclaimed entry is the full capacity minus the size of `EntryHeader` itself, as
        // reclaimed entries represent the _entire_ region in the data buffer, but `EntryHeader` only cares about the
        // string portion itself.
        let str_cap = EntryHeader::usable_from_reclaimed(entry);

        Self {
            refs: AtomicUsize::new(0),
            len_cap: PackedLengthCapacity::new(str_cap, 0),
        }
    }

    /// Creates a new entry for the given string.
    fn from_string(s: &str) -> Self {
        // We're dictating the necessary capacity here, which is the length of the string rounded up to the nearest
        // multiple of the alignment of `EntryHeader`, which ensures that any subsequent entry will be properly aligned.
        let str_cap = aligned_string::<Self>(s);

        Self {
            refs: AtomicUsize::new(1),
            len_cap: PackedLengthCapacity::new(str_cap, s.len()),
        }
    }

    /// Creates a new entry for the given string, based on the given reclaimed entry.
    ///
    /// This maps the entry header to the underlying capacity of the given reclaimed entry. If the reclaimed entry has
    /// more capacity than the string being interned needs, the excess is split off into a new reclaimed entry when it
    /// is large enough to hold another entry; otherwise, the header absorbs the full capacity of the reclaimed entry.
    fn from_reclaimed_entry(mut entry: ReclaimedEntry, s: &str) -> (Self, Option<ReclaimedEntry>) {
        // The usable capacity for a reclaimed entry is the full capacity minus the size of `EntryHeader` itself, as
        // reclaimed entries represent the _entire_ region in the data buffer, but `EntryHeader` only cares about the
        // string portion itself.
        let entry_str_cap = EntryHeader::usable_from_reclaimed(entry);
        let required_str_cap = aligned_string::<Self>(s);

        // If the reclaimed entry has enough additional space beyond what we need for the string, we'll split it off and
        // return it for the caller to keep around in the reclaimed entries list.
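        //
        // As a worked example (a sketch, assuming a 16-byte `EntryHeader` with 8-byte alignment): reclaiming a
        // 64-byte region gives an `entry_str_cap` of 48. Interning a 13-byte string needs a `required_str_cap` of 16,
        // leaving a remainder of 32, which clears `MINIMUM_ENTRY_LEN` (24), so we split off the trailing 32 bytes as
        // a new reclaimed entry and keep an adjusted string capacity of 16 for this entry.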
        let remainder = entry_str_cap - required_str_cap;
        let (adjusted_str_cap, maybe_split_entry) = if remainder >= MINIMUM_ENTRY_LEN {
            let entry_len = EntryHeader::len_for(s);
            let split_entry = entry.split_off(entry_len);

            (entry_len - HEADER_LEN, Some(split_entry))
        } else {
            (entry_str_cap, None)
        };

        let header = Self {
            refs: AtomicUsize::new(1),
            len_cap: PackedLengthCapacity::new(adjusted_str_cap, s.len()),
        };

        (header, maybe_split_entry)
    }

    /// Returns the computed length of a complete entry, in bytes, for the given string.
    ///
    /// This includes the size of the entry header itself and the string data, when padded for alignment, and represents
    /// the number of bytes that would be consumed in the data buffer.
    const fn len_for(s: &str) -> usize {
        HEADER_LEN + aligned_string::<Self>(s)
    }

    /// Returns the usable capacity of a reclaimed entry, in bytes.
    ///
    /// Usable refers to the number of bytes in a reclaimed entry that could be used for string data, after accounting
    /// for the size of `EntryHeader` itself.
    const fn usable_from_reclaimed(entry: ReclaimedEntry) -> usize {
        entry.capacity() - HEADER_LEN
    }

    /// Returns the size of the string, in bytes, that this entry can hold.
    const fn capacity(&self) -> usize {
        self.len_cap.capacity()
    }

    /// Returns the size of the string, in bytes, that this entry _actually_ holds.
    const fn len(&self) -> usize {
        self.len_cap.len()
    }

    /// Returns the total length of the entry, in bytes.
    ///
    /// This includes the length of the header in addition to the string data.
    const fn entry_len(&self) -> usize {
        HEADER_LEN + self.capacity()
    }

    /// Returns `true` if this entry is currently referenced.
    fn is_active(&self) -> bool {
        self.refs.load(Acquire) != 0
    }

    /// Increments the active reference count by one.
    fn increment_active_refs(&self) {
        self.refs.fetch_add(1, AcqRel);
    }

    /// Decrements the active reference count by one.
    ///
    /// Returns `true` if the active reference count is zero _after_ calling this method.
    fn decrement_active_refs(&self) -> bool {
        self.refs.fetch_sub(1, AcqRel) == 1
    }
}

// SAFETY: We don't take references to the entry header pointer that outlast `StringState`, and the only modification we
// do to the entry header is through atomic operations, so it's safe to both send and share `StringState` between
// threads.
unsafe impl Send for StringState {}
unsafe impl Sync for StringState {}

#[derive(Debug)]
struct InternerStorage {
    // Direct pieces of our buffer allocation.
    ptr: NonNull<u8>,
    offset: usize,
    capacity: NonZeroUsize,

    // Length of all active entries, in bytes.
    //
    // This is equivalent to `self.offset` minus the total size of all reclaimed entries.
    len: usize,

    // Markers for entries that can be reused.
    reclaimed: ReclaimedEntries,
}

impl InternerStorage {
    fn with_capacity(capacity: NonZeroUsize) -> Self {
        assert!(
            capacity.get() <= isize::MAX as usize,
            "capacity would overflow isize::MAX, which violates layout constraints"
        );

        // Allocate our data buffer. This is the main backing allocation for all interned strings, and is well-aligned
        // for `EntryHeader`.
        //
        // SAFETY: `layout_for_data` ensures the layout is non-zero.
        let data_layout = layout_for_data::<EntryHeader>(capacity);
        let data_ptr = unsafe { std::alloc::alloc(data_layout) };
        let ptr = match NonNull::new(data_ptr) {
            Some(ptr) => ptr,
            None => std::alloc::handle_alloc_error(data_layout),
        };

        Self {
            ptr,
            offset: 0,
            capacity,
            len: 0,
            reclaimed: ReclaimedEntries::new(),
        }
    }

    #[cfg(test)]
    /// Returns the total number of unused bytes that are available for interning.
    fn available(&self) -> usize {
        self.capacity.get() - self.len
    }

    /// Returns the total number of bytes of contiguous, unoccupied space at the end of the data buffer.
    fn available_unoccupied(&self) -> usize {
        self.capacity.get() - self.offset
    }

    fn get_entry_ptr(&self, offset: usize) -> NonNull<EntryHeader> {
        debug_assert!(
            offset + MINIMUM_ENTRY_LEN <= self.capacity.get(),
            "offset would point to entry that cannot possibly avoid extending past end of data buffer"
        );

        // SAFETY: The caller is responsible for ensuring that `offset` is within the bounds of the data buffer, and
        // that `offset` is well-aligned for `EntryHeader`.
        let entry_ptr = unsafe { self.ptr.as_ptr().add(offset).cast::<EntryHeader>() };
        debug_assert!(entry_ptr.is_aligned(), "entry header pointer must be well-aligned");

        // SAFETY: `entry_ptr` is derived from `self.ptr`, which itself is `NonNull<u8>`, and the caller is responsible
        // for ensuring that `offset` is within the bounds of the data buffer, so we know `entry_ptr` is non-null.
        unsafe { NonNull::new_unchecked(entry_ptr) }
    }

    fn write_entry(&mut self, offset: usize, entry_header: EntryHeader, s: &str) -> NonNull<EntryHeader> {
        debug_assert_eq!(
            entry_header.len(),
            s.len(),
            "entry header length must match string length"
        );

        let entry_ptr = self.get_entry_ptr(offset);
        let entry_len = entry_header.entry_len();

        // Write the entry header.
        unsafe { entry_ptr.as_ptr().write(entry_header) };

        let s_buf = s.as_bytes();

        // Write the string.
        let entry_s_buf = unsafe {
            // Take the entry pointer and add 1, which sets our pointer to right _after_ the header.
            let entry_s_ptr = entry_ptr.as_ptr().add(1).cast::<u8>();
            std::slice::from_raw_parts_mut(entry_s_ptr, s_buf.len())
        };
        entry_s_buf.copy_from_slice(s_buf);

        // Update our internal statistics.
        self.len += entry_len;

        entry_ptr
    }

    fn write_to_unoccupied(&mut self, s: &str) -> (usize, NonNull<EntryHeader>) {
        let entry_header = EntryHeader::from_string(s);

        // Write the entry to the end of the data buffer.
        let entry_offset = self.offset;
        self.offset += entry_header.entry_len();

        (entry_offset, self.write_entry(entry_offset, entry_header, s))
    }

    fn write_to_reclaimed_entry(&mut self, entry: ReclaimedEntry, s: &str) -> (usize, NonNull<EntryHeader>) {
        let entry_offset = entry.offset();
        let (entry_header, maybe_split_entry) = EntryHeader::from_reclaimed_entry(entry, s);

        // If we had enough capacity in the reclaimed entry to hold this string _and_ potentially hold another entry, we
        // split it off and store that remainder entry.
        if let Some(split_entry) = maybe_split_entry {
            self.add_reclaimed(split_entry);
        }

        // Write the entry in place of the reclaimed entry.
        (entry_offset, self.write_entry(entry_offset, entry_header, s))
    }

    fn add_reclaimed(&mut self, entry: ReclaimedEntry) {
        // Reclamation is a two-step process: first, we have to actually keep track of the reclaimed entry, which
        // potentially involves merging adjacent reclaimed entries, and then once all of that has happened, we tombstone
        // the entry (whether merged or not).
        //
        // However, if the merged reclaimed entry immediately precedes any available capacity, we can skip tombstoning
        // it, since we can just wind back `offset` to reclaim the space.
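        //
        // For example (a sketch, assuming a 128-byte buffer with `offset == 96`): reclaiming the entry at offset 64
        // with capacity 32 yields a merged entry ending at 96, which abuts the unoccupied tail, so we simply wind
        // `offset` back to 64 instead of tombstoning.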
        let merged_entry = self.reclaimed.insert(entry);
        if merged_entry.offset() + merged_entry.capacity() == self.offset {
            self.offset -= merged_entry.capacity();
            self.reclaimed.remove(&merged_entry);
            return;
        }

        self.clear_reclaimed_entry(merged_entry);
    }

    fn clear_reclaimed_entry(&mut self, entry: ReclaimedEntry) {
        let entry_ptr = self.get_entry_ptr(entry.offset);

        // Write the entry tombstone itself, which sets the reference count to zero and records the reclaimed capacity.
        //
        // SAFETY: We know that `entry_ptr` is valid for writes (reclaimed entries are, by definition, inactive regions
        // in the data buffer) and is well-aligned for `EntryHeader`.
        let tombstone = EntryHeader::tombstone(entry);
        let str_cap = tombstone.capacity();

        unsafe {
            entry_ptr.as_ptr().write(tombstone);
        }

        // Write a magic value to the entire string capacity for the entry. This ensures that there's a known repeating
        // value which, in the case of debugging issues, can be a signal that offsets/reclaimed entries are incorrect
        // and overlapping with active entries.
        //
        // SAFETY: Like above, the caller is responsible for ensuring that `offset` is within the bounds of the data
        // buffer, and that `offset + capacity` does not extend past the bounds of the data buffer.
        unsafe {
            // Take the entry pointer and add 1, which sets our pointer to right _after_ the header.
            let str_ptr = entry_ptr.as_ptr().add(1).cast::<u8>();
            let str_buf = std::slice::from_raw_parts_mut(str_ptr, str_cap);
            str_buf.fill(0x21);
        }
    }

    fn mark_for_reclamation(&mut self, header_ptr: NonNull<EntryHeader>) {
        // Get the offset of the header within the data buffer.
        //
        // SAFETY: The caller is responsible for ensuring the entry header reference belongs to this interner. If that
        // is upheld, then we know that the entry header belongs to our data buffer, and that the pointer to the entry
        // header is not less than the base pointer of the data buffer, ensuring the offset is non-negative.
        let entry_offset = unsafe {
            header_ptr
                .cast::<u8>()
                .as_ptr()
                .offset_from(self.ptr.as_ptr().cast_const())
        };
        debug_assert!(entry_offset >= 0, "entry offset must be non-negative");

        let header = unsafe { header_ptr.as_ref() };
        let entry_len = header.entry_len();

        let entry = ReclaimedEntry::new(entry_offset as usize, entry_len);
        self.len -= entry.capacity();
        self.add_reclaimed(entry);
    }
}

impl Drop for InternerStorage {
    fn drop(&mut self) {
        // SAFETY: We allocated this buffer with the global allocator, and we're generating the same layout that was
        // used to allocate it in the first place.
        unsafe {
            std::alloc::dealloc(self.ptr.as_ptr(), layout_for_data::<EntryHeader>(self.capacity));
        }
    }
}

#[derive(Debug)]
struct InternerState {
    // Backing storage for the interned strings.
    storage: InternerStorage,

    // Active entries in the interner.
    entries: HashMap<&'static str, usize>,
}

impl InternerState {
    /// Creates a new `InternerState` with a pre-allocated buffer that has the given capacity.
    pub fn with_capacity(capacity: NonZeroUsize) -> Self {
        Self {
            storage: InternerStorage::with_capacity(capacity),
            entries: HashMap::new(),
        }
    }

    fn mark_for_reclamation(&mut self, header_ptr: NonNull<EntryHeader>) {
        // See if the reference count is zero.
        //
        // Only interned string values (the frontend handle that wraps the pointer to a specific entry) can decrement
        // the reference count for their specific entry when dropped, and new references can only be created by cloning
        // an existing handle (which requires the count to already be nonzero) or by `InternerState` itself -- with its
        // access mediated through a mutex. This means that if the reference count is zero while we hold the lock, then
        // nobody else is holding a reference to this entry, and no concurrent clone or call to `try_intern` could be
        // updating the reference count, either... so it's safe to be marked as reclaimed.
        //
        // SAFETY: The caller is responsible for ensuring that `header_ptr` is well-aligned and points to an initialized
        // `EntryHeader` value.
        let header = unsafe { header_ptr.as_ref() };
        if !header.is_active() {
            // Remove the entry from the entries map first before we reclaim the entry, since doing so overwrites the
            // entry data in the data buffer.
            //
            // SAFETY: The caller is responsible for ensuring that they've given us a pointer to an `EntryHeader` that
            // was acquired from this interner.
            let entry_str = unsafe { get_entry_string(header_ptr) };
            self.entries.remove(entry_str);

            self.storage.mark_for_reclamation(header_ptr);
        }
    }

    fn try_intern(&mut self, s: &str) -> Option<NonNull<EntryHeader>> {
        // We can only intern strings with a size that fits within a packed length/capacity value, so if `s` is larger
        // than that, we can't intern it, and there's no existing entry we could find for it either.
        if s.len() > PackedLengthCapacity::maximum_value() {
            return None;
        }

        // Try and find an existing entry for this string.
        if let Some(entry_offset) = self.entries.get(s) {
            let entry_ptr = self.storage.get_entry_ptr(*entry_offset);

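            // SAFETY: Offsets stored in the entries map always refer to live, initialized `EntryHeader` values in the
            // data buffer, and `get_entry_ptr` ensures the pointer is well-aligned.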
            let header = unsafe { entry_ptr.as_ref() };
            header.increment_active_refs();

            return Some(entry_ptr);
        }

        let required_cap = EntryHeader::len_for(s);

        // We didn't find an existing entry, so we're going to intern it.
        //
        // First, try and see if we have a reclaimed entry that can fit this string. If nothing suitable is found, or we
        // have no reclaimed entries, then we'll just try to fit it in the remaining capacity of our data buffer.
        let maybe_reclaimed_entry = self.storage.reclaimed.take_if(|entry| entry.capacity() >= required_cap);
        let (entry_offset, entry_ptr) = if let Some(reclaimed_entry) = maybe_reclaimed_entry {
            self.storage.write_to_reclaimed_entry(reclaimed_entry, s)
        } else if required_cap <= self.storage.available_unoccupied() {
            self.storage.write_to_unoccupied(s)
        } else {
            // We don't have enough space to intern this string at all, so we'll just return `None`.
            return None;
        };

        // SAFETY: Callers of `get_entry_string` are responsible for ensuring that the chosen lifetime of the string is
        // valid with respect to ensuring that the underlying entry lives long enough, and that the string data is valid
        // UTF-8 for as long as the reference is live.
        //
        // We're creating a `'static` reference here, which is generally frowned upon when the data is in fact not
        // `'static`, but this is safe because we never leak this lifetime outside of the interner, and we ensure that
        // we don't actually keep this reference around longer than the entry itself, as we only need a `'static`
        // reference to use it as the key in the entries map.
        let entry_str = unsafe { get_entry_string(entry_ptr) };
        self.entries.insert(entry_str, entry_offset);

        Some(entry_ptr)
    }
}

// SAFETY: We don't take references to the data buffer pointer that outlast `InternerState`, and all access to
// `InternerState` itself is mediated through a mutex, so we're safe to send it around and share it between threads.
unsafe impl Send for InternerState {}
unsafe impl Sync for InternerState {}

/// A string interner based on a single, fixed-size backing buffer with support for reclamation.
///
/// ## Overview
///
/// This interner uses a single, fixed-size backing buffer where interned strings are stored contiguously. This provides
/// bounded memory usage, and the interner will not allocate additional memory for new strings once the buffer is full.
/// Since interned strings are not likely to need to live for the life of the program, the interner supports
/// reclamation. Once all references to an interned string have been dropped, the storage for that string is reclaimed
/// and can be used to hold new strings.
///
/// ## Storage layout
///
/// The backing buffer stores strings contiguously, with an entry "header" prepended to each string. The header contains
/// the relevant data -- the reference count and the packed length/capacity of the string -- needed to work with the
/// entry either when searching for existing entries or when using the entry itself.
///
/// The layout of an entry is as follows:
///
/// ```text
/// ┌──────────────────── entry #1 ───────────────────┐ ┌─ entry #2 ─┐ ┌─ entry .. ─┐
/// ▼                                                  ▼ ▼            ▼ ▼            ▼
/// ┏━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━┓
/// ┃  ref cnt  │  len/cap  │ str data  │   padding   ┃ ┃   header   ┃ ┃   header   ┃
/// ┃ (8 bytes) │ (8 bytes) │ (N bytes) │ (0-7 bytes) ┃ ┃  & string  ┃ ┃  & string  ┃
/// ┗━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━┛
/// ▲                       ▲                           ▲
/// └──── `EntryHeader` ────┘                           └── aligned for `EntryHeader`
///     (8 byte alignment)                                  via trailing padding
/// ```
///
/// The backing buffer is always aligned properly for `EntryHeader`, so that the first entry can be referenced
/// correctly. However, when appending additional entries to the buffer, we need to ensure that those entries also have
/// an aligned start for accessing the header. This is complicated due to the variable number of bytes for the string
/// data.
///
/// Alignment padding is added to the end of the entry to ensure that when appending the next entry, the start of the
/// entry is properly aligned for `EntryHeader`. In the worst case, up to 7 bytes could be added (and thus wasted) on
/// this alignment padding.
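///
/// For example, a 13-byte string occupies 16 bytes of string capacity: 13 bytes of data followed by 3 bytes of
/// trailing padding, so that the next entry's header lands on an 8-byte boundary.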
///
/// ## `InternedString`
///
/// The `InternedString` type is a handle to the entry header, and thus the string data, for an interned string. It is
/// designed to be small -- two pointers wide -- and cheap to clone, as it holds only a pointer to the entry header and
/// an `Arc` reference to the interner that owns the string. It dereferences to the underlying string with relatively
/// low overhead: two pointer indirections.
///
/// When an `InternedString` is dropped, it decrements the reference count for the entry it points to. If the reference
/// count drops to zero, it will attempt to mark the entry for reclamation.
///
/// ## Reclamation
///
/// As we want to bound the memory used by the interner, but also not allow it to be filled up with strings that
/// eventually end up going entirely unused, we need a way to remove those unused strings so their underlying storage
/// can be used for new strings. This is where reclamation comes in.
///
/// When a string is interned, the entry header tracks how many active references there are to it. When that reference
/// count drops to zero, the last reference to the string attempts to mark the entry for reclamation. Assuming no other
/// reference has been taken out on the entry in the meantime, the entry gets added to a list of "reclaimed" entries.
///
/// Reclaimed entries are simple markers -- start and end position in the data buffer -- which are stored in a freelist.
/// When attempting to intern a new string, this freelist is searched to see if there's an entry large enough to fit the
/// new string, and if so, it is used.
///
/// Additionally, when entries are reclaimed, adjacent entries are merged together where possible. This helps to avoid
/// unnecessary fragmentation over time, although not as effectively as reconstructing the data buffer to re-pack
/// entries.
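///
/// ## Example
///
/// A minimal usage sketch (marked `ignore` since the import path is illustrative and depends on how the crate
/// re-exports these types):
///
/// ```ignore
/// use std::num::NonZeroUsize;
/// use stringtheory::interning::{GenericMapInterner, Interner as _};
///
/// // A 1 KiB interner: once full (and with nothing reclaimable), `try_intern` returns `None`.
/// let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
///
/// let hello = interner.try_intern("hello").expect("enough capacity");
/// assert_eq!(&*hello, "hello");
///
/// // Dropping the last handle to a string allows its storage to be reclaimed for future interning.
/// drop(hello);
/// ```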
#[derive(Clone, Debug)]
pub struct GenericMapInterner {
    state: Arc<Mutex<InternerState>>,
}

impl GenericMapInterner {
    /// Creates a new `GenericMapInterner` with the given capacity.
    ///
    /// The given capacity will potentially be rounded up by a small number of bytes (up to 7) in order to ensure the
    /// backing buffer is properly aligned.
    pub fn new(capacity: NonZeroUsize) -> Self {
        Self {
            state: Arc::new(Mutex::new(InternerState::with_capacity(capacity))),
        }
    }
}

impl Interner for GenericMapInterner {
    fn is_empty(&self) -> bool {
        self.state.lock().unwrap().entries.is_empty()
    }

    fn len(&self) -> usize {
        self.state.lock().unwrap().entries.len()
    }

    fn len_bytes(&self) -> usize {
        self.state.lock().unwrap().storage.len
    }

    fn capacity_bytes(&self) -> usize {
        self.state.lock().unwrap().storage.capacity.get()
    }

    fn try_intern(&self, s: &str) -> Option<InternedString> {
        let header = {
            let mut state = self.state.lock().unwrap();
            state.try_intern(s)?
        };

        Some(InternedString::from(StringState {
            interner: Arc::clone(&self.state),
            header,
        }))
    }
}

#[inline]
const unsafe fn get_entry_string_parts(header_ptr: NonNull<EntryHeader>) -> (NonNull<u8>, usize) {
    // SAFETY: The caller is responsible for ensuring that `header_ptr` is well-aligned and points to an initialized
    // `EntryHeader` value.
    let header = header_ptr.as_ref();

    // Advance past the header and get the pointer to the string.
    //
    // SAFETY: We know that we're simply skipping over the header by advancing the pointer by one when it's still typed
    // as `*mut EntryHeader`.
    let s_ptr = header_ptr.add(1).cast::<u8>();
    (s_ptr, header.len())
}

#[inline]
const unsafe fn get_entry_string<'a>(header_ptr: NonNull<EntryHeader>) -> &'a str {
    let (s_ptr, s_len) = get_entry_string_parts(header_ptr);

    // SAFETY: We depend on `get_entry_string_parts` to give us a valid pointer and length for the string.
    std::str::from_utf8_unchecked(std::slice::from_raw_parts(s_ptr.as_ptr() as *const _, s_len))
}

#[cfg(test)]
mod tests {
    use std::{
        collections::HashSet,
        ops::{Deref as _, RangeInclusive},
    };

    use prop::sample::Index;
    use proptest::{
        collection::{hash_set, vec as arb_vec},
        prelude::*,
    };

    use super::*;
    use crate::interning::InternedStringState;

    fn create_interner(capacity: usize) -> GenericMapInterner {
        assert!(capacity > 0, "capacity must be greater than zero");
        GenericMapInterner::new(NonZeroUsize::new(capacity).unwrap())
    }

    fn available_len(interner: &GenericMapInterner) -> usize {
        interner.state.lock().unwrap().storage.available()
    }

    fn reclaimed_len(interner: &GenericMapInterner) -> usize {
        interner.state.lock().unwrap().storage.reclaimed.len()
    }

    fn first_reclaimed_entry(interner: &GenericMapInterner) -> ReclaimedEntry {
        interner.state.lock().unwrap().storage.reclaimed.first().unwrap()
    }

    const fn entry_len(s: &str) -> usize {
        EntryHeader::len_for(s)
    }

    fn get_reclaimed_entry_for_string(s: &InternedString) -> ReclaimedEntry {
        let state = match &s.state {
            InternedStringState::GenericMap(state) => state,
            _ => panic!("unexpected string state"),
        };

        let ptr = state.interner.lock().unwrap().storage.ptr.as_ptr();
        let header = unsafe { state.header.as_ref() };
        let offset = unsafe { state.header.as_ptr().cast::<u8>().offset_from(ptr) as usize };
        ReclaimedEntry::new(offset, header.entry_len())
    }

    fn arb_alphanum_strings(
        str_len: RangeInclusive<usize>, unique_strs: RangeInclusive<usize>,
    ) -> impl Strategy<Value = Vec<String>> {
        // Create characters between 0x20 (32) and 0x7E (126), which are all printable ASCII characters.
        let char_gen = any::<u8>().prop_map(|c| std::cmp::max(c % 127, 32));

        let str_gen = any::<usize>()
            .prop_map(move |n| std::cmp::max(n % *str_len.end(), *str_len.start()))
            .prop_flat_map(move |len| arb_vec(char_gen.clone(), len))
            // SAFETY: We know our characters are all valid UTF-8 because they're in the ASCII range.
            .prop_map(|xs| unsafe { String::from_utf8_unchecked(xs) });

        // Create a hash set, which handles the deduplication aspect for us, ensuring we have N unique strings where N
        // is within the `unique_strs` range... and then convert it to `Vec<String>` for easier consumption.
        hash_set(str_gen, unique_strs).prop_map(|unique_strs| unique_strs.into_iter().collect::<Vec<_>>())
    }

    #[test]
    fn basic() {
        let interner = create_interner(1024);

        let s1 = interner.try_intern("hello").unwrap();
        let s2 = interner.try_intern("world").unwrap();
        let s3 = interner.try_intern("hello").unwrap();

        assert_eq!(s1.deref(), "hello");
        assert_eq!(s2.deref(), "world");
        assert_eq!(s3.deref(), "hello");

        // The pointers from the interned strings should be the same, but not between the interned string and a pointer
        // to an equivalent (but not interned) string:
        assert!(std::ptr::eq(s1.deref() as *const _, s3.deref() as *const _));

        let local_hello = "hello";
        assert!(!std::ptr::eq(s1.deref() as *const _, local_hello as *const _));
    }

    #[test]
    fn try_intern_without_capacity() {
        // Big enough to fit a single "hello world!" string, but not big enough to fit two.
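        //
        // ("hello world!" is 12 bytes, which rounds up to 16 bytes of string capacity; assuming a 16-byte
        // `EntryHeader`, each entry takes 32 bytes, so one entry fits in the 48-byte buffer but two would need 64.)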
        let interner = create_interner(48);

        let s1 = interner.try_intern("hello world!");
        assert!(s1.is_some());

        let s2 = interner.try_intern("hello, world");
        assert!(s2.is_none());
    }

    #[test]
    fn reclaim_after_dropped() {
        let interner = create_interner(1024);

        let s1 = interner.try_intern("hello world!").expect("should not fail to intern");
        let s1_entry_len = entry_len(&s1);

        assert_eq!(interner.len(), 1);
        assert_eq!(available_len(&interner), 1024 - s1_entry_len);
        assert_eq!(reclaimed_len(&interner), 0);

        // Drop the interned string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(interner.len(), 0);
        assert_eq!(available_len(&interner), 1024);
        assert_eq!(reclaimed_len(&interner), 0);
    }

    #[test]
    fn interns_to_reclaimed_entry_with_leftover() {
        // We want to intern a string initially, which takes up almost all of the capacity, and then drop it so it gets
        // reclaimed. After that, we'll intern a much smaller string which should lead to utilizing that reclaimed
        // entry, but only a part of it. Finally, we'll intern another new string.
        //
        // The point is to demonstrate that our reclamation logic is sound in terms of allowing reclaimed entries to be
        // split while the search/insertion logic is operating.
        let capacity = 256;
        let interner = create_interner(capacity);

        // We craft four strings such that the first two (`s_large` and `s_medium1`) will take up enough capacity that
        // `s_small` can't possibly be interned in the available capacity. We'll also craft `s_medium2` so it can fit
        // within the reclaimed entry for `s_large` but takes enough capacity that `s_small` cannot fit in the leftover
        // reclaimed entry that we split off.
        let s_large = "99 bottles of beer on the wall, 99 bottles of beer! take one down, pass it around, 98 bottles of beer on the wall!";
        let s_medium1 = "no act of kindness, no matter how small, is ever wasted";
        let s_medium2 = "if you want to go fast, go alone; if you want to go far, go together";
        let s_small = "are you there god? it's me, margaret";

        let phase1_available_capacity = capacity - entry_len(s_large) - entry_len(s_medium1);
        assert!(phase1_available_capacity < entry_len(s_small));
        assert!((entry_len(s_large) - entry_len(s_medium2)) < entry_len(s_small));
        assert!(entry_len(s_medium2) < entry_len(s_large));

        // Phase 1: intern our two larger strings.
        let s1 = interner.try_intern(s_large).expect("should not fail to intern");
        let s2 = interner.try_intern(s_medium1).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(available_len(&interner), phase1_available_capacity);
        assert_eq!(reclaimed_len(&interner), 0);

        // Phase 2: drop `s_large` so it gets reclaimed.
        drop(s1);
        assert_eq!(interner.len(), 1);
        assert_eq!(reclaimed_len(&interner), 1);

        // Phase 3: intern `s_medium2`, which should fit in the reclaimed entry for `s_large`. This should leave a
        // small, split off reclaimed entry.
        let s3 = interner.try_intern(s_medium2).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 1);

        // Phase 4: intern `s_small`, which should not fit in the leftover reclaimed entry from `s_large` _or_ the
        // available capacity.
        let s4 = interner.try_intern(s_small);
        assert_eq!(s4, None);

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 1);

        // And make sure we can still dereference the interned strings we _do_ have left:
        assert_eq!(s2.deref(), s_medium1);
        assert_eq!(s3.deref(), s_medium2);
    }

    #[test]
    fn has_reclaimed_entries_string_fits_exactly() {
        // The interner is large enough to fit two of the identically-sized strings, but not all three. We show that
        // when we drop an entry and it is reclaimed, a string of identical size should always be able to reuse that
        // reclaimed entry.
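        //
        // (Each 12-byte string rounds up to 16 bytes of string capacity; assuming a 16-byte `EntryHeader`, each entry
        // takes 32 bytes, so two entries fit in the 80-byte buffer but three would need 96.)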
        const S1_VALUE: &str = "hello world!";
        const S2_VALUE: &str = "hello, world";
        const S3_VALUE: &str = "hello--world";

        let interner = create_interner(80);

        // Intern the first two strings, which should fit without issue.
        let s1 = interner.try_intern(S1_VALUE).expect("should not fail to intern");
        let s1_reclaimed_expected = get_reclaimed_entry_for_string(&s1);
        let _s2 = interner.try_intern(S2_VALUE).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);

        // Try to intern a third string, which should fail as we don't have the space.
        let s3 = interner.try_intern(S3_VALUE);
        assert_eq!(s3, None);

        // Drop the first string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(interner.len(), 1);
        assert_eq!(reclaimed_len(&interner), 1);

        let s1_reclaimed = first_reclaimed_entry(&interner);
        assert_eq!(s1_reclaimed_expected, s1_reclaimed);

        // Try again to intern the third string, which should now succeed and take over the reclaimed entry entirely
        // as the strings are identical in length.
        let _s3 = interner.try_intern(S3_VALUE).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);
    }

    #[test]
    fn reclaimed_entry_reuse_split_too_small() {
        // This situation is slightly contrived, but: we want to test that when there's a reclaimed entry of a certain
        // size, reusing that reclaimed entry won't lead to it being split if the resulting split entry would be too
        // "small": unable to hold another minimum-sized entry.
        //
        // We have to intern three strings to do this because we only track reclaimed entries when they're followed by
        // in-use entries, and the string we drop to create a reclaimed entry has to be big enough, but not _too_ big,
        // to hold the string we want to intern after it.
        let interner = create_interner(128);

        // Declare our strings to intern and just check some preconditions by hand.
        let s_one = "a horse, a horse, my kingdom for a horse!";
        let s_one_entry_len = entry_len(s_one);
        let s_two = "why hello there, beautiful";
        let s_two_entry_len = entry_len(s_two);
        let s_three = "real gs move in silence like lasagna";
        let s_three_entry_len = entry_len(s_three);

        assert!(s_one_entry_len <= interner.capacity_bytes());
        assert!(s_two_entry_len <= interner.capacity_bytes());
        assert!(s_one_entry_len + s_two_entry_len + s_three_entry_len > interner.capacity_bytes());
        assert!(s_one_entry_len > s_two_entry_len);
        assert!(s_three_entry_len > s_two_entry_len);
        assert!((s_one_entry_len - s_three_entry_len) < MINIMUM_ENTRY_LEN);

        // Intern the first two strings, which should fit without issue.
        let s1 = interner.try_intern(s_one).expect("should not fail to intern");
        let s1_reclaimed_expected = get_reclaimed_entry_for_string(&s1);
        let _s2 = interner.try_intern(s_two).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);

        // Try to intern the third string, which should fail as we don't have the space.
        let s3 = interner.try_intern(s_three);
        assert_eq!(s3, None);

        // Drop the first string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(interner.len(), 1);
        assert_eq!(reclaimed_len(&interner), 1);

        let s1_reclaimed = first_reclaimed_entry(&interner);
        assert_eq!(s1_reclaimed_expected, s1_reclaimed);

        // Try again to intern the third string, which should now succeed and take over the reclaimed entry, but since
        // the remainder of the reclaimed entry after taking the necessary capacity for `s_three` is not large enough
        // (`MINIMUM_ENTRY_LEN`), we shouldn't end up splitting the reclaimed entry, and instead, `s3` should consume
        // the entire reclaimed entry.
        let s3 = interner.try_intern(s_three).expect("should not fail to intern");
        let s3_reclaimed_expected = get_reclaimed_entry_for_string(&s3);

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(s1_reclaimed_expected, s3_reclaimed_expected);
    }

    #[test]
    fn reclaimed_entry_adjacent_to_spare_capacity() {
        let interner = create_interner(128);

        // Intern two smallish strings that fit without issue.
        let s1 = interner.try_intern("hello, world!").expect("should not fail to intern");
        let s2 = interner
            .try_intern("cheeeeeehooooo!")
            .expect("should not fail to intern");
        let s1_entry_len = entry_len(&s1);
        let s2_entry_len = entry_len(&s2);

        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(available_len(&interner), 128 - s1_entry_len - s2_entry_len);

        drop(s2);
        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(available_len(&interner), 128 - s1_entry_len);

        drop(s1);
        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(available_len(&interner), 128);
    }

    #[test]
    fn len_bytes_reflects_active_interned_entries() {
        let interner = create_interner(256);
        let mut active_interned_entries_len = 0;
        let string_1 = "hello world!";
        let string_2 = "hello again, world";
        let string_3 = "this is another string";

        // Intern a string.
        let s1 = interner.try_intern(string_1).expect("should not fail to intern");
        active_interned_entries_len += entry_len(string_1);
        assert_eq!(interner.len(), 1);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);

        // Intern a second string.
        let _s2 = interner.try_intern(string_2).expect("should not fail to intern");
        active_interned_entries_len += entry_len(string_2);
        assert_eq!(interner.len(), 2);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);

        // Drop the first string.
        drop(s1);
        active_interned_entries_len -= entry_len(string_1);
        assert_eq!(interner.len(), 1);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);

        // Intern a new string.
        let _s3 = interner.try_intern(string_3).expect("should not fail to intern");
        active_interned_entries_len += entry_len(string_3);
        assert_eq!(interner.len(), 2);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]
        fn property_test_entry_count_accurate(
            strs in arb_alphanum_strings(1..=128, 16..=512),
            indices in arb_vec(any::<Index>(), 1..=1000),
        ) {
            // We ask `proptest` to generate a bunch of unique strings of varying lengths (1-128 bytes, 16-512 unique
            // strings), from which we then randomly select the strings we want to intern. The goal here is to
            // potentially select the same string multiple times, to exercise the actual interning logic... but
            // practically, to ensure that when we intern a string that has already been interned, we're not
            // incrementing the entries count again.

            // Create an interner with enough capacity to hold every string we might intern. This is the maximum entry
            // size -- the maximum string size plus a little constant factor per string to account for the entry
            // header -- multiplied by the number of interning attempts.
            const ENTRY_SIZE: usize = 128 + HEADER_LEN;
            let interner = create_interner(ENTRY_SIZE * indices.len());

            // For each index, pull out the string and both track it in `unique_strs` and intern it. We hold on to the
            // interned string handle to make sure the interned string is actually kept alive, keeping the entry count
            // stable.
            let mut interned = Vec::new();
            let mut unique_strs = HashSet::new();
            for index in &indices {
                let s = index.get(&strs);
                unique_strs.insert(s);

                let s_interned = interner.try_intern(s).expect("should never fail to intern");
                interned.push(s_interned);
            }

            assert_eq!(unique_strs.len(), interner.len());
        }
    }

    #[cfg(feature = "loom")]
    #[test]
    fn concurrent_drop_and_intern() {
        fn reclaimed_entries(interner: &GenericMapInterner) -> Vec<ReclaimedEntry> {
            interner
                .state
                .lock()
                .unwrap()
                .storage
                .reclaimed
                .iter()
                .copied()
                .collect()
        }

        fn do_reclaimed_entries_overlap(a: ReclaimedEntry, b: ReclaimedEntry) -> bool {
            let a_start = a.offset;
            let a_end = a.offset + a.capacity - 1;

            let b_start = b.offset;
            let b_end = b.offset + b.capacity - 1;

            (a_start <= b_start && b_start <= a_end) || (a_start <= b_end && b_end <= a_end)
        }

        const STRING_TO_INTERN: &str = "hello, world!";

        // This test is meant to explore the thread orderings when one thread is trying to drop (and thus reclaim) the
        // last active reference to an interned string, and another thread is trying to intern that very same string.
        //
        // We accept, as a caveat, that a possible outcome is that we intern the "new" string again, even though an
        // existing entry to that string may have existed in an alternative ordering.
        loom::model(|| {
            let interner = create_interner(1024);
            let t2_interner = interner.clone();

            // Intern the string from thread T1.
            let t1_interned_s = interner
                .try_intern(STRING_TO_INTERN)
                .expect("should not fail to intern");
            assert_eq!(t1_interned_s.deref(), STRING_TO_INTERN);
            let t1_reclaimed_entry = get_reclaimed_entry_for_string(&t1_interned_s);

            // Spawn thread T2, which tries to intern the same string and returns the handle to it.
            let t2_result = loom::thread::spawn(move || {
                let interned_s = t2_interner
                    .try_intern(STRING_TO_INTERN)
                    .expect("should not fail to intern");
                let reclaimed_entry = get_reclaimed_entry_for_string(&interned_s);

                (interned_s, reclaimed_entry)
            });

            drop(t1_interned_s);

            let (t2_interned_s, t2_reclaimed_entry) = t2_result.join().expect("should not fail to join T2");
            assert_eq!(t2_interned_s.deref(), STRING_TO_INTERN);

            // What we're checking for here is that either:
            // - there's no reclaimed entry (T2 found the existing entry for the string before T1 dropped it)
            // - there's a reclaimed entry (T2 didn't find the existing entry for the string before T1 marked it as
            //   inactive) but the reclaimed entry does _not_ overlap with the interned string from T2, meaning we
            //   didn't get confused and allow T2 to use an existing entry that T1 then later marked as reclaimed
            let reclaimed_entries = reclaimed_entries(&interner);
            assert!(reclaimed_entries.len() <= 1, "should have at most one reclaimed entry");

            if !reclaimed_entries.is_empty() {
                // If we do have a reclaimed entry, it needs to match exactly with only one of the interned strings.
                let is_t1_entry = reclaimed_entries.first().unwrap() == &t1_reclaimed_entry;
                let is_t2_entry = reclaimed_entries.first().unwrap() == &t2_reclaimed_entry;

                assert!(
                    (is_t1_entry || is_t2_entry) && !(is_t1_entry && is_t2_entry),
                    "should only match one interned string"
                );

                // Additionally, we ensure that the reclaimed entry does not overlap with the other interned string.
                assert!(
                    !do_reclaimed_entries_overlap(t1_reclaimed_entry, t2_reclaimed_entry),
                    "reclaimed entry should not overlap with remaining interned string"
                );
            }
        });
    }
}
1111}