stringtheory/interning/map.rs
#[cfg(not(feature = "loom"))]
use std::sync::{
    atomic::{
        AtomicUsize,
        Ordering::{AcqRel, Acquire},
    },
    Arc, Mutex,
};
use std::{collections::HashMap, num::NonZeroUsize, ptr::NonNull};

#[cfg(feature = "loom")]
use loom::sync::{
    atomic::{
        AtomicUsize,
        Ordering::{AcqRel, Acquire},
    },
    Arc, Mutex,
};

use super::{
    helpers::{layout_for_data, PackedLengthCapacity},
    InternedString, Interner,
};
use crate::interning::helpers::{aligned_string, ReclaimedEntries, ReclaimedEntry};

const HEADER_LEN: usize = std::mem::size_of::<EntryHeader>();
const HEADER_ALIGN: usize = std::mem::align_of::<EntryHeader>();

/// The minimum possible length of an entry.
///
/// For any entry in the interner, there is already an `EntryHeader` followed by the string data itself. In order to
/// ensure that entries can be written contiguously, we additionally ensure that the number of bytes we utilize for the
/// string data is aligned at least as much as `EntryHeader` itself.
///
/// This means that the minimum possible length of an entry, or the minimum number of bytes a valid entry could consume,
/// is the length of the header plus the alignment of the header.
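///
/// As an illustrative example (alignment assumed for illustration): on a typical 64-bit target where `EntryHeader` is
/// 8-byte aligned, even a one-byte string consumes `HEADER_LEN` bytes of header plus 8 bytes of string capacity, so no
/// valid entry can occupy fewer than `HEADER_LEN + HEADER_ALIGN` bytes.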
const MINIMUM_ENTRY_LEN: usize = HEADER_LEN + HEADER_ALIGN;

#[derive(Debug)]
pub(crate) struct StringState {
    interner: Arc<Mutex<InternerState>>,
    header: NonNull<EntryHeader>,
}

impl StringState {
    #[inline]
    pub const fn as_str(&self) -> &str {
        // SAFETY: We ensure `self.header` is well-aligned and points to an initialized `EntryHeader` value when
        // creating `StringState`.
        unsafe { get_entry_string(self.header) }
    }
}

impl PartialEq for StringState {
    fn eq(&self, other: &Self) -> bool {
        self.header == other.header
    }
}

impl Clone for StringState {
    fn clone(&self) -> Self {
        // SAFETY: The caller that creates `StringState` is responsible for ensuring that `self.header` is well-aligned
        // and points to an initialized `EntryHeader` value.
        let header = unsafe { self.header.as_ref() };
        header.increment_active_refs();

        Self {
            interner: self.interner.clone(),
            header: self.header,
        }
    }
}

impl Drop for StringState {
    fn drop(&mut self) {
        // SAFETY: The caller that creates `StringState` is responsible for ensuring that `self.header` is well-aligned
        // and points to an initialized `EntryHeader` value.
        let header = unsafe { self.header.as_ref() };
        if header.decrement_active_refs() {
            // We decremented the reference count to zero, so try to mark this entry for reclamation.
            let mut interner = self.interner.lock().unwrap();
            interner.mark_for_reclamation(self.header);
        }
    }
}

/// Metadata about an interner entry.
///
/// `EntryHeader` represents the smallest amount of information about an interned entry that is needed to support both
/// lookup of existing interned strings, as well as the ability to reclaim space in the interner when an entry is no
/// longer in use.
struct EntryHeader {
    /// The number of active references to this entry.
    ///
    /// Only incremented by the interner itself, and decremented by `InternedString` when it is dropped.
    refs: AtomicUsize,

    /// Combined length/capacity of the entry, in terms of the string itself.
    ///
    /// Notably, this does _not_ include the length of the header itself. For example, an entry holding the string
    /// "hello, world!" has a string length of 13 bytes, but since we have to pad out to meet our alignment requirements
    /// for `EntryHeader`, we would end up with a capacity of 16 bytes. As such, `EntryHeader::len` would report `13`,
    /// while `EntryHeader::capacity` would report `16`. Likewise, `EntryHeader::entry_len` would report the string
    /// capacity (16) plus the size of the header itself (`HEADER_LEN`).
    ///
    /// As explained in the description of `PackedLengthCapacity`, this does mean strings can't be larger than ~4GB on
    /// 64-bit platforms, which is not a problem we have.
    len_cap: PackedLengthCapacity,
}

impl EntryHeader {
    /// Creates a tombstone entry with the given capacity.
    ///
    /// This is to allow for updating a region in the data buffer, which has been reclaimed, such that it is
    /// identifiable as being unused.
    fn tombstone(entry: ReclaimedEntry) -> Self {
        // The usable capacity for a reclaimed entry is the full capacity minus the size of `EntryHeader` itself, as
        // reclaimed entries represent the _entire_ region in the data buffer, but `EntryHeader` only cares about the
        // string portion itself.
        let str_cap = EntryHeader::usable_from_reclaimed(entry);

        Self {
            refs: AtomicUsize::new(0),
            len_cap: PackedLengthCapacity::new(str_cap, 0),
        }
    }

    /// Creates a new entry for the given string.
    fn from_string(s: &str) -> Self {
        // We're dictating the necessary capacity here, which is the length of the string rounded to the nearest
        // multiple of the alignment of `EntryHeader`, which ensures that any subsequent entry will be properly aligned.
        let str_cap = aligned_string::<Self>(s);

        Self {
            refs: AtomicUsize::new(1),
            len_cap: PackedLengthCapacity::new(str_cap, s.len()),
        }
    }

    /// Creates a new entry for the given string, based on the given reclaimed entry.
    ///
    /// This is used when a reclaimed entry has been chosen to hold a newly-interned string. If the reclaimed entry is
    /// larger than the string being interned, and the excess is large enough to hold another minimum-sized entry, that
    /// excess capacity is split off and returned as a new reclaimed entry; otherwise, the header absorbs the entire
    /// capacity of the reclaimed entry.
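    ///
    /// As a worked example (sizes assumed for illustration, with 8-byte alignment): reusing a 96-byte reclaimed region
    /// for a 24-byte string needs `HEADER_LEN + 24` bytes, so the tail of the region is split off and returned as a
    /// new `ReclaimedEntry`; if that leftover were smaller than `MINIMUM_ENTRY_LEN`, the header would simply absorb
    /// the region's full capacity and no split would be returned.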
    fn from_reclaimed_entry(mut entry: ReclaimedEntry, s: &str) -> (Self, Option<ReclaimedEntry>) {
        // The usable capacity for a reclaimed entry is the full capacity minus the size of `EntryHeader` itself, as
        // reclaimed entries represent the _entire_ region in the data buffer, but `EntryHeader` only cares about the
        // string portion itself.
        let entry_str_cap = EntryHeader::usable_from_reclaimed(entry);
        let required_str_cap = aligned_string::<Self>(s);

        // If the reclaimed entry has enough additional space beyond what we need for the string, we'll split it off and
        // return it for the caller to keep around in the reclaimed entries list.
        let remainder = entry_str_cap - required_str_cap;
        let (adjusted_str_cap, maybe_split_entry) = if remainder >= MINIMUM_ENTRY_LEN {
            let entry_len = EntryHeader::len_for(s);
            let split_entry = entry.split_off(entry_len);

            (entry_len - HEADER_LEN, Some(split_entry))
        } else {
            (entry_str_cap, None)
        };

        let header = Self {
            refs: AtomicUsize::new(1),
            len_cap: PackedLengthCapacity::new(adjusted_str_cap, s.len()),
        };

        (header, maybe_split_entry)
    }

    /// Returns the computed length of a complete entry, in bytes, for the given string.
    ///
    /// This includes the size of the entry header itself and the string data, when padded for alignment, and represents
    /// the number of bytes that would be consumed in the data buffer.
    const fn len_for(s: &str) -> usize {
        HEADER_LEN + aligned_string::<Self>(s)
    }

    /// Returns the usable capacity of a reclaimed entry, in bytes.
    ///
    /// Usable refers to the number of bytes in a reclaimed entry that could be used for string data, after accounting
    /// for the size of `EntryHeader` itself.
    const fn usable_from_reclaimed(entry: ReclaimedEntry) -> usize {
        entry.capacity() - HEADER_LEN
    }

    /// Returns the size of the string, in bytes, that this entry can hold.
    const fn capacity(&self) -> usize {
        self.len_cap.capacity()
    }

    /// Returns the size of the string, in bytes, that this entry _actually_ holds.
    const fn len(&self) -> usize {
        self.len_cap.len()
    }

    /// Returns the total length of the entry, in bytes.
    ///
    /// This includes the length of the header in addition to the string data.
    const fn entry_len(&self) -> usize {
        HEADER_LEN + self.capacity()
    }

    /// Returns `true` if this entry is currently referenced.
    fn is_active(&self) -> bool {
        self.refs.load(Acquire) != 0
    }

    /// Increments the active reference count by one.
    fn increment_active_refs(&self) {
        self.refs.fetch_add(1, AcqRel);
    }

    /// Decrements the active reference count by one.
    ///
    /// Returns `true` if the active reference count is zero _after_ calling this method.
    fn decrement_active_refs(&self) -> bool {
        self.refs.fetch_sub(1, AcqRel) == 1
    }
}

// SAFETY: We don't take references to the entry header pointer that outlast `StringState`, and the only modification we
// do to the entry header is through atomic operations, so it's safe to both send and share `StringState` between
// threads.
unsafe impl Send for StringState {}
unsafe impl Sync for StringState {}

#[derive(Debug)]
struct InternerStorage {
    // Direct pieces of our buffer allocation.
    ptr: NonNull<u8>,
    offset: usize,
    capacity: NonZeroUsize,

    // Length of all active entries, in bytes.
    //
    // This is equivalent to `self.offset` minus the total size of all reclaimed entries.
    len: usize,

    // Markers for entries that can be reused.
    reclaimed: ReclaimedEntries,
}

impl InternerStorage {
    fn with_capacity(capacity: NonZeroUsize) -> Self {
        assert!(
            capacity.get() <= isize::MAX as usize,
            "capacity would overflow isize::MAX, which violates layout constraints"
        );

        // Allocate our data buffer. This is the main backing allocation for all interned strings, and is well-aligned
        // for `EntryHeader`.
        //
        // SAFETY: `layout_for_data` ensures the layout is non-zero.
        let data_layout = layout_for_data::<EntryHeader>(capacity);
        let data_ptr = unsafe { std::alloc::alloc(data_layout) };
        let ptr = match NonNull::new(data_ptr) {
            Some(ptr) => ptr,
            None => std::alloc::handle_alloc_error(data_layout),
        };

        Self {
            ptr,
            offset: 0,
            capacity,
            len: 0,
            reclaimed: ReclaimedEntries::new(),
        }
    }

    #[cfg(test)]
    /// Returns the total number of unused bytes that are available for interning.
    fn available(&self) -> usize {
        self.capacity.get() - self.len
    }

    /// Returns the total number of bytes of contiguous, unoccupied space at the end of the data buffer.
    fn available_unoccupied(&self) -> usize {
        self.capacity.get() - self.offset
    }

    fn get_entry_ptr(&self, offset: usize) -> NonNull<EntryHeader> {
        debug_assert!(
            offset + MINIMUM_ENTRY_LEN <= self.capacity.get(),
            "offset would point to entry that cannot possibly avoid extending past end of data buffer"
        );

        // SAFETY: The caller is responsible for ensuring that `offset` is within the bounds of the data buffer, and
        // that `offset` is well-aligned for `EntryHeader`.
        let entry_ptr = unsafe { self.ptr.as_ptr().add(offset).cast::<EntryHeader>() };
        debug_assert!(entry_ptr.is_aligned(), "entry header pointer must be well-aligned");

        // SAFETY: `entry_ptr` is derived from `self.ptr`, which itself is `NonNull<u8>`, and the caller is responsible
        // for ensuring that `offset` is within the bounds of the data buffer, so we know `entry_ptr` is non-null.
        unsafe { NonNull::new_unchecked(entry_ptr) }
    }

    fn write_entry(&mut self, offset: usize, entry_header: EntryHeader, s: &str) -> NonNull<EntryHeader> {
        debug_assert_eq!(
            entry_header.len(),
            s.len(),
            "entry header length must match string length"
        );

        let entry_ptr = self.get_entry_ptr(offset);
        let entry_len = entry_header.entry_len();

        // Write the entry header.
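        //
        // SAFETY: `get_entry_ptr` guarantees a non-null pointer that is well-aligned for `EntryHeader`; the caller is
        // responsible for ensuring the full entry (header plus string capacity) lies within the data buffer.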
        unsafe { entry_ptr.as_ptr().write(entry_header) };

        let s_buf = s.as_bytes();

        // Write the string.
        let entry_s_buf = unsafe {
            // Take the entry pointer and add 1, which sets our pointer to right _after_ the header.
            let entry_s_ptr = entry_ptr.as_ptr().add(1).cast::<u8>();
            std::slice::from_raw_parts_mut(entry_s_ptr, s_buf.len())
        };
        entry_s_buf.copy_from_slice(s_buf);

        // Update our internal statistics.
        self.len += entry_len;

        entry_ptr
    }

    fn write_to_unoccupied(&mut self, s: &str) -> (usize, NonNull<EntryHeader>) {
        let entry_header = EntryHeader::from_string(s);

        // Write the entry to the end of the data buffer.
        let entry_offset = self.offset;
        self.offset += entry_header.entry_len();

        (entry_offset, self.write_entry(entry_offset, entry_header, s))
    }

    fn write_to_reclaimed_entry(&mut self, entry: ReclaimedEntry, s: &str) -> (usize, NonNull<EntryHeader>) {
        let entry_offset = entry.offset();
        let (entry_header, maybe_split_entry) = EntryHeader::from_reclaimed_entry(entry, s);

        // If we had enough capacity in the reclaimed entry to hold this string _and_ potentially hold another entry, we
        // split it off and store that remainder entry.
        if let Some(split_entry) = maybe_split_entry {
            self.add_reclaimed(split_entry);
        }

        // Write the entry in place of the reclaimed entry.
        (entry_offset, self.write_entry(entry_offset, entry_header, s))
    }

    fn add_reclaimed(&mut self, entry: ReclaimedEntry) {
        // Reclamation is a two-step process: first, we have to actually keep track of the reclaimed entry, which
        // potentially involves merging adjacent reclaimed entries, and then once all of that has happened, we tombstone
        // the entry (whether merged or not).
        //
        // However, if the merged reclaimed entry immediately precedes any available capacity, we can skip tombstoning
        // it, since we can just wind back `offset` to reclaim the space.
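        //
        // As a worked example (numbers assumed for illustration): if `offset` is currently 256 and the merged
        // reclaimed entry covers bytes 192..256, the entry ends exactly at `offset`, so we rewind `offset` to 192 and
        // drop the marker instead of tombstoning it.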
        let merged_entry = self.reclaimed.insert(entry);
        if merged_entry.offset() + merged_entry.capacity() == self.offset {
            self.offset -= merged_entry.capacity();
            self.reclaimed.remove(&merged_entry);
            return;
        }

        self.clear_reclaimed_entry(merged_entry);
    }

    fn clear_reclaimed_entry(&mut self, entry: ReclaimedEntry) {
        let entry_ptr = self.get_entry_ptr(entry.offset());

        // Write the entry tombstone itself, which zeroes the string length and sets the reference count to zero.
        //
        // SAFETY: We know that `entry_ptr` is valid for writes (reclaimed entries are, by definition, inactive regions
        // in the data buffer) and is well-aligned for `EntryHeader`.
        let tombstone = EntryHeader::tombstone(entry);
        let str_cap = tombstone.capacity();

        unsafe {
            entry_ptr.as_ptr().write(tombstone);
        }

        // Write a magic value to the entire string capacity for the entry. This ensures that there's a known repeating
        // value which, in the case of debugging issues, can be a signal that offsets/reclaimed entries are incorrect
        // and overlapping with active entries.
        //
        // SAFETY: Like above, the caller is responsible for ensuring that `offset` is within the bounds of the data
        // buffer, and that `offset + capacity` does not extend past the bounds of the data buffer.
        unsafe {
            // Take the entry pointer and add 1, which sets our pointer to right _after_ the header.
            let str_ptr = entry_ptr.as_ptr().add(1).cast::<u8>();
            let str_buf = std::slice::from_raw_parts_mut(str_ptr, str_cap);
            str_buf.fill(0x21);
        }
    }

    fn mark_for_reclamation(&mut self, header_ptr: NonNull<EntryHeader>) {
        // Get the offset of the header within the data buffer.
        //
        // SAFETY: The caller is responsible for ensuring the entry header reference belongs to this interner. If that
        // is upheld, then we know that entry header belongs to our data buffer, and that the pointer to the entry
        // header is not less than the base pointer of the data buffer, ensuring the offset is non-negative.
        let entry_offset = unsafe {
            header_ptr
                .cast::<u8>()
                .as_ptr()
                .offset_from(self.ptr.as_ptr().cast_const())
        };
        debug_assert!(entry_offset >= 0, "entry offset must be non-negative");

        let header = unsafe { header_ptr.as_ref() };
        let entry_len = header.entry_len();

        let entry = ReclaimedEntry::new(entry_offset as usize, entry_len);
        self.len -= entry.capacity();
        self.add_reclaimed(entry);
    }
}

impl Drop for InternerStorage {
    fn drop(&mut self) {
        // SAFETY: We allocated this buffer with the global allocator, and we're generating the same layout that was
        // used to allocate it in the first place.
        unsafe {
            std::alloc::dealloc(self.ptr.as_ptr(), layout_for_data::<EntryHeader>(self.capacity));
        }
    }
}

#[derive(Debug)]
struct InternerState {
    // Backing storage for the interned strings.
    storage: InternerStorage,

    // Active entries in the interner.
    entries: HashMap<&'static str, usize>,
}

impl InternerState {
    /// Creates a new `InternerState` with a pre-allocated buffer that has the given capacity.
    pub fn with_capacity(capacity: NonZeroUsize) -> Self {
        Self {
            storage: InternerStorage::with_capacity(capacity),
            entries: HashMap::new(),
        }
    }

    fn mark_for_reclamation(&mut self, header_ptr: NonNull<EntryHeader>) {
        // See if the reference count is zero.
        //
        // Only interned string values (the frontend handle that wraps the pointer to a specific entry) can decrement
        // the reference count for their specific entry when dropped, and only `InternerState` -- with its access
        // mediated through a mutex -- can increment the reference count for entries. This means that if the reference
        // count is zero, then we know that nobody else is holding a reference to this entry, and no concurrent call to
        // `try_intern` could be updating the reference count, either... so it's safe to be marked as reclaimed.
        //
        // SAFETY: The caller is responsible for ensuring that `header_ptr` is well-aligned and points to an initialized
        // `EntryHeader` value.
        let header = unsafe { header_ptr.as_ref() };
        if !header.is_active() {
            // Remove the entry from the entries map first before we reclaim the entry, since doing so overwrites the
            // entry data in the data buffer.
            //
            // SAFETY: The caller is responsible for ensuring that they've given us a pointer to an `EntryHeader` that
            // was acquired from this interner.
            let entry_str = unsafe { get_entry_string(header_ptr) };
            self.entries.remove(entry_str);

            self.storage.mark_for_reclamation(header_ptr);
        }
    }

    fn try_intern(&mut self, s: &str) -> Option<NonNull<EntryHeader>> {
        // We can only intern strings with a size that fits within a packed length/capacity value, so if `s` is larger
        // than that, we can't intern it, and there's no existing entry we could have for it either.
        if s.len() > PackedLengthCapacity::maximum_value() {
            return None;
        }

        // Try and find an existing entry for this string.
        if let Some(entry_offset) = self.entries.get(s) {
            let entry_ptr = self.storage.get_entry_ptr(*entry_offset);

            let header = unsafe { entry_ptr.as_ref() };
            header.increment_active_refs();

            return Some(entry_ptr);
        }

        let required_cap = EntryHeader::len_for(s);

        // We didn't find an existing entry, so we're going to intern it.
        //
        // First, try and see if we have a reclaimed entry that can fit this string. If nothing suitable is found, or we
        // have no reclaimed entries, then we'll just try to fit it in the remaining capacity of our data buffer.
        let maybe_reclaimed_entry = self.storage.reclaimed.take_if(|entry| entry.capacity() >= required_cap);
        let (entry_offset, entry_ptr) = if let Some(reclaimed_entry) = maybe_reclaimed_entry {
            self.storage.write_to_reclaimed_entry(reclaimed_entry, s)
        } else if required_cap <= self.storage.available_unoccupied() {
            self.storage.write_to_unoccupied(s)
        } else {
            // We don't have enough space to intern this string at all, so we'll just return `None`.
            return None;
        };

        // SAFETY: Callers of `get_entry_string` are responsible for ensuring that the chosen lifetime of the string is
        // valid with respect to ensuring that the underlying entry lives long enough, and that the string data is valid
        // UTF-8 for as long as the reference is live.
        //
        // We're creating a `'static` reference here, which is generally frowned upon when the data is in fact not
        // `'static`, but this is safe because we never leak this lifetime outside of the interner, and we ensure that
        // we don't actually keep this reference around longer than the entry itself, as we only need a `'static`
        // reference to use it as the key in the entries map.
        let entry_str = unsafe { get_entry_string(entry_ptr) };
        self.entries.insert(entry_str, entry_offset);

        Some(entry_ptr)
    }
}

// SAFETY: We don't take references to the data buffer pointer that outlast `InternerState`, and all access to
// `InternerState` itself is mediated through a mutex, so we're safe to send it around and share it between threads.
unsafe impl Send for InternerState {}
unsafe impl Sync for InternerState {}

/// A string interner based on a single, fixed-size backing buffer with support for reclamation.
///
/// ## Overview
///
/// This interner uses a single, fixed-size backing buffer where interned strings are stored contiguously. This provides
/// bounded memory usage, and the interner will not allocate additional memory for new strings once the buffer is full.
/// Since interned strings are not likely to need to live for the life of the program, the interner supports
/// reclamation. Once all references to an interned string have been dropped, the storage for that string is reclaimed
/// and can be used to hold new strings.
///
/// ## Storage layout
///
/// The backing buffer stores strings contiguously, with an entry "header" prepended to each string. The header contains
/// the relevant data -- the reference count and the packed length/capacity of the string -- needed to work with the
/// entry either when searching for existing entries or when using the entry itself.
///
/// The layout of an entry is as follows:
///
/// ```text
/// ┌─────────────────── entry #1 ────────────────────┐ ┌─ entry #2 ─┐ ┌─ entry .. ─┐
/// ▼                                                 ▼ ▼            ▼ ▼            ▼
/// ┏━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━┓
/// ┃  ref cnt  │  len/cap  │ str data  │   padding   ┃ ┃   header   ┃ ┃   header   ┃
/// ┃ (8 bytes) │ (8 bytes) │ (N bytes) │ (0-7 bytes) ┃ ┃  & string  ┃ ┃  & string  ┃
/// ┗━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━┛
/// ▲                       ▲                         ▲
/// └──── `EntryHeader` ────┘                         └── aligned for `EntryHeader`
///      (8 byte alignment)                               via trailing padding
/// ```
554///
555/// The backing buffer is always aligned properly for `EntryHeader`, so that the first entry can be referenced
556/// correctly. However, when appending additional entries to the buffer, we need to ensure that those entries also have
557/// an aligned start for accessing the header. This is complicated due to the variable number of bytes for the string
558/// data.
559///
560/// Alignment padding is added to the end of the entry to ensure that when appending the next entry, the start of the
561/// entry is properly aligned for `EntryHeader`. In the worst case, up to 7 bytes could be added (and thus wasted) on
562/// this alignment padding.
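///
/// For example (sizes assumed for illustration): with 8-byte alignment, interning "hello" (5 bytes) consumes
/// `HEADER_LEN` bytes of header plus 8 bytes of string capacity -- 5 bytes of string data and 3 bytes of trailing
/// padding.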
563///
564/// ## `InternedString`
565///
566/// The `InternedString` type is a handle to the entry header, and thus the string data, for an interned string. It is
567/// designed to be small -- 8 bytes! -- and cheap to clone, as it contains an atomic reference to the entry header and a
568/// reference to the interner that owns the string. It dereferences to the underlying string with relatively low
569/// overhead: two pointer indirections.
570///
571/// When an `InternedString` is dropped, it decrements the reference count for the entry it points to. If the reference
572/// count drops to zero, it will attempt to mark the entry for reclamation.
573///
574/// ## Reclamation
575///
576/// As we want to bound the memory used by the interner, but also not allow it to be filled up with strings that
577/// eventually end up going entirely unused, we need a way to remove those unused strings so their underlying storage
578/// can be used for new strings. This is where reclamation comes in.
579///
580/// When a string is interned, the entry header tracks how many active references there are to it. When that reference
581/// count drops to zero, the last reference to the string attempts to mark the entry for reclamation. Assuming no other
582/// reference has been taken out on the entry in the meantime, the entry gets added to a list of "reclaimed" entries.
583///
584/// Reclaimed entries are simple markers -- start and end position in the data buffer -- which are stored in a freelist.
585/// When attempting to intern a new string, this freelist is searched to see if there's an entry large enough to fit the
586/// new string, and if so, it is used.
587///
588/// Additionally, when entries are reclaimed, adjacent entries are merged together where possible. This helps to avoid
589/// unnecessary fragmentation over time, although not as effectively as reconstructing the data buffer to re-pack
590/// entries.
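///
/// ## Usage
///
/// A minimal usage sketch (shown with assumed import paths, which may differ depending on how the crate re-exports
/// these types):
///
/// ```ignore
/// use std::num::NonZeroUsize;
///
/// use stringtheory::interning::{GenericMapInterner, Interner as _};
///
/// // Create an interner with a fixed 1KiB backing buffer.
/// let interner = GenericMapInterner::new(NonZeroUsize::new(1024).unwrap());
///
/// // Interning the same string twice yields handles to the same underlying entry.
/// let s1 = interner.try_intern("hello").expect("should have enough capacity");
/// let s2 = interner.try_intern("hello").expect("should have enough capacity");
/// assert_eq!(&*s1, &*s2);
///
/// // Once every handle to an entry is dropped, its storage is reclaimed for reuse.
/// drop(s1);
/// drop(s2);
/// assert!(interner.is_empty());
/// ```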
#[derive(Clone, Debug)]
pub struct GenericMapInterner {
    state: Arc<Mutex<InternerState>>,
}

impl GenericMapInterner {
    /// Creates a new `GenericMapInterner` with the given capacity.
    ///
    /// The given capacity will potentially be rounded up by a small number of bytes (up to 7) in order to ensure the
    /// backing buffer is properly aligned.
    pub fn new(capacity: NonZeroUsize) -> Self {
        Self {
            state: Arc::new(Mutex::new(InternerState::with_capacity(capacity))),
        }
    }
}

impl Interner for GenericMapInterner {
    fn is_empty(&self) -> bool {
        self.state.lock().unwrap().entries.is_empty()
    }

    fn len(&self) -> usize {
        self.state.lock().unwrap().entries.len()
    }

    fn len_bytes(&self) -> usize {
        self.state.lock().unwrap().storage.len
    }

    fn capacity_bytes(&self) -> usize {
        self.state.lock().unwrap().storage.capacity.get()
    }

    fn try_intern(&self, s: &str) -> Option<InternedString> {
        let header = {
            let mut state = self.state.lock().unwrap();
            state.try_intern(s)?
        };

        Some(InternedString::from(StringState {
            interner: Arc::clone(&self.state),
            header,
        }))
    }
}

#[inline]
const unsafe fn get_entry_string_parts(header_ptr: NonNull<EntryHeader>) -> (NonNull<u8>, usize) {
    // SAFETY: The caller is responsible for ensuring that `header_ptr` is well-aligned and points to an initialized
    // `EntryHeader` value.
    let header = header_ptr.as_ref();

    // Advance past the header and get the pointer to the string.
    //
    // SAFETY: We know that we're simply skipping over the header by advancing the pointer by one when it's still typed
    // as `*mut EntryHeader`.
    let s_ptr = header_ptr.add(1).cast::<u8>();
    (s_ptr, header.len())
}

#[inline]
const unsafe fn get_entry_string<'a>(header_ptr: NonNull<EntryHeader>) -> &'a str {
    let (s_ptr, s_len) = get_entry_string_parts(header_ptr);

    // SAFETY: We depend on `get_entry_string_parts` to give us a valid pointer and length for the string.
    std::str::from_utf8_unchecked(std::slice::from_raw_parts(s_ptr.as_ptr() as *const _, s_len))
}

#[cfg(test)]
mod tests {
    use std::{
        collections::HashSet,
        ops::{Deref as _, RangeInclusive},
    };

    use prop::sample::Index;
    use proptest::{
        collection::{hash_set, vec as arb_vec},
        prelude::*,
    };

    use super::*;
    use crate::interning::InternedStringState;

    fn create_interner(capacity: usize) -> GenericMapInterner {
        assert!(capacity > 0, "capacity must be greater than zero");
        GenericMapInterner::new(NonZeroUsize::new(capacity).unwrap())
    }

    fn available_len(interner: &GenericMapInterner) -> usize {
        interner.state.lock().unwrap().storage.available()
    }

    fn reclaimed_len(interner: &GenericMapInterner) -> usize {
        interner.state.lock().unwrap().storage.reclaimed.len()
    }

    fn first_reclaimed_entry(interner: &GenericMapInterner) -> ReclaimedEntry {
        interner.state.lock().unwrap().storage.reclaimed.first().unwrap()
    }

    const fn entry_len(s: &str) -> usize {
        EntryHeader::len_for(s)
    }

    fn get_reclaimed_entry_for_string(s: &InternedString) -> ReclaimedEntry {
        let state = match &s.state {
            InternedStringState::GenericMap(state) => state,
            _ => panic!("unexpected string state"),
        };

        let ptr = state.interner.lock().unwrap().storage.ptr.as_ptr();
        let header = unsafe { state.header.as_ref() };
        let offset = unsafe { state.header.as_ptr().cast::<u8>().offset_from(ptr) as usize };
        ReclaimedEntry::new(offset, header.entry_len())
    }

    fn arb_alphanum_strings(
        str_len: RangeInclusive<usize>, unique_strs: RangeInclusive<usize>,
    ) -> impl Strategy<Value = Vec<String>> {
        // Create characters between 0x20 (32) and 0x7E (126), which are all printable ASCII characters.
        let char_gen = any::<u8>().prop_map(|c| std::cmp::max(c % 127, 32));

        let str_gen = any::<usize>()
            .prop_map(move |n| std::cmp::max(n % *str_len.end(), *str_len.start()))
            .prop_flat_map(move |len| arb_vec(char_gen.clone(), len))
            // SAFETY: We know our characters are all valid UTF-8 because they're in the ASCII range.
            .prop_map(|xs| unsafe { String::from_utf8_unchecked(xs) });

        // Create a hash set, which handles the deduplication aspect for us, ensuring we have N unique strings where N
        // is within the `unique_strs` range... and then convert it to `Vec<String>` for easier consumption.
        hash_set(str_gen, unique_strs).prop_map(|unique_strs| unique_strs.into_iter().collect::<Vec<_>>())
    }

    #[test]
    fn basic() {
        let interner = create_interner(1024);

        let s1 = interner.try_intern("hello").unwrap();
        let s2 = interner.try_intern("world").unwrap();
        let s3 = interner.try_intern("hello").unwrap();

        assert_eq!(s1.deref(), "hello");
        assert_eq!(s2.deref(), "world");
        assert_eq!(s3.deref(), "hello");

        // The pointers from the interned strings should be the same, but not between the interned string and a pointer
        // to an equivalent (but not interned) string:
        assert!(std::ptr::eq(s1.deref() as *const _, s3.deref() as *const _));

        let local_hello = "hello";
        assert!(!std::ptr::eq(s1.deref() as *const _, local_hello as *const _));
    }

    #[test]
    fn try_intern_without_capacity() {
        // Big enough to fit a single "hello world!" string, but not big enough to fit two.
        let interner = create_interner(48);

        let s1 = interner.try_intern("hello world!");
        assert!(s1.is_some());

        let s2 = interner.try_intern("hello, world");
        assert!(s2.is_none());
    }

    #[test]
    fn reclaim_after_dropped() {
        let interner = create_interner(1024);

        let s1 = interner.try_intern("hello world!").expect("should not fail to intern");
        let s1_entry_len = entry_len(&s1);

        assert_eq!(interner.len(), 1);
        assert_eq!(available_len(&interner), 1024 - s1_entry_len);
        assert_eq!(reclaimed_len(&interner), 0);

        // Drop the interned string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(interner.len(), 0);
        assert_eq!(available_len(&interner), 1024);
        assert_eq!(reclaimed_len(&interner), 0);
    }

    #[test]
    fn interns_to_reclaimed_entry_with_leftover() {
        // We want to intern a string initially, which takes up almost all of the capacity, and then drop it so it gets
        // reclaimed. After that, we'll intern a much smaller string which should lead to utilizing that reclaimed
        // entry, but only a part of it. Finally, we'll intern another new string.
        //
        // The point is to demonstrate that our reclamation logic is sound in terms of allowing reclaimed entries to be
        // split while the search/insertion logic is operating.
        let capacity = 256;
        let interner = create_interner(capacity);

        // We craft four strings such that the first two (`s_large` and `s_medium1`) will take up enough capacity that
        // `s_small` can't possibly be interned in the available capacity. We'll also craft `s_medium2` so it can fit
        // within the reclaimed entry for `s_large` but takes enough capacity that `s_small` cannot fit in the leftover
        // reclaimed entry that we split off.
        let s_large = "99 bottles of beer on the wall, 99 bottles of beer! take one down, pass it around, 98 bottles of beer on the wall!";
        let s_medium1 = "no act of kindness, no matter how small, is ever wasted";
        let s_medium2 = "if you want to go fast, go alone; if you want to go far, go together";
        let s_small = "are you there god? it's me, margaret";

        let phase1_available_capacity = capacity - entry_len(s_large) - entry_len(s_medium1);
        assert!(phase1_available_capacity < entry_len(s_small));
        assert!((entry_len(s_large) - entry_len(s_medium2)) < entry_len(s_small));
        assert!(entry_len(s_medium2) < entry_len(s_large));

        // Phase 1: intern our two larger strings.
        let s1 = interner.try_intern(s_large).expect("should not fail to intern");
        let s2 = interner.try_intern(s_medium1).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(available_len(&interner), phase1_available_capacity);
        assert_eq!(reclaimed_len(&interner), 0);

        // Phase 2: drop `s_large` so it gets reclaimed.
        drop(s1);
        assert_eq!(interner.len(), 1);
        assert_eq!(reclaimed_len(&interner), 1);

        // Phase 3: intern `s_medium2`, which should fit in the reclaimed entry for `s_large`. This should leave a
        // small, split off reclaimed entry.
        let s3 = interner.try_intern(s_medium2).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 1);

        // Phase 4: intern `s_small`, which should not fit in the leftover reclaimed entry from `s_large` _or_ the
        // available capacity.
        let s4 = interner.try_intern(s_small);
        assert_eq!(s4, None);

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 1);

        // And make sure we can still dereference the interned strings we _do_ have left:
        assert_eq!(s2.deref(), s_medium1);
        assert_eq!(s3.deref(), s_medium2);
    }

    #[test]
    fn has_reclaimed_entries_string_fits_exactly() {
        // The interner is large enough to fit two of the identically-sized strings, but not all three. We show that
        // when we drop an entry and it is reclaimed, a string of identical size should always be able to reuse that
        // reclaimed entry.
        const S1_VALUE: &str = "hello world!";
        const S2_VALUE: &str = "hello, world";
        const S3_VALUE: &str = "hello--world";

        let interner = create_interner(80);

        // Intern the first two strings, which should fit without issue.
        let s1 = interner.try_intern(S1_VALUE).expect("should not fail to intern");
        let s1_reclaimed_expected = get_reclaimed_entry_for_string(&s1);
        let _s2 = interner.try_intern(S2_VALUE).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);

        // Try to intern a third string, which should fail as we don't have the space.
        let s3 = interner.try_intern(S3_VALUE);
        assert_eq!(s3, None);

        // Drop the first string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(interner.len(), 1);
        assert_eq!(reclaimed_len(&interner), 1);

        let s1_reclaimed = first_reclaimed_entry(&interner);
        assert_eq!(s1_reclaimed_expected, s1_reclaimed);

        // Try again to intern the third string, which should now succeed and take over the reclaimed entry entirely
        // as the strings are identical in length.
        let _s3 = interner.try_intern(S3_VALUE).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);
    }

    #[test]
    fn reclaimed_entry_reuse_split_too_small() {
        // This situation is slightly contrived, but: we want to test that when there's a reclaimed entry of a certain
        // size, reusing that reclaimed entry won't lead to it being split if the resulting split entry would be too
        // "small": unable to hold another minimum-sized entry.
        //
        // We have to intern three strings to do this because we only track reclaimed entries when they're followed by
        // in-use entries, and the string we drop to create a reclaimed entry has to be big enough, but not _too_ big,
        // to hold the string we want to intern after it.
        let interner = create_interner(128);

        // Declare our strings to intern and just check some preconditions by hand.
        let s_one = "a horse, a horse, my kingdom for a horse!";
        let s_one_entry_len = entry_len(s_one);
        let s_two = "why hello there, beautiful";
        let s_two_entry_len = entry_len(s_two);
        let s_three = "real gs move in silence like lasagna";
        let s_three_entry_len = entry_len(s_three);

        assert!(s_one_entry_len <= interner.capacity_bytes());
        assert!(s_two_entry_len <= interner.capacity_bytes());
        assert!(s_one_entry_len + s_two_entry_len + s_three_entry_len > interner.capacity_bytes());
        assert!(s_one_entry_len > s_two_entry_len);
        assert!(s_three_entry_len > s_two_entry_len);
        assert!((s_one_entry_len - s_three_entry_len) < MINIMUM_ENTRY_LEN);

        // Intern the first two strings, which should fit without issue.
        let s1 = interner.try_intern(s_one).expect("should not fail to intern");
        let s1_reclaimed_expected = get_reclaimed_entry_for_string(&s1);
        let _s2 = interner.try_intern(s_two).expect("should not fail to intern");

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);

        // Try to intern the third string, which should fail as we don't have the space.
        let s3 = interner.try_intern(s_three);
        assert_eq!(s3, None);

        // Drop the first string, which should decrement the reference count to zero and then reclaim the entry.
        drop(s1);

        assert_eq!(interner.len(), 1);
        assert_eq!(reclaimed_len(&interner), 1);

        let s1_reclaimed = first_reclaimed_entry(&interner);
        assert_eq!(s1_reclaimed_expected, s1_reclaimed);

        // Try again to intern the third string, which should now succeed and take over the reclaimed entry, but since
        // the remainder of the reclaimed entry after taking the necessary capacity for `s_three` is not large enough
        // (`MINIMUM_ENTRY_LEN`), we shouldn't end up splitting the reclaimed entry, and instead, `s3` should consume
        // the entire reclaimed entry.
        let s3 = interner.try_intern(s_three).expect("should not fail to intern");
        let s3_reclaimed_expected = get_reclaimed_entry_for_string(&s3);

        assert_eq!(interner.len(), 2);
        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(s1_reclaimed_expected, s3_reclaimed_expected);
    }

    #[test]
    fn reclaimed_entry_adjacent_to_spare_capacity() {
        let interner = create_interner(128);

        // Intern two smallish strings that fit without issue.
        let s1 = interner.try_intern("hello, world!").expect("should not fail to intern");
        let s2 = interner
            .try_intern("cheeeeeehooooo!")
            .expect("should not fail to intern");
        let s1_entry_len = entry_len(&s1);
        let s2_entry_len = entry_len(&s2);

        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(available_len(&interner), 128 - s1_entry_len - s2_entry_len);

        drop(s2);
        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(available_len(&interner), 128 - s1_entry_len);

        drop(s1);
        assert_eq!(reclaimed_len(&interner), 0);
        assert_eq!(available_len(&interner), 128);
    }

    #[test]
    fn len_bytes_reps_active_interned_entries() {
        let interner = create_interner(256);
        let mut active_interned_entries_len = 0;
        let string_1 = "hello world!";
        let string_2 = "hello again, world";
        let string_3 = "this is another string";

        // Intern a string.
        let s1 = interner.try_intern(string_1).expect("should not fail to intern");
        active_interned_entries_len += entry_len(string_1);
        assert_eq!(interner.len(), 1);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);

        // Intern a second string.
        let _s2 = interner.try_intern(string_2).expect("should not fail to intern");
        active_interned_entries_len += entry_len(string_2);
        assert_eq!(interner.len(), 2);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);

        // Drop the first string.
        drop(s1);
        active_interned_entries_len -= entry_len(string_1);
        assert_eq!(interner.len(), 1);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);

        // Intern a new string.
        let _s3 = interner.try_intern(string_3).expect("should not fail to intern");
        active_interned_entries_len += entry_len(string_3);
        assert_eq!(interner.len(), 2);
        assert_eq!(interner.len_bytes(), active_interned_entries_len);
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]
        fn property_test_entry_count_accurate(
            strs in arb_alphanum_strings(1..=128, 16..=512),
            indices in arb_vec(any::<Index>(), 1..=1000),
        ) {
            // We ask `proptest` to generate a bunch of unique strings of varying lengths (1-128 bytes, 16-512 unique
            // strings), and then randomly select from those strings the ones we want to intern. The goal here is to
            // potentially select the same string multiple times, to exercise the actual interning logic... but
            // practically, to ensure that when we intern a string that has already been interned, we're not
            // incrementing the entries count again.

            // Create an interner with enough capacity to hold all of the strings we've generated. This is the maximum
            // string size multiplied by the number of strings we've generated... plus a little constant factor per
            // string to account for the entry header.
            const ENTRY_SIZE: usize = 128 + HEADER_LEN;
            let interner = create_interner(ENTRY_SIZE * indices.len());

            // For each index, pull out the string and both track it in `unique_strs` and intern it. We hold on to the
            // interned string handle to make sure the interned string is actually kept alive, keeping the entry count
            // stable.
            let mut interned = Vec::new();
            let mut unique_strs = HashSet::new();
            for index in &indices {
                let s = index.get(&strs);
                unique_strs.insert(s);

                let s_interned = interner.try_intern(s).expect("should never fail to intern");
                interned.push(s_interned);
            }

            assert_eq!(unique_strs.len(), interner.len());
        }
    }

    #[cfg(feature = "loom")]
    #[test]
    fn concurrent_drop_and_intern() {
        fn reclaimed_entries(interner: &GenericMapInterner) -> Vec<ReclaimedEntry> {
            interner
                .state
                .lock()
                .unwrap()
                .storage
                .reclaimed
                .iter()
                .copied()
                .collect()
        }

        fn do_reclaimed_entries_overlap(a: ReclaimedEntry, b: ReclaimedEntry) -> bool {
            let a_start = a.offset();
            let a_end = a.offset() + a.capacity() - 1;

            let b_start = b.offset();
            let b_end = b.offset() + b.capacity() - 1;

            (a_start <= b_start && b_start <= a_end) || (a_start <= b_end && b_end <= a_end)
        }

        const STRING_TO_INTERN: &str = "hello, world!";

        // This test is meant to explore the thread orderings when one thread is trying to drop (and thus reclaim) the
        // last active reference to an interned string, and another thread is trying to intern that very same string.
        //
        // We accept, as a caveat, that a possible outcome is that we intern the "new" string again, even though an
        // existing entry to that string may have existed in an alternative ordering.
        loom::model(|| {
            let interner = create_interner(1024);
            let t2_interner = interner.clone();

            // Intern the string from thread T1.
            let t1_interned_s = interner
                .try_intern(STRING_TO_INTERN)
                .expect("should not fail to intern");
            assert_eq!(t1_interned_s.deref(), STRING_TO_INTERN);
            let t1_reclaimed_entry = get_reclaimed_entry_for_string(&t1_interned_s);

            // Spawn thread T2, which tries to intern the same string and returns the handle to it.
            let t2_result = loom::thread::spawn(move || {
                let interned_s = t2_interner
                    .try_intern(STRING_TO_INTERN)
                    .expect("should not fail to intern");
                let reclaimed_entry = get_reclaimed_entry_for_string(&interned_s);

                (interned_s, reclaimed_entry)
            });

            drop(t1_interned_s);

            let (t2_interned_s, t2_reclaimed_entry) = t2_result.join().expect("should not fail to join T2");
            assert_eq!(t2_interned_s.deref(), STRING_TO_INTERN);

            // What we're checking for here is that either:
            // - there are no reclaimed entries (T2 found the existing entry for the string before T1 dropped it)
            // - there's a reclaimed entry (T2 didn't find the existing entry for the string before T1 marked it as
            //   inactive) but the reclaimed entry does _not_ overlap with the interned string from T2, meaning we
            //   didn't get confused and allow T2 to use an existing entry that T1 then later marked as reclaimed
            let reclaimed_entries = reclaimed_entries(&interner);
            assert!(reclaimed_entries.len() <= 1, "should have at most one reclaimed entry");

            if !reclaimed_entries.is_empty() {
                // If we do have a reclaimed entry, it needs to match exactly with only one of the interned strings.
                let is_t1_entry = reclaimed_entries.first().unwrap() == &t1_reclaimed_entry;
                let is_t2_entry = reclaimed_entries.first().unwrap() == &t2_reclaimed_entry;

                assert!(
                    (is_t1_entry || is_t2_entry) && !(is_t1_entry && is_t2_entry),
                    "should only match one interned string"
                );

                // Additionally, we ensure that the reclaimed entry does not overlap with the other interned string.
                assert!(
                    !do_reclaimed_entries_overlap(t1_reclaimed_entry, t2_reclaimed_entry),
                    "reclaimed entry should not overlap with remaining interned string"
                );
            }
        });
    }
}