OILS / mycpp / gc_dict.h View on Github | oils.pub

483 lines, 273 significant
1// Hash table based on CPython's "compact dict":
2//
3// https://mail.python.org/pipermail/python-dev/2012-December/123028.html
4// https://code.activestate.com/recipes/578375/
5//
6// Main differences:
7// - It's type-specialized in C++ -- Dict<K, V>.
8// - It's integrated with our mark and sweep GC, using Slab<int>, Slab<K>, and
9// Slab<V>
10// - We use linear probing, not the pseudo-random number generator
11
12#ifndef MYCPP_GC_DICT_H
13#define MYCPP_GC_DICT_H
14
15#include "mycpp/comparators.h"
16#include "mycpp/gc_builtins.h"
17#include "mycpp/gc_list.h"
18#include "mycpp/hash.h"
19
// Non-negative entries in index_ are array indices into keys_ and values_.
// There are two special negative entries:
// (constexpr rather than const: these are compile-time constants, usable in
// constant expressions, and still have internal linkage in each TU)

// index_ value to say this Dict item was deleted (a tombstone).
constexpr int kDeletedEntry = -1;

// index_ value to say this Dict entry is free.
constexpr int kEmptyEntry = -2;

// Return value for find_kv_index(), not stored in index_.
constexpr int kNotFound = -3;

// Return value for hash_and_probe(), not stored in index_.
constexpr int kTooSmall = -4;
34
35// Helper for keys() and values()
36template <typename T>
37List<T>* ListFromDictSlab(Slab<T>* slab, int n) {
38 List<T>* result = Alloc<List<T>>();
39 result->reserve(n);
40
41 for (int i = 0; i < n; ++i) {
42 result->append(slab->items_[i]);
43 }
44 return result;
45}
46
// GlobalDict is layout-compatible with Dict (unit tests assert this), and it
// can be a true C global (incurs zero startup time)

template <typename K, typename V, int N>
class GlobalDict {
 public:
  int len_;       // number of entries
  int capacity_;  // number of k/v slots
  int index_len_;             // 0 for globals (see GLOBAL_DICT macro)
  GlobalSlab<int, N>* index_;  // nullptr for globals, so find_kv_index()
                               // falls back to linear search
  GlobalSlab<K, N>* keys_;
  GlobalSlab<V, N>* values_;
};
60
// Define a Dict<K, V>* named 'name' as a true C global with N entries, backed
// by the global slabs 'keys' and 'vals'.  index_ is left null and index_len_
// is 0, so lookups on these dicts take the linear-search path in
// find_kv_index().  (No // comments inside the macro: they would swallow the
// line-continuation backslashes.)
#define GLOBAL_DICT(name, K, V, N, keys, vals)                                 \
  GcGlobal<GlobalSlab<K, N>> _keys_##name = {ObjHeader::Global(TypeTag::Slab), \
                                             {.items_ = keys}};                \
  GcGlobal<GlobalSlab<V, N>> _vals_##name = {ObjHeader::Global(TypeTag::Slab), \
                                             {.items_ = vals}};                \
  GcGlobal<GlobalDict<K, V, N>> _dict_##name = {                               \
      ObjHeader::Global(TypeTag::Dict),                                        \
      {.len_ = N,                                                              \
       .capacity_ = N,                                                         \
       .index_len_ = 0,                                                        \
       .index_ = nullptr,                                                      \
       .keys_ = &_keys_##name.obj,                                             \
       .values_ = &_vals_##name.obj},                                          \
  };                                                                           \
  Dict<K, V>* name = reinterpret_cast<Dict<K, V>*>(&_dict_##name.obj);
76
// Dict<K, V>: GC-managed hash table (compact-dict design, see top of file).
// Must stay layout-compatible with GlobalDict.
template <class K, class V>
class Dict {
 public:
  // Empty dict: no slabs are allocated until the first set()/reserve().
  Dict()
      : len_(0),
        capacity_(0),
        index_len_(0),
        index_(nullptr),
        keys_(nullptr),
        values_(nullptr) {
  }

  // Dict literal: keys and values are paired up positionally, and must have
  // the same length.
  Dict(std::initializer_list<K> keys, std::initializer_list<V> values)
      : len_(0),
        capacity_(0),
        index_len_(0),
        index_(nullptr),
        keys_(nullptr),
        values_(nullptr) {
    DCHECK(keys.size() == values.size());
    auto v = values.begin();  // This simulates a "zip" loop
    for (auto key : keys) {
      // note: calls reserve(), and may allocate
      this->set(key, *v);
      ++v;
    }
  }

  // Reserve enough space for at LEAST this many key-value pairs.
  void reserve(int num_desired);

  // d[key] in Python: raises KeyError if not found
  V at(K key) const;

  // d.get(key) in Python. (Can't use this if V isn't a pointer!)
  V get(K key) const;

  // Get a key, but return a default if not found.
  // expr_parse.py uses this with OTHER_BALANCE
  V get(K key, V default_val) const;

  // Implements d[k] = v. May resize the dictionary.
  void set(K key, V val);

  // d.update(...) in Python: insert entries, overwriting duplicate keys.
  void update(List<Tuple2<K, V>*>* pairs);
  void update(Dict<K, V>* other);

  // Shallow copy: a new Dict with the same keys and values.
  Dict<K, V>* copy();

  // Keys as a List, in insertion order (until the first deletion).
  List<K>* keys() const;

  // Values as a List, in the same order as keys().
  List<V>* values() const;

  // Remove all entries, zeroing k/v storage for the GC.
  void clear();

  // Helper used by find_kv_index(), set(), mylib::dict_erase() in
  // gc_mylib.h
  // Returns either:
  // - the slot for an existing key, or an empty slot for a new key
  // - kTooSmall if the table is full
  int hash_and_probe(K key) const;

  // Helper used by at(), get(), dict_contains()
  // Given a key, returns either:
  // - an index into keys_ and values_
  // - kNotFound
  int find_kv_index(K key) const;

  static constexpr ObjHeader obj_header() {
    return ObjHeader::ClassFixed(field_mask(), sizeof(Dict));
  }

  int len_;        // number of entries (keys and values, almost dense)
  int capacity_;   // number of k/v slots
  int index_len_;  // number of index slots

  // These 3 slabs are resized at the same time.
  Slab<int>* index_;  // kEmptyEntry, kDeletedEntry, or a valid index into
                      // keys_ and values_
  Slab<K>* keys_;     // Dict<K, int>
  Slab<V>* values_;   // Dict<int, V>

  // A dict has 3 pointers the GC needs to follow.
  static constexpr uint32_t field_mask() {
    return maskbit(offsetof(Dict, index_)) | maskbit(offsetof(Dict, keys_)) |
           maskbit(offsetof(Dict, values_));
  }

  DISALLOW_COPY_AND_ASSIGN(Dict);

  // kItemSize is max of K and V size. That is, on 64-bit machines, the RARE
  // Dict<int, int> is smaller than other dicts
  static constexpr int kItemSize = sizeof(K) > sizeof(V) ? sizeof(K)
                                                         : sizeof(V);

  // Matches mark_sweep_heap.h
  static constexpr int kPoolBytes2 = 48 - sizeof(ObjHeader);
  static_assert(kPoolBytes2 % kItemSize == 0,
                "An integral number of items should fit in second pool");
  static constexpr int kNumItems2 = kPoolBytes2 / kItemSize;

  // Slab header overhead, measured in items.
  static const int kHeaderFudge = sizeof(ObjHeader) / kItemSize;
  static_assert(sizeof(ObjHeader) % kItemSize == 0,
                "Slab header size should be multiple of key size");

#if 0
  static constexpr int kMinBytes2 = 128 - sizeof(ObjHeader);
  static_assert(kMinBytes2 % kItemSize == 0,
                "An integral number of items should fit");
  static constexpr int kMinItems2 = kMinBytes2 / kItemSize;
#endif

  // Capacity policy: how many k/v pairs to actually allocate when the caller
  // asks for num_desired.
  int HowManyPairs(int num_desired) {
    // See gc_list.h for comments on nearly identical logic

    if (num_desired <= kNumItems2) {  // use full cell in pool 2
      return kNumItems2;
    }
#if 0
    if (num_desired <= kMinItems2) {  // 48 -> 128, not 48 -> 64
      return kMinItems2;
    }
#endif
    // Round up INCLUDING the slab header, then subtract it back out, so the
    // whole allocation lands on the rounded size.
    return RoundUp(num_desired + kHeaderFudge) - kHeaderFudge;
  }
};
203
204template <typename K, typename V>
205inline bool dict_contains(const Dict<K, V>* haystack, K needle) {
206 return haystack->find_kv_index(needle) != kNotFound;
207}
208
// Grow the dict so it can hold at least num_desired pairs.  Allocates fresh
// index/keys/values slabs and rehashes every existing entry into them.
// No-op when current capacity already suffices.
template <typename K, typename V>
void Dict<K, V>::reserve(int num_desired) {
  if (capacity_ >= num_desired) {
    return;  // Don't do anything if there's already enough space.
  }

  int old_len = len_;
  Slab<K>* old_k = keys_;
  Slab<V>* old_v = values_;

  // Calculate the number of keys and values we should have
  capacity_ = HowManyPairs(num_desired);

  // 1) Ensure index len a power of 2, to avoid expensive modulus % operation
  // 2) Introduce hash table load factor.  Use capacity_+1 to simulate ceil()
  // div, not floor() div.
  index_len_ = RoundUp((capacity_ + 1) * 5 / 4);
  // Strictly more index slots than k/v slots, i.e. load factor < 1.
  DCHECK(index_len_ > capacity_);

  index_ = NewSlab<int>(index_len_);
  for (int i = 0; i < index_len_; ++i) {
    index_->items_[i] = kEmptyEntry;
  }

  // These are DENSE, while index_ is sparse.
  keys_ = NewSlab<K>(capacity_);
  values_ = NewSlab<V>(capacity_);

  if (old_k != nullptr) {  // rehash if there were any entries
    // log("REHASH num_desired %d", num_desired);
    // set() re-inserts into the new slabs, rebuilding len_ from 0 and
    // dropping any tombstones from the old index.
    len_ = 0;
    for (int i = 0; i < old_len; ++i) {
      set(old_k->items_[i], old_v->items_[i]);
    }
  }
}
245
246template <typename K, typename V>
247V Dict<K, V>::at(K key) const {
248 int kv_index = find_kv_index(key);
249 if (kv_index == kNotFound) {
250 throw Alloc<KeyError>();
251 } else {
252 return values_->items_[kv_index];
253 }
254}
255
256template <typename K, typename V>
257V Dict<K, V>::get(K key) const {
258 int kv_index = find_kv_index(key);
259 if (kv_index == kNotFound) {
260 return nullptr;
261 } else {
262 return values_->items_[kv_index];
263 }
264}
265
266template <typename K, typename V>
267V Dict<K, V>::get(K key, V default_val) const {
268 int kv_index = find_kv_index(key);
269 if (kv_index == kNotFound) {
270 return default_val;
271 } else {
272 return values_->items_[kv_index];
273 }
274}
275
276template <typename K, typename V>
277List<K>* Dict<K, V>::keys() const {
278 return ListFromDictSlab<K>(keys_, len_);
279}
280
281template <typename K, typename V>
282List<V>* Dict<K, V>::values() const {
283 return ListFromDictSlab<V>(values_, len_);
284}
285
286template <typename K, typename V>
287void Dict<K, V>::clear() {
288 // Maintain invariant
289 for (int i = 0; i < index_len_; ++i) {
290 index_->items_[i] = kEmptyEntry;
291 }
292
293 if (keys_) {
294 memset(keys_->items_, 0, len_ * sizeof(K)); // zero for GC scan
295 }
296 if (values_) {
297 memset(values_->items_, 0, len_ * sizeof(V)); // zero for GC scan
298 }
299 len_ = 0;
300}
301
// TODO:
// - Special case to intern BigStr* when it's hashed? How?
// - Should we have wrappers like:
// - V GetAndIntern<V>(D, &string_key)
// - SetAndIntern<V>(D, &string_key, value)
// This will enable duplicate copies of the string to be garbage collected

// Linear-probe the index for `key`.  Returns:
// - the slot of an existing entry with this key, or
// - a free (possibly tombstone-recycled) slot where it may be inserted, or
// - kTooSmall when the caller must reserve() and retry.
template <typename K, typename V>
int Dict<K, V>::hash_and_probe(K key) const {
  if (capacity_ == 0) {
    return kTooSmall;
  }

  // Hash the key onto a slot in the index. If the first slot is occupied,
  // probe until an empty one is found.
  unsigned h = hash_key(key);
  // faster % using & -- assuming index_len_ is power of 2
  int init_bucket = h & (index_len_ - 1);

  // If we see a tombstone along the probing path, stash it.
  int open_slot = -1;

  for (int i = 0; i < index_len_; ++i) {
    // Start at init_bucket and wrap around

    // faster % using & -- assuming index_len_ is power of 2
    int slot = (i + init_bucket) & (index_len_ - 1);

    int kv_index = index_->items_[slot];
    DCHECK(kv_index < len_);
    // Optimistically this is the common case once the table has been populated.
    if (kv_index >= 0) {
      // Compare hashes first; only call keys_equal() on a hash match.
      unsigned h2 = hash_key(keys_->items_[kv_index]);
      if (h == h2 && keys_equal(keys_->items_[kv_index], key)) {
        return slot;
      }
    }

    if (kv_index == kEmptyEntry) {
      // Prefer recycling an earlier tombstone over the empty slot.
      if (open_slot != -1) {
        slot = open_slot;
      }
      // If there isn't room in the entry arrays, tell the caller to resize.
      return len_ < capacity_ ? slot : kTooSmall;
    }

    // Tombstone or collided keys unequal. Keep scanning.
    DCHECK(kv_index >= 0 || kv_index == kDeletedEntry);
    if (kv_index == kDeletedEntry && open_slot == -1) {
      // NOTE: We only record the open slot here. We DON'T return it. If we're
      // looking for a key that was written before this tombstone was written to
      // the index we should continue probing until we get to that key. If we
      // get to an empty index slot or the end of the index then we know we are
      // dealing with a new key and can safely replace the tombstone without
      // disrupting any existing keys.
      open_slot = slot;
    }
  }

  // Scanned the whole index without an empty slot; reuse a tombstone if we
  // found one and there's still room in the entry arrays.
  if (open_slot != -1) {
    return len_ < capacity_ ? open_slot : kTooSmall;
  }

  return kTooSmall;
}
366
367template <typename K, typename V>
368int Dict<K, V>::find_kv_index(K key) const {
369 if (index_ != nullptr) { // Common case
370 int pos = hash_and_probe(key);
371 if (pos == kTooSmall) {
372 return kNotFound;
373 }
374 DCHECK(pos >= 0);
375 int kv_index = index_->items_[pos];
376 if (kv_index < 0) {
377 return kNotFound;
378 }
379 return kv_index;
380 }
381
382 // Linear search on GlobalDict instances.
383 // TODO: Should we populate and compare their hash values?
384 for (int i = 0; i < len_; ++i) {
385 if (keys_equal(keys_->items_[i], key)) {
386 return i;
387 }
388 }
389
390 return kNotFound;
391}
392
// Implements d[k] = v.  Overwrites the value of an existing key; otherwise
// appends to the dense k/v arrays.  May grow (and rehash) the dict.
template <typename K, typename V>
void Dict<K, V>::set(K key, V val) {
  // Global dicts are immutable (and have no index to update).
  DCHECK(obj_header().heap_tag != HeapTag::Global);
  int pos = hash_and_probe(key);
  if (pos == kTooSmall) {
    // Grow, then re-probe: reserve() replaced index_/keys_/values_, so the
    // old slot is meaningless.
    reserve(len_ + 1);
    pos = hash_and_probe(key);
  }
  DCHECK(pos >= 0);

  int kv_index = index_->items_[pos];
  DCHECK(kv_index < len_);
  if (kv_index < 0) {
    // Write new entries to the end of the k/v arrays. This allows us to recall
    // insertion order until the first deletion.
    keys_->items_[len_] = key;
    values_->items_[len_] = val;
    index_->items_[pos] = len_;
    len_++;
    DCHECK(len_ <= capacity_);
  } else {
    // Existing key: just overwrite the value.
    values_->items_[kv_index] = val;
  }
}
417
// Python len(d): the number of entries currently in the dict.
template <typename K, typename V>
inline int len(const Dict<K, V>* d) {
  return d->len_;
}
422
423template <class K, class V>
424class DictIter {
425 public:
426 explicit DictIter(Dict<K, V>* D) : D_(D), pos_(0) {
427 }
428 void Next() {
429 pos_++;
430 }
431 bool Done() {
432 return pos_ == D_->len_;
433 }
434 K Key() {
435 return D_->keys_->items_[pos_];
436 }
437 V Value() {
438 return D_->values_->items_[pos_];
439 }
440
441 private:
442 const Dict<K, V>* D_;
443 int pos_;
444};
445
446// dict(mylist) converts a list of (k, v) tuples into a dict
447template <typename K, typename V>
448Dict<K, V>* dict(List<Tuple2<K, V>*>* l) {
449 auto ret = Alloc<Dict<K, V>>();
450 ret->reserve(len(l));
451 for (ListIter<Tuple2<K, V>*> it(l); !it.Done(); it.Next()) {
452 ret->set(it.Value()->at0(), it.Value()->at1());
453 }
454 return ret;
455}
456
457template <class K, class V>
458void Dict<K, V>::update(List<Tuple2<K, V>*>* pairs) {
459 reserve(len_ + len(pairs));
460 for (ListIter<Tuple2<K, V>*> it(pairs); !it.Done(); it.Next()) {
461 set(it.Value()->at0(), it.Value()->at1());
462 }
463}
464
465template <class K, class V>
466void Dict<K, V>::update(Dict<K, V>* other) {
467 reserve(len_ + len(other));
468 for (DictIter<K, V> it(other); !it.Done(); it.Next()) {
469 set(it.Key(), it.Value());
470 }
471}
472
473template <class K, class V>
474Dict<K, V>* Dict<K, V>::copy() {
475 auto ret = Alloc<Dict<K, V>>();
476 ret->reserve(len_);
477 for (DictIter<K, V> it(this); !it.Done(); it.Next()) {
478 ret->set(it.Key(), it.Value());
479 }
480 return ret;
481}
482
483#endif // MYCPP_GC_DICT_H