1//! @file filter.cpp
2//! @author ryftchen
3//! @brief The definitions (filter) in the data structure module.
4//! @version 0.1.0
5//! @copyright Copyright (c) 2022-2025 ryftchen. All rights reserved.
6
7#include "filter.hpp"
8
9#include <cmath>
10#include <cstring>
11
12namespace data_structure::filter
13{
14//! @brief Function version number.
15//! @return version number (major.minor.patch)
16const char* version() noexcept
17{
18 static const char* const ver = "0.1.0";
19 return ver;
20}
21
22// NOLINTBEGIN(readability-magic-numbers)
23//! @brief The MurmurHash2 (64-bit) function.
24//! @param key - key to hash
25//! @param length - length of the key
26//! @param seed - hash seed
27//! @return hash value
28static std::uint64_t murmurHash2X64(const void* const key, const int length, const std::uint32_t seed) noexcept
29{
30 constexpr std::uint64_t mix = 0xC6A4A7935BD1E995;
31 constexpr int shift = 47;
32 const auto* data1 = static_cast<const std::uint64_t*>(key);
33 const std::uint64_t* end = data1 + (length / 8);
34 std::uint64_t hash = seed ^ (length * mix);
35 while (data1 != end)
36 {
37 std::uint64_t block = *data1++;
38 block *= mix;
39 block ^= block >> shift;
40 block *= mix;
41
42 hash ^= block;
43 hash *= mix;
44 }
45
46 const auto* data2 =
47 reinterpret_cast<const unsigned char*>(data1); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
48 switch (length & 7)
49 {
50 case 7:
51 hash ^= static_cast<std::uint64_t>(data2[6]) << 48;
52 [[fallthrough]];
53 case 6:
54 hash ^= static_cast<std::uint64_t>(data2[5]) << 40;
55 [[fallthrough]];
56 case 5:
57 hash ^= static_cast<std::uint64_t>(data2[4]) << 32;
58 [[fallthrough]];
59 case 4:
60 hash ^= static_cast<std::uint64_t>(data2[3]) << 24;
61 [[fallthrough]];
62 case 3:
63 hash ^= static_cast<std::uint64_t>(data2[2]) << 16;
64 [[fallthrough]];
65 case 2:
66 hash ^= static_cast<std::uint64_t>(data2[1]) << 8;
67 [[fallthrough]];
68 case 1:
69 hash ^= static_cast<std::uint64_t>(data2[0]);
70 hash *= mix;
71 [[fallthrough]];
72 default:
73 break;
74 };
75
76 hash ^= hash >> shift;
77 hash *= mix;
78 hash ^= hash >> shift;
79 return hash;
80}
81
82Bloom::Bloom(const std::uint32_t capacity, const double falsePositiveProb, const std::uint32_t hashSeed) :
83 capacity{capacity},
84 filterBitsNum{calculateParamM(n: capacity, p: falsePositiveProb)},
85 hashFuncNum{calculateParamK(m: filterBitsNum, n: capacity)},
86 hashSeed{hashSeed},
87 filterSize{filterBitsNum / byteBits},
88 filter{std::make_unique<std::uint8_t[]>(num: filterSize)},
89 hashPos{std::make_unique<std::uint32_t[]>(num: hashFuncNum)}
90{
91 std::memset(s: filter.get(), c: 0, n: filterSize * sizeof(std::uint8_t));
92 std::memset(s: hashPos.get(), c: 0, n: hashFuncNum * sizeof(std::uint32_t));
93}
94
95bool Bloom::insert(const void* const key, const int length)
96{
97 if (length <= 0)
98 {
99 return false;
100 }
101
102 bloomHash(key, length);
103 for (std::uint32_t i = 0; i < hashFuncNum; ++i)
104 {
105 setBit(filter: filter.get(), pos: hashPos[i]);
106 }
107 ++entries;
108 return entries <= capacity;
109}
110
111bool Bloom::mayContain(const void* const key, const int length)
112{
113 if (!key || (length <= 0))
114 {
115 return false;
116 }
117
118 bloomHash(key, length);
119 for (std::uint32_t i = 0; i < hashFuncNum; ++i)
120 {
121 if (getBit(filter: filter.get(), pos: hashPos[i]) == 0)
122 {
123 return false;
124 }
125 }
126 return true;
127}
128
129void Bloom::clear()
130{
131 entries = 0;
132 std::memset(s: filter.get(), c: 0, n: filterSize * sizeof(std::uint8_t));
133 std::memset(s: hashPos.get(), c: 0, n: hashFuncNum * sizeof(std::uint32_t));
134}
135
136void Bloom::bloomHash(const void* const key, const int length)
137{
138 const std::uint64_t hash1 = murmurHash2X64(key, length, seed: hashSeed);
139 const std::uint64_t hash2 = murmurHash2X64(key, length, seed: static_cast<std::uint32_t>((hash1 >> 32) ^ hash1));
140 for (std::uint32_t i = 0; i < hashFuncNum; ++i)
141 {
142 hashPos[i] = (hash1 + i * hash2) % filterBitsNum;
143 }
144}
145
146void Bloom::setBit(std::uint8_t filter[], const std::uint32_t pos)
147{
148 filter[pos / byteBits] |= 1 << (pos % byteBits);
149}
150
151std::uint8_t Bloom::getBit(const std::uint8_t filter[], const std::uint32_t pos)
152{
153 return filter[pos / byteBits] & (1 << (pos % byteBits));
154}
155
156std::uint32_t Bloom::calculateParamM(const std::uint32_t n, const double p)
157{
158 if ((n == 0) || (p <= 0.0) || (p >= 1.0))
159 {
160 return 0;
161 }
162
163 std::uint32_t m = std::ceil(x: -1.0 * n * std::log(x: p) / (std::numbers::ln2 * std::numbers::ln2));
164 m = (m - m % 64) + 64;
165 return m;
166}
167
168std::uint32_t Bloom::calculateParamK(const std::uint32_t m, const std::uint32_t n)
169{
170 if ((m == 0) || (n == 0))
171 {
172 return 0;
173 }
174
175 const std::uint32_t k = std::round(x: std::numbers::ln2 * m / n);
176 return k;
177}
178
179Quotient::Quotient(const std::uint8_t qBits, const std::uint8_t rBits, const std::uint32_t hashSeed) :
180 qBits{qBits},
181 rBits{rBits},
182 elemBits{static_cast<std::uint8_t>(rBits + 3)},
183 indexMask{lowMask(n: qBits)},
184 rMask{lowMask(n: rBits)},
185 elemMask{lowMask(n: elemBits)},
186 capacity{static_cast<std::uint64_t>(1 << qBits)},
187 hashSeed{hashSeed},
188 filterSize{filterSizeInBytes(q: qBits, r: rBits)},
189 filter{std::make_unique<std::uint64_t[]>(num: filterSize)}
190{
191 std::memset(s: filter.get(), c: 0, n: filterSize * sizeof(std::uint64_t));
192}
193
194bool Quotient::insert(const void* const key, const int length)
195{
196 return (key && (length > 0)) && insert(hash: quotientHash(key, length));
197}
198
199bool Quotient::mayContain(const void* const key, const int length)
200{
201 return (key && (length > 0)) && mayContain(hash: quotientHash(key, length));
202}
203
204bool Quotient::remove(const void* const key, const int length)
205{
206 return (key && (length > 0)) && remove(hash: quotientHash(key, length));
207}
208
209void Quotient::clear()
210{
211 entries = 0;
212 std::memset(s: filter.get(), c: 0, n: filterSize * sizeof(std::uint64_t));
213}
214
215bool Quotient::insert(const std::uint64_t hash)
216{
217 if (entries >= capacity)
218 {
219 return false;
220 }
221
222 const std::uint64_t hq = hashToQuotient(hash);
223 const std::uint64_t hr = hashToRemainder(hash);
224 const std::uint64_t hqElem = getElement(qf: *this, index: hq);
225 std::uint64_t entry = (hr << 3) & ~7;
226 if (isEmptyElement(elem: hqElem))
227 {
228 setElement(qf&: *this, index: hq, elem: setOccupied(entry));
229 ++entries;
230 return true;
231 }
232 if (!isOccupied(elem: hqElem))
233 {
234 setElement(qf&: *this, index: hq, elem: setOccupied(hqElem));
235 }
236
237 const std::uint64_t start = findRunIndex(quot: hq);
238 std::uint64_t slot = start;
239 if (isOccupied(elem: hqElem))
240 {
241 do
242 {
243 const std::uint64_t rem = getRemainder(elem: getElement(qf: *this, index: slot));
244 if (rem == hr)
245 {
246 return true;
247 }
248 if (rem > hr)
249 {
250 break;
251 }
252 slot = increase(qf: *this, index: slot);
253 }
254 while (isContinuation(elem: getElement(qf: *this, index: slot)));
255
256 if (slot == start)
257 {
258 const std::uint64_t oldHead = getElement(qf: *this, index: start);
259 setElement(qf&: *this, index: start, elem: setContinuation(oldHead));
260 }
261 else
262 {
263 entry = setContinuation(entry);
264 }
265 }
266
267 if (slot != hq)
268 {
269 entry = setShifted(entry);
270 }
271
272 insertAt(start: slot, elem: entry);
273 ++entries;
274 return true;
275}
276
277void Quotient::insertAt(const std::uint64_t start, const std::uint64_t elem)
278{
279 bool isEmpty = false;
280 std::uint64_t slot = start;
281 std::uint64_t currElem = elem;
282 do
283 {
284 std::uint64_t prevElem = getElement(qf: *this, index: slot);
285 isEmpty = isEmptyElement(elem: prevElem);
286 if (!isEmpty)
287 {
288 prevElem = setShifted(prevElem);
289 if (isOccupied(elem: prevElem))
290 {
291 currElem = setOccupied(currElem);
292 prevElem = clearOccupied(elem: prevElem);
293 }
294 }
295 setElement(qf&: *this, index: slot, elem: currElem);
296 currElem = prevElem;
297 slot = increase(qf: *this, index: slot);
298 }
299 while (!isEmpty);
300}
301
302void Quotient::insertFrom(const Quotient& qf)
303{
304 Iterator iter{};
305 start(qf, iter);
306 while (!done(qf, iter))
307 {
308 insert(hash: next(qf, iter));
309 }
310}
311
312bool Quotient::isCompatibleWith(const Quotient& qf) const
313{
314 return (qf.qBits == qBits) && (qf.rBits == rBits) && (qf.hashSeed == hashSeed);
315}
316
317bool Quotient::mayContain(const std::uint64_t hash)
318{
319 const std::uint64_t hq = hashToQuotient(hash);
320 const std::uint64_t hr = hashToRemainder(hash);
321 const std::uint64_t hqElem = getElement(qf: *this, index: hq);
322 if (!isOccupied(elem: hqElem))
323 {
324 return false;
325 }
326
327 std::uint64_t slot = findRunIndex(quot: hq);
328 do
329 {
330 const std::uint64_t rem = getRemainder(elem: getElement(qf: *this, index: slot));
331 if (rem == hr)
332 {
333 return true;
334 }
335 if (rem > hr)
336 {
337 return false;
338 }
339 slot = increase(qf: *this, index: slot);
340 }
341 while (isContinuation(elem: getElement(qf: *this, index: slot)));
342 return false;
343}
344
345bool Quotient::remove(const std::uint64_t hash)
346{
347 const std::uint64_t highBits = hash >> (qBits + rBits);
348 if (highBits)
349 {
350 return false;
351 }
352
353 const std::uint64_t hq = hashToQuotient(hash);
354 const std::uint64_t hr = hashToRemainder(hash);
355 std::uint64_t hqElem = getElement(qf: *this, index: hq);
356 if (!isOccupied(elem: hqElem) || (entries == 0))
357 {
358 return true;
359 }
360
361 std::uint64_t slot = findRunIndex(quot: hq);
362 std::uint64_t rem = 0;
363 do
364 {
365 rem = getRemainder(elem: getElement(qf: *this, index: slot));
366 if (rem >= hr)
367 {
368 break;
369 }
370 slot = increase(qf: *this, index: slot);
371 }
372 while (isContinuation(elem: getElement(qf: *this, index: slot)));
373 if (rem != hr)
374 {
375 return true;
376 }
377
378 const std::uint64_t entryToRemove = (slot == hq) ? hqElem : getElement(qf: *this, index: slot);
379 const bool toReplaceRunStart = isRunStart(elem: entryToRemove);
380 if (toReplaceRunStart)
381 {
382 const std::uint64_t nextElem = getElement(qf: *this, index: increase(qf: *this, index: slot));
383 if (!isContinuation(elem: nextElem))
384 {
385 hqElem = clearOccupied(elem: hqElem);
386 setElement(qf&: *this, index: hq, elem: hqElem);
387 }
388 }
389
390 removeAt(start: slot, quot: hq);
391 if (toReplaceRunStart)
392 {
393 const std::uint64_t nextElem = getElement(qf: *this, index: slot);
394 std::uint64_t updatedNextElem = isContinuation(elem: nextElem) ? clearContinuation(elem: nextElem) : nextElem;
395 if ((slot == hq) && isRunStart(elem: updatedNextElem))
396 {
397 updatedNextElem = clearShifted(elem: updatedNextElem);
398 }
399 if (updatedNextElem != nextElem)
400 {
401 setElement(qf&: *this, index: slot, elem: updatedNextElem);
402 }
403 }
404 --entries;
405 return true;
406}
407
408void Quotient::removeAt(const std::uint64_t start, const std::uint64_t quot)
409{
410 const std::uint64_t orig = start;
411 std::uint64_t slot = start;
412 std::uint64_t currElem = getElement(qf: *this, index: slot);
413 std::uint64_t scanPos = increase(qf: *this, index: slot);
414 std::uint64_t quotient = quot;
415 for (;;)
416 {
417 const std::uint64_t nextElem = getElement(qf: *this, index: scanPos);
418 if (isEmptyElement(elem: nextElem) || isClusterStart(elem: nextElem) || (scanPos == orig))
419 {
420 setElement(qf&: *this, index: slot, elem: 0);
421 return;
422 }
423
424 const bool isCurrOccupied = isOccupied(elem: currElem);
425 std::uint64_t updatedNextElem = nextElem;
426 if (isRunStart(elem: nextElem))
427 {
428 do
429 {
430 quotient = increase(qf: *this, index: quotient);
431 }
432 while (!isOccupied(elem: getElement(qf: *this, index: quotient)));
433
434 if (isCurrOccupied && (slot == quotient))
435 {
436 updatedNextElem = clearShifted(elem: nextElem);
437 }
438 }
439
440 setElement(qf&: *this, index: slot, elem: isCurrOccupied ? setOccupied(updatedNextElem) : clearOccupied(elem: updatedNextElem));
441 slot = scanPos;
442 scanPos = increase(qf: *this, index: scanPos);
443 currElem = nextElem;
444 }
445}
446
447std::uint64_t Quotient::quotientHash(const void* const key, const int length) const
448{
449 return murmurHash2X64(key, length, seed: hashSeed) & lowMask(n: qBits + rBits);
450}
451
452std::uint64_t Quotient::hashToQuotient(const std::uint64_t hash) const
453{
454 return (hash >> rBits) & indexMask;
455}
456
457std::uint64_t Quotient::hashToRemainder(const std::uint64_t hash) const
458{
459 return hash & rMask;
460}
461
462std::uint64_t Quotient::findRunIndex(const std::uint64_t quot) const
463{
464 std::uint64_t start = quot;
465 while (isShifted(elem: getElement(qf: *this, index: start)))
466 {
467 start = decrease(qf: *this, index: start);
468 }
469
470 std::uint64_t slot = start;
471 while (start != quot)
472 {
473 do
474 {
475 slot = increase(qf: *this, index: slot);
476 }
477 while (isContinuation(elem: getElement(qf: *this, index: slot)));
478
479 do
480 {
481 start = increase(qf: *this, index: start);
482 }
483 while (!isOccupied(elem: getElement(qf: *this, index: start)));
484 }
485 return slot;
486}
487
488void Quotient::start(const Quotient& qf, Iterator& iter)
489{
490 iter.visited = qf.entries;
491 if (qf.entries == 0)
492 {
493 return;
494 }
495
496 std::uint64_t start = 0;
497 for (start = 0; start < qf.capacity; ++start)
498 {
499 if (isClusterStart(elem: getElement(qf, index: start)))
500 {
501 break;
502 }
503 }
504
505 iter.visited = 0;
506 iter.index = start;
507}
508
509bool Quotient::done(const Quotient& qf, const Iterator& iter)
510{
511 return qf.entries == iter.visited;
512}
513
514std::uint64_t Quotient::next(const Quotient& qf, Iterator& iter)
515{
516 while (!done(qf, iter))
517 {
518 const std::uint64_t entry = getElement(qf, index: iter.index);
519 if (isClusterStart(elem: entry))
520 {
521 iter.quotient = iter.index;
522 }
523 else if (isRunStart(elem: entry))
524 {
525 std::uint64_t quotient = iter.quotient;
526 do
527 {
528 quotient = increase(qf, index: quotient);
529 }
530 while (!isOccupied(elem: getElement(qf, index: quotient)));
531 iter.quotient = quotient;
532 }
533
534 iter.index = increase(qf, index: iter.index);
535 if (!isEmptyElement(elem: entry))
536 {
537 const std::uint64_t quotient = iter.quotient;
538 const std::uint64_t rem = getRemainder(elem: entry);
539 const std::uint64_t hash = (quotient << qf.rBits) | rem;
540 ++iter.visited;
541 return hash;
542 }
543 }
544 return 0;
545}
546
547void Quotient::setElement(Quotient& qf, const std::uint64_t index, std::uint64_t elem)
548{
549 const std::size_t bitPos = qf.elemBits * index;
550 const std::size_t slotPos = bitPos % 64;
551 const int spillBits = (slotPos + qf.elemBits) - 64;
552 std::size_t tabPos = bitPos / 64;
553
554 elem &= qf.elemMask;
555 qf.filter[tabPos] &= ~(qf.elemMask << slotPos);
556 qf.filter[tabPos] |= elem << slotPos;
557 if (spillBits > 0)
558 {
559 ++tabPos;
560 qf.filter[tabPos] &= ~lowMask(n: spillBits);
561 qf.filter[tabPos] |= elem >> (qf.elemBits - spillBits);
562 }
563}
564
565std::uint64_t Quotient::getElement(const Quotient& qf, const std::uint64_t index)
566{
567 const std::size_t bitPos = qf.elemBits * index;
568 const std::size_t slotPos = bitPos % 64;
569 const int spillBits = (slotPos + qf.elemBits) - 64;
570 std::size_t tabPos = bitPos / 64;
571 std::uint64_t elem = (qf.filter[tabPos] >> slotPos) & qf.elemMask;
572
573 if (spillBits > 0)
574 {
575 ++tabPos;
576 const std::uint64_t x = qf.filter[tabPos] & lowMask(n: spillBits);
577 elem |= x << (qf.elemBits - spillBits);
578 }
579 return elem;
580}
581
582std::uint64_t Quotient::increase(const Quotient& qf, const std::uint64_t index)
583{
584 return (index + 1) & qf.indexMask;
585}
586
587std::uint64_t Quotient::decrease(const Quotient& qf, const std::uint64_t index)
588{
589 return (index - 1) & qf.indexMask;
590}
591
592bool Quotient::isOccupied(const std::uint64_t elem)
593{
594 return (elem & 1) != 0;
595}
596
597std::uint64_t Quotient::setOccupied(const std::uint64_t elem)
598{
599 return elem | 1;
600}
601
602std::uint64_t Quotient::clearOccupied(const std::uint64_t elem)
603{
604 return elem & ~1;
605}
606
607bool Quotient::isContinuation(const std::uint64_t elem)
608{
609 return (elem & 2) != 0;
610}
611
612std::uint64_t Quotient::setContinuation(const std::uint64_t elem)
613{
614 return elem | 2;
615}
616
617std::uint64_t Quotient::clearContinuation(const std::uint64_t elem)
618{
619 return elem & ~2;
620}
621
622bool Quotient::isShifted(const std::uint64_t elem)
623{
624 return (elem & 4) != 0;
625}
626
627std::uint64_t Quotient::setShifted(const std::uint64_t elem)
628{
629 return elem | 4;
630}
631
632std::uint64_t Quotient::clearShifted(const std::uint64_t elem)
633{
634 return elem & ~4;
635}
636
637std::uint64_t Quotient::getRemainder(const std::uint64_t elem)
638{
639 return elem >> 3;
640}
641
642bool Quotient::isEmptyElement(const std::uint64_t elem)
643{
644 return (elem & 7) == 0;
645}
646
647bool Quotient::isClusterStart(const std::uint64_t elem)
648{
649 return isOccupied(elem) && !isContinuation(elem) && !isShifted(elem);
650}
651
652bool Quotient::isRunStart(const std::uint64_t elem)
653{
654 return !isContinuation(elem) && (isOccupied(elem) || isShifted(elem));
655}
656
657std::uint64_t Quotient::lowMask(const std::uint64_t n)
658{
659 return (1ULL << n) - 1ULL;
660}
661
662std::uint64_t Quotient::filterSizeInBytes(const std::uint8_t q, const std::uint8_t r)
663{
664 if ((q == 0) || (r == 0) || ((q + r) > 64))
665 {
666 return 0;
667 }
668
669 const std::uint64_t bits = static_cast<std::uint64_t>(1 << q) * (r + 3);
670 const std::uint64_t bytes = bits / 8;
671 return (bits % 8) ? (bytes + 1) : bytes;
672}
673// NOLINTEND(readability-magic-numbers)
674} // namespace data_structure::filter
675