BitShares-Core  5.0.0
BitShares blockchain implementation and command-line interface software
bloom_filter.hpp
Go to the documentation of this file.
1 /*
2  *********************************************************************
3  * *
4  * Open Bloom Filter *
5  * *
6  * Author: Arash Partow - 2000 *
7  * URL: http://www.partow.net *
8  * URL: http://www.partow.net/programming/hashfunctions/index.html *
9  * *
10  * Copyright notice: *
11  * Free use of the Open Bloom Filter Library is permitted under the *
12  * guidelines and in accordance with the MIT License. *
13  * http://www.opensource.org/licenses/MIT *
14  * *
15  *********************************************************************
16 */
17 
18 
19 #ifndef INCLUDE_BLOOM_FILTER_HPP
20 #define INCLUDE_BLOOM_FILTER_HPP
21 
22 #include <algorithm>
23 #include <cmath>
24 #include <cstddef>
25 #include <cstdlib>
26 #include <iterator>
27 #include <limits>
28 #include <string>
29 #include <vector>
30 
31 namespace fc {
32 
33 static constexpr std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
34 
35 static const unsigned char bit_mask[bits_per_char] = {
36  0x01, //00000001
37  0x02, //00000010
38  0x04, //00000100
39  0x08, //00001000
40  0x10, //00010000
41  0x20, //00100000
42  0x40, //01000000
43  0x80 //10000000
44  };
45 
47 {
48 public:
49 
51  : minimum_size(1),
52  maximum_size(std::numeric_limits<unsigned long long int>::max()),
54  maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
57  random_seed(0xA5A5A5A55A5A5A5AULL)
58  {}
59 
60  bloom_parameters(unsigned long long int projected_element_count,
62  unsigned long long int maximum_size) :
63  minimum_size(1),
64  maximum_size(maximum_size),
66  maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
67  projected_element_count(projected_element_count),
68  false_positive_probability(false_positive_probability),
69  random_seed(0xA5A5A5A55A5A5A5AULL)
70  {
72  }
73 
75  {}
76 
77  inline bool operator!()
78  {
79  return (minimum_size > maximum_size) ||
82  (0 == maximum_number_of_hashes) ||
83  (0 == projected_element_count) ||
85  (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
86  (0 == random_seed) ||
87  (0xFFFFFFFFFFFFFFFFULL == random_seed);
88  }
89 
90  // Allowable min/max size of the bloom filter in bits
91  unsigned long long int minimum_size;
92  unsigned long long int maximum_size;
93 
94  // Allowable min/max number of hash functions
97 
98  // The approximate number of elements to be inserted
99  // into the bloom filter, should be within one order
100  // of magnitude. The default is 10000.
101  unsigned long long int projected_element_count;
102 
103  // The approximate false positive probability expected
104  // from the bloom filter. The default is assumed to be
105  // the reciprocal of the projected_element_count.
107 
108  unsigned long long int random_seed;
109 
111  {
113  : number_of_hashes(0),
114  table_size(0)
115  {}
116 
117  unsigned int number_of_hashes;
118  unsigned long long int table_size;
119  };
120 
122 
124  {
125  /*
126  Note:
127  The following will attempt to find the number of hash functions
128  and minimum amount of storage bits required to construct a bloom
129  filter consistent with the user defined false positive probability
130  and estimated element insertion count.
131  */
132 
133  if (!(*this))
134  return false;
135 
136  double min_m = std::numeric_limits<double>::infinity();
137  double min_k = 0.0;
138  double k = 1.0;
139 
140  while (k < 1000.0)
141  {
142  const double numerator = (- k * projected_element_count);
143  const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
144 
145  const double curr_m = numerator / denominator;
146 
147  if (curr_m < min_m)
148  {
149  min_m = curr_m;
150  min_k = k;
151  }
152 
153  k += 1.0;
154  }
155 
157 
158  optp.number_of_hashes = static_cast<unsigned int>(min_k);
159 
160  optp.table_size = static_cast<unsigned long long int>(min_m);
161 
162  optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
163 
164  if (optp.number_of_hashes < minimum_number_of_hashes)
166  else if (optp.number_of_hashes > maximum_number_of_hashes)
168 
169  if (optp.table_size < minimum_size)
170  optp.table_size = minimum_size;
171  else if (optp.table_size > maximum_size)
172  optp.table_size = maximum_size;
173 
174  return true;
175  }
176 
177 };
178 
180 {
181 protected:
182 
183  typedef unsigned int bloom_type;
184  typedef unsigned char cell_type;
185  typedef std::vector<unsigned char> table_type;
186 
187 public:
188 
190  : salt_count_(0),
191  table_size_(0),
192  projected_element_count_(0),
193  inserted_element_count_ (0),
194  random_seed_(0),
195  desired_false_positive_probability_(0.0)
196  {}
197 
199  : projected_element_count_(p.projected_element_count),
200  inserted_element_count_(0),
201  random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
202  desired_false_positive_probability_(p.false_positive_probability)
203  {
204  salt_count_ = p.optimal_parameters.number_of_hashes;
205  table_size_ = p.optimal_parameters.table_size;
206 
207  generate_unique_salt();
208 
209  bit_table_.resize(table_size_ / bits_per_char, static_cast<unsigned char>(0x00));
210  }
211 
213  {
214  this->operator=(filter);
215  }
216 
217  inline bool operator == (const bloom_filter& f) const
218  {
219  if (this != &f)
220  {
221  return
222  (salt_count_ == f.salt_count_ ) &&
223  (table_size_ == f.table_size_ ) &&
224  (bit_table_.size() == f.bit_table_.size() ) &&
225  (projected_element_count_ == f.projected_element_count_ ) &&
226  (inserted_element_count_ == f.inserted_element_count_ ) &&
227  (random_seed_ == f.random_seed_ ) &&
228  (desired_false_positive_probability_ == f.desired_false_positive_probability_) &&
229  (salt_ == f.salt_ ) &&
230  (bit_table_ == f.bit_table_ ) ;
231  }
232  else
233  return true;
234  }
235 
236  inline bool operator != (const bloom_filter& f) const
237  {
238  return !operator==(f);
239  }
240 
241  inline bloom_filter& operator = (const bloom_filter& f)
242  {
243  if (this != &f)
244  {
245  salt_count_ = f.salt_count_;
246  table_size_ = f.table_size_;
247  bit_table_ = f.bit_table_;
248  salt_ = f.salt_;
249 
250  projected_element_count_ = f.projected_element_count_;
251  inserted_element_count_ = f.inserted_element_count_;
252 
253  random_seed_ = f.random_seed_;
254 
255  desired_false_positive_probability_ = f.desired_false_positive_probability_;
256  }
257 
258  return *this;
259  }
260 
261  virtual ~bloom_filter()
262  {}
263 
264  inline bool operator!() const
265  {
266  return (0 == table_size_);
267  }
268 
269  inline void clear()
270  {
271  std::fill(bit_table_.begin(), bit_table_.end(), static_cast<unsigned char>(0x00));
272  inserted_element_count_ = 0;
273  }
274 
275  inline void insert(const unsigned char* key_begin, const std::size_t& length)
276  {
277  std::size_t bit_index = 0;
278  std::size_t bit = 0;
279 
280  for (std::size_t i = 0; i < salt_.size(); ++i)
281  {
282  compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);
283 
284  bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
285  }
286 
287  ++inserted_element_count_;
288  }
289 
290  template <typename T>
291  inline void insert(const T& t)
292  {
293  // Note: T must be a C++ POD type.
294  insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
295  }
296 
297  inline void insert(const std::string& key)
298  {
299  insert(reinterpret_cast<const unsigned char*>(key.data()),key.size());
300  }
301 
302  inline void insert(const char* data, const std::size_t& length)
303  {
304  insert(reinterpret_cast<const unsigned char*>(data),length);
305  }
306 
307  template <typename InputIterator>
308  inline void insert(const InputIterator begin, const InputIterator end)
309  {
310  InputIterator itr = begin;
311 
312  while (end != itr)
313  {
314  insert(*(itr++));
315  }
316  }
317 
318  inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
319  {
320  std::size_t bit_index = 0;
321  std::size_t bit = 0;
322 
323  for (std::size_t i = 0; i < salt_.size(); ++i)
324  {
325  compute_indices(hash_ap(key_begin, length, salt_[i]), bit_index, bit);
326 
327  if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
328  {
329  return false;
330  }
331  }
332 
333  return true;
334  }
335 
336  template <typename T>
337  inline bool contains(const T& t) const
338  {
339  return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
340  }
341 
342  inline bool contains(const std::string& key) const
343  {
344  return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
345  }
346 
347  inline bool contains(const char* data, const std::size_t& length) const
348  {
349  return contains(reinterpret_cast<const unsigned char*>(data),length);
350  }
351 
352  template <typename InputIterator>
353  inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
354  {
355  InputIterator itr = begin;
356 
357  while (end != itr)
358  {
359  if (!contains(*itr))
360  {
361  return itr;
362  }
363 
364  ++itr;
365  }
366 
367  return end;
368  }
369 
370  template <typename InputIterator>
371  inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
372  {
373  InputIterator itr = begin;
374 
375  while (end != itr)
376  {
377  if (contains(*itr))
378  {
379  return itr;
380  }
381 
382  ++itr;
383  }
384 
385  return end;
386  }
387 
388  inline virtual unsigned long long int size() const
389  {
390  return table_size_;
391  }
392 
393  inline unsigned long long int element_count() const
394  {
395  return inserted_element_count_;
396  }
397 
398  inline double effective_fpp() const
399  {
400  /*
401  Note:
402  The effective false positive probability is calculated using the
403  designated table size and hash function count in conjunction with
404  the current number of inserted elements - not the user defined
405  predicated/expected number of inserted elements.
406  */
407  return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
408  }
409 
410  inline bloom_filter& operator &= (const bloom_filter& f)
411  {
412  /* intersection */
413  if (
414  (salt_count_ == f.salt_count_ ) &&
415  (table_size_ == f.table_size_ ) &&
416  (random_seed_ == f.random_seed_)
417  )
418  {
419  for (std::size_t i = 0; i < bit_table_.size(); ++i)
420  {
421  bit_table_[i] &= f.bit_table_[i];
422  }
423  }
424 
425  return *this;
426  }
427 
428  inline bloom_filter& operator |= (const bloom_filter& f)
429  {
430  /* union */
431  if (
432  (salt_count_ == f.salt_count_ ) &&
433  (table_size_ == f.table_size_ ) &&
434  (random_seed_ == f.random_seed_)
435  )
436  {
437  for (std::size_t i = 0; i < bit_table_.size(); ++i)
438  {
439  bit_table_[i] |= f.bit_table_[i];
440  }
441  }
442 
443  return *this;
444  }
445 
446  inline bloom_filter& operator ^= (const bloom_filter& f)
447  {
448  /* difference */
449  if (
450  (salt_count_ == f.salt_count_ ) &&
451  (table_size_ == f.table_size_ ) &&
452  (random_seed_ == f.random_seed_)
453  )
454  {
455  for (std::size_t i = 0; i < bit_table_.size(); ++i)
456  {
457  bit_table_[i] ^= f.bit_table_[i];
458  }
459  }
460 
461  return *this;
462  }
463 
464  inline const cell_type* table() const
465  {
466  return bit_table_.data();
467  }
468 
469  inline std::size_t hash_count()
470  {
471  return salt_.size();
472  }
473 
474 protected:
475 
476  inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
477  {
478  bit_index = hash % table_size_;
479  bit = bit_index % bits_per_char;
480  }
481 
483  {
484  /*
485  Note:
486  A distinct hash function need not be implementation-wise
487  distinct. In the current implementation "seeding" a common
488  hash function with different values seems to be adequate.
489  */
490  const unsigned int predef_salt_count = 128;
491 
492  static const bloom_type predef_salt[predef_salt_count] =
493  {
494  0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
495  0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
496  0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
497  0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
498  0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
499  0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
500  0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
501  0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
502  0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
503  0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
504  0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
505  0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
506  0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
507  0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
508  0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
509  0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
510  0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
511  0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
512  0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
513  0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
514  0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
515  0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
516  0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
517  0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
518  0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
519  0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
520  0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
521  0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
522  0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
523  0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
524  0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
525  0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
526  };
527 
528  if (salt_count_ <= predef_salt_count)
529  {
530  std::copy(predef_salt,
531  predef_salt + salt_count_,
532  std::back_inserter(salt_));
533 
534  for (std::size_t i = 0; i < salt_.size(); ++i)
535  {
536  /*
537  Note:
538  This is done to integrate the user defined random seed,
539  so as to allow for the generation of unique bloom filter
540  instances.
541  */
542  salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
543  }
544  }
545  else
546  {
547  std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
548 
549  srand(static_cast<unsigned int>(random_seed_));
550 
551  while (salt_.size() < salt_count_)
552  {
553  bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
554 
555  if (0 == current_salt)
556  continue;
557 
558  if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
559  {
560  salt_.push_back(current_salt);
561  }
562  }
563  }
564  }
565 
566  inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
567  {
568  const unsigned char* itr = begin;
569  unsigned int loop = 0;
570 
571  while (remaining_length >= 8)
572  {
573  const unsigned int& i1 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
574  const unsigned int& i2 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
575 
576  hash ^= (hash << 7) ^ i1 * (hash >> 3) ^
577  (~((hash << 11) + (i2 ^ (hash >> 5))));
578 
579  remaining_length -= 8;
580  }
581 
582  if (remaining_length)
583  {
584  if (remaining_length >= 4)
585  {
586  const unsigned int& i = *(reinterpret_cast<const unsigned int*>(itr));
587 
588  if (loop & 0x01)
589  hash ^= (hash << 7) ^ i * (hash >> 3);
590  else
591  hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
592 
593  ++loop;
594 
595  remaining_length -= 4;
596 
597  itr += sizeof(unsigned int);
598  }
599 
600  if (remaining_length >= 2)
601  {
602  const unsigned short& i = *(reinterpret_cast<const unsigned short*>(itr));
603 
604  if (loop & 0x01)
605  hash ^= (hash << 7) ^ i * (hash >> 3);
606  else
607  hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
608 
609  ++loop;
610 
611  remaining_length -= 2;
612 
613  itr += sizeof(unsigned short);
614  }
615 
616  if (remaining_length)
617  {
618  hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop;
619  }
620  }
621 
622  return hash;
623  }
624 
625  public:
626  std::vector<bloom_type> salt_;
627  std::vector<unsigned char> bit_table_;
628  unsigned int salt_count_;
629  unsigned long long int table_size_;
630  unsigned long long int projected_element_count_;
631  unsigned long long int inserted_element_count_;
632  unsigned long long int random_seed_;
634 };
635 
637 {
638  bloom_filter result = a;
639  result &= b;
640  return result;
641 }
642 
644 {
645  bloom_filter result = a;
646  result |= b;
647  return result;
648 }
649 
651 {
652  bloom_filter result = a;
653  result ^= b;
654  return result;
655 }
656 
657 
658 } // namespace fc
659 
660 #endif
661 
662 
663 /*
664  Note 1:
665  If it can be guaranteed that bits_per_char will be of the form 2^n then
666  the following optimization can be used:
667 
668  bit_table_[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
669 
670  Note 2:
671  For performance reasons where possible when allocating memory it should
672  be aligned (aligned_alloc) according to the architecture being used.
673 */
bloom_filter operator&(const bloom_filter &a, const bloom_filter &b)
void insert(const unsigned char *key_begin, const std::size_t &length)
void insert(const char *data, const std::size_t &length)
bloom_filter operator|(const bloom_filter &a, const bloom_filter &b)
std::vector< unsigned char > bit_table_
void insert(const InputIterator begin, const InputIterator end)
bool operator!() const
std::vector< bloom_type > salt_
void generate_unique_salt()
InputIterator contains_none(const InputIterator begin, const InputIterator end) const
unsigned long long int random_seed_
unsigned long long int projected_element_count_
bloom_filter(const bloom_filter &filter)
unsigned int salt_count_
unsigned long long int element_count() const
optimal_parameters_t optimal_parameters
virtual ~bloom_filter()
bool operator!=(const optional< T > &left, const optional< T > &right)
Definition: optional.hpp:253
unsigned long long int table_size_
double desired_false_positive_probability_
virtual bool compute_optimal_parameters()
bloom_filter operator^(const bloom_filter &a, const bloom_filter &b)
unsigned int bloom_type
bool operator==(const optional< T > &left, const optional< T > &right)
Definition: optional.hpp:245
double effective_fpp() const
void insert(const std::string &key)
unsigned char cell_type
bloom_filter(const bloom_parameters &p)
unsigned long long int minimum_size
std::vector< unsigned char > table_type
unsigned int minimum_number_of_hashes
std::size_t hash_count()
const cell_type * table() const
unsigned long long int projected_element_count
unsigned long long int maximum_size
bool contains(const T &t) const
unsigned long long int inserted_element_count_
virtual bool contains(const unsigned char *key_begin, const std::size_t length) const
typename impl::filter< Filter, list<>, List >::type filter
Get a list with all elements that do not pass a filter removed.
Definition: typelist.hpp:205
void insert(const T &t)
bloom_parameters(unsigned long long int projected_element_count, double false_positive_probability, unsigned long long int maximum_size)
InputIterator contains_all(const InputIterator begin, const InputIterator end) const
Definition: api.hpp:15
bloom_type hash_ap(const unsigned char *begin, std::size_t remaining_length, bloom_type hash) const
virtual unsigned long long int size() const
virtual void compute_indices(const bloom_type &hash, std::size_t &bit_index, std::size_t &bit) const
unsigned long long int random_seed
void copy(const path &from, const path &to)
Definition: filesystem.cpp:241
bool contains(const char *data, const std::size_t &length) const
virtual ~bloom_parameters()
unsigned int maximum_number_of_hashes
bool contains(const std::string &key) const