BitShares-Core  4.0.0
BitShares blockchain implementation and command-line interface software
bloom_filter.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 /*
4  *********************************************************************
5  * *
6  * Open Bloom Filter *
7  * *
8  * Author: Arash Partow - 2000 *
9  * URL: http://www.partow.net *
10  * URL: http://www.partow.net/programming/hashfunctions/index.html *
11  * *
12  * Copyright notice: *
13  * Free use of the Open Bloom Filter Library is permitted under the *
14  * guidelines and in accordance with the most current version of the *
15  * Common Public License. *
16  * http://www.opensource.org/licenses/cpl1.0.php *
17  * *
18  *********************************************************************
19 */
20 
21 #include <algorithm>
22 #include <cmath>
23 #include <cstddef>
24 #include <iterator>
25 #include <limits>
26 #include <string>
27 #include <vector>
28 
29 #include <fc/reflect/reflect.hpp>
30 
31 namespace fc {
32 
33 
34 static constexpr std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
35 static const unsigned char bit_mask[bits_per_char] = {
36  0x01, //00000001
37  0x02, //00000010
38  0x04, //00000100
39  0x08, //00001000
40  0x10, //00010000
41  0x20, //00100000
42  0x40, //01000000
43  0x80 //10000000
44  };
45 
47 {
48 public:
49 
51  : minimum_size(1),
52  maximum_size(std::numeric_limits<unsigned long long int>::max()),
54  maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
57  random_seed(0xA5A5A5A55A5A5A5AULL)
58  {}
59 
60  bloom_parameters(unsigned long long int projected_element_count,
62  unsigned long long int maximum_size) :
63  minimum_size(1),
64  maximum_size(maximum_size),
66  maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
67  projected_element_count(projected_element_count),
68  false_positive_probability(false_positive_probability),
69  random_seed(0xA5A5A5A55A5A5A5AULL)
70  {
72  }
73 
75  {}
76 
77  inline bool operator!()
78  {
79  return (minimum_size > maximum_size) ||
82  (0 == maximum_number_of_hashes) ||
83  (0 == projected_element_count) ||
85  (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
86  (0 == random_seed) ||
87  (0xFFFFFFFFFFFFFFFFULL == random_seed);
88  }
89 
90  //Allowed min/max size of the bloom filter in bits
91  unsigned long long int minimum_size;
92  unsigned long long int maximum_size;
93 
94  //Allowed min/max number of hash functions
97 
98  //The approximate number of elements to be inserted
99  //into the bloom filter, should be within one order
100  //of magnitude. The default is 10000.
101  unsigned long long int projected_element_count;
102 
103  //The approximate false positive probability expected
104  //from the bloom filter. The default is the reciprocal
105  //of the projected_element_count.
107 
108  unsigned long long int random_seed;
109 
111  {
113  : number_of_hashes(0),
114  table_size(0)
115  {}
116 
117  unsigned int number_of_hashes;
118  unsigned long long int table_size;
119  };
120 
122 
124  {
125  /*
126  Note:
127  The following will attempt to find the number of hash functions
128  and minimum amount of storage bits required to construct a bloom
129  filter consistent with the user defined false positive probability
130  and estimated element insertion count.
131  */
132 
133  if (!(*this))
134  return false;
135 
136  double min_m = std::numeric_limits<double>::infinity();
137  double min_k = 0.0;
138  double curr_m = 0.0;
139  double k = 1.0;
140 
141  while (k < 1000.0)
142  {
143  double numerator = (- k * projected_element_count);
144  double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
145  curr_m = numerator / denominator;
146  if (curr_m < min_m)
147  {
148  min_m = curr_m;
149  min_k = k;
150  }
151  k += 1.0;
152  }
153 
155 
156  optp.number_of_hashes = static_cast<unsigned int>(min_k);
157  optp.table_size = static_cast<unsigned long long int>(min_m);
158  optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
159 
160  if (optp.number_of_hashes < minimum_number_of_hashes)
162  else if (optp.number_of_hashes > maximum_number_of_hashes)
164 
165  if (optp.table_size < minimum_size)
166  optp.table_size = minimum_size;
167  else if (optp.table_size > maximum_size)
168  optp.table_size = maximum_size;
169 
170  return true;
171  }
172 
173 };
174 
176 {
177 protected:
178 
179  typedef unsigned int bloom_type;
180  typedef unsigned char cell_type;
181 
182 public:
183 
185  : salt_count_(0),
186  table_size_(0),
187  raw_table_size_(0),
188  projected_element_count_(0),
189  inserted_element_count_(0),
190  random_seed_(0),
191  desired_false_positive_probability_(0.0)
192  {}
193 
195  : projected_element_count_(p.projected_element_count),
196  inserted_element_count_(0),
197  random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
198  desired_false_positive_probability_(p.false_positive_probability)
199  {
200  salt_count_ = p.optimal_parameters.number_of_hashes;
201  table_size_ = p.optimal_parameters.table_size;
202  generate_unique_salt();
203  raw_table_size_ = table_size_ / bits_per_char;
204 
205  bit_table_.resize( static_cast<std::size_t>(raw_table_size_) );
206  //bit_table_ = new cell_type[static_cast<std::size_t>(raw_table_size_)];
207  std::fill_n(bit_table_.data(),raw_table_size_,0x00);
208  }
209 
211  {
212  this->operator=(filter);
213  }
214 
215  inline bool operator == (const bloom_filter& f) const
216  {
217  if (this != &f)
218  {
219  return
220  (salt_count_ == f.salt_count_) &&
221  (table_size_ == f.table_size_) &&
222  (raw_table_size_ == f.raw_table_size_) &&
223  (projected_element_count_ == f.projected_element_count_) &&
224  (inserted_element_count_ == f.inserted_element_count_) &&
225  (random_seed_ == f.random_seed_) &&
226  (desired_false_positive_probability_ == f.desired_false_positive_probability_) &&
227  (salt_ == f.salt_) &&
228  std::equal(f.bit_table_.data(),f.bit_table_.data() + raw_table_size_,bit_table_.data());
229  }
230  else
231  return true;
232  }
233 
234  inline bool operator != (const bloom_filter& f) const
235  {
236  return !operator==(f);
237  }
238 
239  inline bloom_filter& operator = (const bloom_filter& f)
240  {
241  if (this != &f)
242  {
243  salt_count_ = f.salt_count_;
244  table_size_ = f.table_size_;
245  raw_table_size_ = f.raw_table_size_;
246  projected_element_count_ = f.projected_element_count_;
247  inserted_element_count_ = f.inserted_element_count_;
248  random_seed_ = f.random_seed_;
249  desired_false_positive_probability_ = f.desired_false_positive_probability_;
250  bit_table_.resize( raw_table_size_ );
251  std::copy(f.bit_table_.data(),f.bit_table_.data() + raw_table_size_,bit_table_.data());
252  salt_ = f.salt_;
253  }
254  return *this;
255  }
256 
257  virtual ~bloom_filter()
258  {
259  }
260 
261  inline bool operator!() const
262  {
263  return (0 == table_size_);
264  }
265 
266  inline void clear()
267  {
268  std::fill_n(bit_table_.data(),raw_table_size_,0x00);
269  inserted_element_count_ = 0;
270  }
271 
272  inline void insert(const unsigned char* key_begin, const std::size_t& length)
273  {
274  std::size_t bit_index = 0;
275  std::size_t bit = 0;
276  for (std::size_t i = 0; i < salt_.size(); ++i)
277  {
278  compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
279  bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
280  }
281  ++inserted_element_count_;
282  }
283 
284  template<typename T>
285  inline void insert(const T& t)
286  {
287  // Note: T must be a C++ POD type.
288  insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
289  }
290 
291  inline void insert(const std::string& key)
292  {
293  insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
294  }
295 
296  inline void insert(const char* data, const std::size_t& length)
297  {
298  insert(reinterpret_cast<const unsigned char*>(data),length);
299  }
300 
301  template<typename InputIterator>
302  inline void insert(const InputIterator begin, const InputIterator end)
303  {
304  InputIterator itr = begin;
305  while (end != itr)
306  {
307  insert(*(itr++));
308  }
309  }
310 
311  inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
312  {
313  std::size_t bit_index = 0;
314  std::size_t bit = 0;
315  for (std::size_t i = 0; i < salt_.size(); ++i)
316  {
317  compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
318  if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
319  {
320  return false;
321  }
322  }
323  return true;
324  }
325 
326  template<typename T>
327  inline bool contains(const T& t) const
328  {
329  return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
330  }
331 
332  inline bool contains(const std::string& key) const
333  {
334  return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
335  }
336 
337  inline bool contains(const char* data, const std::size_t& length) const
338  {
339  return contains(reinterpret_cast<const unsigned char*>(data),length);
340  }
341 
342  template<typename InputIterator>
343  inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
344  {
345  InputIterator itr = begin;
346  while (end != itr)
347  {
348  if (!contains(*itr))
349  {
350  return itr;
351  }
352  ++itr;
353  }
354  return end;
355  }
356 
357  template<typename InputIterator>
358  inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
359  {
360  InputIterator itr = begin;
361  while (end != itr)
362  {
363  if (contains(*itr))
364  {
365  return itr;
366  }
367  ++itr;
368  }
369  return end;
370  }
371 
372  inline virtual unsigned long long int size() const
373  {
374  return table_size_;
375  }
376 
377  inline std::size_t element_count() const
378  {
379  return inserted_element_count_;
380  }
381 
382  inline double effective_fpp() const
383  {
384  /*
385  Note:
386  The effective false positive probability is calculated using the
387  designated table size and hash function count in conjunction with
388  the current number of inserted elements - not the user defined
389  predicated/expected number of inserted elements.
390  */
391  return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
392  }
393 
394  inline bloom_filter& operator &= (const bloom_filter& f)
395  {
396  /* intersection */
397  if (
398  (salt_count_ == f.salt_count_) &&
399  (table_size_ == f.table_size_) &&
400  (random_seed_ == f.random_seed_)
401  )
402  {
403  for (std::size_t i = 0; i < raw_table_size_; ++i)
404  {
405  bit_table_[i] &= f.bit_table_[i];
406  }
407  }
408  return *this;
409  }
410 
411  inline bloom_filter& operator |= (const bloom_filter& f)
412  {
413  /* union */
414  if (
415  (salt_count_ == f.salt_count_) &&
416  (table_size_ == f.table_size_) &&
417  (random_seed_ == f.random_seed_)
418  )
419  {
420  for (std::size_t i = 0; i < raw_table_size_; ++i)
421  {
422  bit_table_[i] |= f.bit_table_[i];
423  }
424  }
425  return *this;
426  }
427 
428  inline bloom_filter& operator ^= (const bloom_filter& f)
429  {
430  /* difference */
431  if (
432  (salt_count_ == f.salt_count_) &&
433  (table_size_ == f.table_size_) &&
434  (random_seed_ == f.random_seed_)
435  )
436  {
437  for (std::size_t i = 0; i < raw_table_size_; ++i)
438  {
439  bit_table_[i] ^= f.bit_table_[i];
440  }
441  }
442  return *this;
443  }
444 
445  inline const cell_type* table() const
446  {
447  return bit_table_.data();
448  }
449 
450  inline std::size_t hash_count()
451  {
452  return salt_.size();
453  }
454 
455 protected:
456 
457  inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
458  {
459  bit_index = hash % table_size_;
460  bit = bit_index % bits_per_char;
461  }
462 
464  {
465  /*
466  Note:
467  A distinct hash function need not be implementation-wise
468  distinct. In the current implementation "seeding" a common
469  hash function with different values seems to be adequate.
470  */
471  const unsigned int predef_salt_count = 128;
472  static const bloom_type predef_salt[predef_salt_count] =
473  {
474  0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
475  0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
476  0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
477  0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
478  0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
479  0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
480  0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
481  0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
482  0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
483  0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
484  0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
485  0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
486  0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
487  0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
488  0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
489  0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
490  0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
491  0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
492  0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
493  0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
494  0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
495  0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
496  0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
497  0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
498  0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
499  0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
500  0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
501  0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
502  0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
503  0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
504  0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
505  0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
506  };
507 
508  if (salt_count_ <= predef_salt_count)
509  {
510  std::copy(predef_salt,
511  predef_salt + salt_count_,
512  std::back_inserter(salt_));
513  for (unsigned int i = 0; i < salt_.size(); ++i)
514  {
515  /*
516  Note:
517  This is done to integrate the user defined random seed,
518  so as to allow for the generation of unique bloom filter
519  instances.
520  */
521  salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
522  }
523  }
524  else
525  {
526  std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
527  srand(static_cast<unsigned int>(random_seed_));
528  while (salt_.size() < salt_count_)
529  {
530  bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
531  if (0 == current_salt) continue;
532  if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
533  {
534  salt_.push_back(current_salt);
535  }
536  }
537  }
538  }
539 
540  inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
541  {
542  const unsigned char* itr = begin;
543  unsigned int loop = 0;
544  while (remaining_length >= 8)
545  {
546  const unsigned int& i1 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
547  const unsigned int& i2 = *(reinterpret_cast<const unsigned int*>(itr)); itr += sizeof(unsigned int);
548  hash ^= (hash << 7) ^ i1 * (hash >> 3) ^
549  (~((hash << 11) + (i2 ^ (hash >> 5))));
550  remaining_length -= 8;
551  }
552  if (remaining_length)
553  {
554  if (remaining_length >= 4)
555  {
556  const unsigned int& i = *(reinterpret_cast<const unsigned int*>(itr));
557  if (loop & 0x01)
558  hash ^= (hash << 7) ^ i * (hash >> 3);
559  else
560  hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
561  ++loop;
562  remaining_length -= 4;
563  itr += sizeof(unsigned int);
564  }
565  if (remaining_length >= 2)
566  {
567  const unsigned short& i = *(reinterpret_cast<const unsigned short*>(itr));
568  if (loop & 0x01)
569  hash ^= (hash << 7) ^ i * (hash >> 3);
570  else
571  hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
572  ++loop;
573  remaining_length -= 2;
574  itr += sizeof(unsigned short);
575  }
576  if (remaining_length)
577  {
578  hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop;
579  }
580  }
581  return hash;
582  }
583 
584 public:
585  std::vector<bloom_type> salt_;
586  std::vector<unsigned char> bit_table_;
587  unsigned int salt_count_;
588  unsigned long long int table_size_;
589  unsigned long long int raw_table_size_;
590  unsigned long long int projected_element_count_;
592  unsigned long long int random_seed_;
594 };
595 
597 {
598  bloom_filter result = a;
599  result &= b;
600  return result;
601 }
602 
604 {
605  bloom_filter result = a;
606  result |= b;
607  return result;
608 }
609 
611 {
612  bloom_filter result = a;
613  result ^= b;
614  return result;
615 }
616 
617 
618 } // namespace fc
619 
620 /*
621  Note 1:
622  If it can be guaranteed that bits_per_char will be of the form 2^n then
623  the following optimization can be used:
624 
625  hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
626 
627  Note 2:
628  For performance reasons where possible when allocating memory it should
629  be aligned (aligned_alloc) according to the architecture being used.
630 */
bloom_filter operator&(const bloom_filter &a, const bloom_filter &b)
void insert(const unsigned char *key_begin, const std::size_t &length)
void insert(const char *data, const std::size_t &length)
unsigned long long int raw_table_size_
bloom_filter operator|(const bloom_filter &a, const bloom_filter &b)
std::vector< unsigned char > bit_table_
void insert(const InputIterator begin, const InputIterator end)
bool operator!() const
std::vector< bloom_type > salt_
unsigned int inserted_element_count_
void generate_unique_salt()
InputIterator contains_none(const InputIterator begin, const InputIterator end) const
Defines types and macros used to provide reflection.
unsigned long long int random_seed_
unsigned long long int projected_element_count_
bloom_filter(const bloom_filter &filter)
unsigned int salt_count_
optimal_parameters_t optimal_parameters
virtual ~bloom_filter()
bool operator!=(const optional< T > &left, const optional< T > &right)
Definition: optional.hpp:253
unsigned long long int table_size_
double desired_false_positive_probability_
virtual bool compute_optimal_parameters()
bloom_filter operator^(const bloom_filter &a, const bloom_filter &b)
unsigned int bloom_type
bool operator==(const optional< T > &left, const optional< T > &right)
Definition: optional.hpp:245
double effective_fpp() const
void insert(const std::string &key)
unsigned char cell_type
bloom_filter(const bloom_parameters &p)
unsigned long long int minimum_size
unsigned int minimum_number_of_hashes
std::size_t hash_count()
std::size_t element_count() const
const cell_type * table() const
unsigned long long int projected_element_count
unsigned long long int maximum_size
bool contains(const T &t) const
virtual bool contains(const unsigned char *key_begin, const std::size_t length) const
typename impl::filter< Filter, list<>, List >::type filter
Get a list with all elements that do not pass a filter removed.
Definition: typelist.hpp:205
void insert(const T &t)
bloom_parameters(unsigned long long int projected_element_count, double false_positive_probability, unsigned long long int maximum_size)
InputIterator contains_all(const InputIterator begin, const InputIterator end) const
Definition: api.hpp:15
bloom_type hash_ap(const unsigned char *begin, std::size_t remaining_length, bloom_type hash) const
virtual unsigned long long int size() const
virtual void compute_indices(const bloom_type &hash, std::size_t &bit_index, std::size_t &bit) const
unsigned long long int random_seed
void copy(const path &from, const path &to)
Definition: filesystem.cpp:241
bool contains(const char *data, const std::size_t &length) const
virtual ~bloom_parameters()
unsigned int maximum_number_of_hashes
bool contains(const std::string &key) const