Blame


1 4545b700 2021-10-15 thomas /*
2 4545b700 2021-10-15 thomas * Copyright (c) 2012-2019, Jyri J. Virkki
3 4545b700 2021-10-15 thomas * All rights reserved.
4 4545b700 2021-10-15 thomas *
5 4545b700 2021-10-15 thomas * Redistribution and use in source and binary forms, with or without
6 4545b700 2021-10-15 thomas * modification, are permitted provided that the following conditions are
7 4545b700 2021-10-15 thomas * met:
8 4545b700 2021-10-15 thomas *
9 4545b700 2021-10-15 thomas * 1. Redistributions of source code must retain the above copyright
10 4545b700 2021-10-15 thomas * notice, this list of conditions and the following disclaimer.
11 4545b700 2021-10-15 thomas *
12 4545b700 2021-10-15 thomas * 2. Redistributions in binary form must reproduce the above copyright
13 4545b700 2021-10-15 thomas * notice, this list of conditions and the following disclaimer in the
14 4545b700 2021-10-15 thomas * documentation and/or other materials provided with the distribution.
15 4545b700 2021-10-15 thomas *
16 4545b700 2021-10-15 thomas * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 4545b700 2021-10-15 thomas * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 4545b700 2021-10-15 thomas * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 4545b700 2021-10-15 thomas * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 4545b700 2021-10-15 thomas * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 4545b700 2021-10-15 thomas * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 4545b700 2021-10-15 thomas * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 4545b700 2021-10-15 thomas * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 4545b700 2021-10-15 thomas * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 4545b700 2021-10-15 thomas * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 4545b700 2021-10-15 thomas * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 4545b700 2021-10-15 thomas */
28 4545b700 2021-10-15 thomas
29 4545b700 2021-10-15 thomas /* Obtained from https://github.com/jvirkki/libbloom */
30 4545b700 2021-10-15 thomas
31 4545b700 2021-10-15 thomas /*
32 4545b700 2021-10-15 thomas * Refer to bloom.h for documentation on the public interfaces.
33 4545b700 2021-10-15 thomas */
34 4545b700 2021-10-15 thomas
35 4545b700 2021-10-15 thomas #include <assert.h>
36 4545b700 2021-10-15 thomas #include <fcntl.h>
37 4545b700 2021-10-15 thomas #include <math.h>
38 4545b700 2021-10-15 thomas #include <stdint.h>
39 4545b700 2021-10-15 thomas #include <stdio.h>
40 4545b700 2021-10-15 thomas #include <stdlib.h>
41 4545b700 2021-10-15 thomas #include <string.h>
42 4545b700 2021-10-15 thomas #include <sys/stat.h>
43 4545b700 2021-10-15 thomas #include <sys/types.h>
44 4545b700 2021-10-15 thomas #include <unistd.h>
45 4545b700 2021-10-15 thomas
46 4545b700 2021-10-15 thomas #include "bloom.h"
47 4545b700 2021-10-15 thomas #include "murmurhash2.h"
48 4545b700 2021-10-15 thomas
49 4545b700 2021-10-15 thomas #define MAKESTRING(n) STRING(n)
50 4545b700 2021-10-15 thomas #define STRING(n) #n
51 4545b700 2021-10-15 thomas
52 4545b700 2021-10-15 thomas
53 4545b700 2021-10-15 thomas inline static int test_bit_set_bit(unsigned char * buf,
54 4545b700 2021-10-15 thomas unsigned int x, int set_bit)
55 4545b700 2021-10-15 thomas {
56 4545b700 2021-10-15 thomas unsigned int byte = x >> 3;
57 4545b700 2021-10-15 thomas unsigned char c = buf[byte]; // expensive memory access
58 4545b700 2021-10-15 thomas unsigned int mask = 1 << (x % 8);
59 4545b700 2021-10-15 thomas
60 4545b700 2021-10-15 thomas if (c & mask) {
61 4545b700 2021-10-15 thomas return 1;
62 4545b700 2021-10-15 thomas } else {
63 4545b700 2021-10-15 thomas if (set_bit) {
64 4545b700 2021-10-15 thomas buf[byte] = c | mask;
65 4545b700 2021-10-15 thomas }
66 4545b700 2021-10-15 thomas return 0;
67 4545b700 2021-10-15 thomas }
68 4545b700 2021-10-15 thomas }
69 4545b700 2021-10-15 thomas
70 4545b700 2021-10-15 thomas
71 4545b700 2021-10-15 thomas static int bloom_check_add(struct bloom * bloom,
72 4545b700 2021-10-15 thomas const void * buffer, int len, int add)
73 4545b700 2021-10-15 thomas {
74 4545b700 2021-10-15 thomas if (bloom->ready == 0) {
75 4545b700 2021-10-15 thomas printf("bloom at %p not initialized!\n", (void *)bloom);
76 4545b700 2021-10-15 thomas return -1;
77 4545b700 2021-10-15 thomas }
78 4545b700 2021-10-15 thomas
79 4545b700 2021-10-15 thomas int hits = 0;
80 cc524d36 2022-05-31 thomas register unsigned int a = murmurhash2(buffer, len, bloom->seed);
81 4545b700 2021-10-15 thomas register unsigned int b = murmurhash2(buffer, len, a);
82 4545b700 2021-10-15 thomas register unsigned int x;
83 4545b700 2021-10-15 thomas register unsigned int i;
84 4545b700 2021-10-15 thomas
85 4545b700 2021-10-15 thomas for (i = 0; i < bloom->hashes; i++) {
86 4545b700 2021-10-15 thomas x = (a + i*b) % bloom->bits;
87 4545b700 2021-10-15 thomas if (test_bit_set_bit(bloom->bf, x, add)) {
88 4545b700 2021-10-15 thomas hits++;
89 4545b700 2021-10-15 thomas } else if (!add) {
90 4545b700 2021-10-15 thomas // Don't care about the presence of all the bits. Just our own.
91 4545b700 2021-10-15 thomas return 0;
92 4545b700 2021-10-15 thomas }
93 4545b700 2021-10-15 thomas }
94 4545b700 2021-10-15 thomas
95 4545b700 2021-10-15 thomas if (hits == bloom->hashes) {
96 4545b700 2021-10-15 thomas return 1; // 1 == element already in (or collision)
97 4545b700 2021-10-15 thomas }
98 4545b700 2021-10-15 thomas
99 4545b700 2021-10-15 thomas return 0;
100 4545b700 2021-10-15 thomas }
101 4545b700 2021-10-15 thomas
102 4545b700 2021-10-15 thomas
103 4545b700 2021-10-15 thomas int bloom_init_size(struct bloom * bloom, int entries, double error,
104 4545b700 2021-10-15 thomas unsigned int cache_size)
105 4545b700 2021-10-15 thomas {
106 4545b700 2021-10-15 thomas return bloom_init(bloom, entries, error);
107 4545b700 2021-10-15 thomas }
108 4545b700 2021-10-15 thomas
109 4545b700 2021-10-15 thomas
110 4545b700 2021-10-15 thomas int bloom_init(struct bloom * bloom, int entries, double error)
111 4545b700 2021-10-15 thomas {
112 4545b700 2021-10-15 thomas bloom->ready = 0;
113 4545b700 2021-10-15 thomas
114 cc524d36 2022-05-31 thomas bloom->seed = arc4random();
115 cc524d36 2022-05-31 thomas
116 4545b700 2021-10-15 thomas if (entries < 1000 || error == 0) {
117 4545b700 2021-10-15 thomas return 1;
118 4545b700 2021-10-15 thomas }
119 4545b700 2021-10-15 thomas
120 4545b700 2021-10-15 thomas bloom->entries = entries;
121 4545b700 2021-10-15 thomas bloom->error = error;
122 4545b700 2021-10-15 thomas
123 4545b700 2021-10-15 thomas double num = log(bloom->error);
124 4545b700 2021-10-15 thomas double denom = 0.480453013918201; // ln(2)^2
125 4545b700 2021-10-15 thomas bloom->bpe = -(num / denom);
126 4545b700 2021-10-15 thomas
127 4545b700 2021-10-15 thomas double dentries = (double)entries;
128 4545b700 2021-10-15 thomas bloom->bits = (int)(dentries * bloom->bpe);
129 4545b700 2021-10-15 thomas
130 4545b700 2021-10-15 thomas if (bloom->bits % 8) {
131 4545b700 2021-10-15 thomas bloom->bytes = (bloom->bits / 8) + 1;
132 4545b700 2021-10-15 thomas } else {
133 4545b700 2021-10-15 thomas bloom->bytes = bloom->bits / 8;
134 4545b700 2021-10-15 thomas }
135 4545b700 2021-10-15 thomas
136 4545b700 2021-10-15 thomas bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe); // ln(2)
137 4545b700 2021-10-15 thomas
138 4545b700 2021-10-15 thomas bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char));
139 4545b700 2021-10-15 thomas if (bloom->bf == NULL) { // LCOV_EXCL_START
140 4545b700 2021-10-15 thomas return 1;
141 4545b700 2021-10-15 thomas } // LCOV_EXCL_STOP
142 4545b700 2021-10-15 thomas
143 4545b700 2021-10-15 thomas bloom->ready = 1;
144 4545b700 2021-10-15 thomas return 0;
145 4545b700 2021-10-15 thomas }
146 4545b700 2021-10-15 thomas
147 4545b700 2021-10-15 thomas
148 4545b700 2021-10-15 thomas int bloom_check(struct bloom * bloom, const void * buffer, int len)
149 4545b700 2021-10-15 thomas {
150 4545b700 2021-10-15 thomas return bloom_check_add(bloom, buffer, len, 0);
151 4545b700 2021-10-15 thomas }
152 4545b700 2021-10-15 thomas
153 4545b700 2021-10-15 thomas
154 4545b700 2021-10-15 thomas int bloom_add(struct bloom * bloom, const void * buffer, int len)
155 4545b700 2021-10-15 thomas {
156 4545b700 2021-10-15 thomas return bloom_check_add(bloom, buffer, len, 1);
157 4545b700 2021-10-15 thomas }
158 4545b700 2021-10-15 thomas
159 4545b700 2021-10-15 thomas
160 4545b700 2021-10-15 thomas void bloom_print(struct bloom * bloom)
161 4545b700 2021-10-15 thomas {
162 4545b700 2021-10-15 thomas printf("bloom at %p\n", (void *)bloom);
163 4545b700 2021-10-15 thomas printf(" ->entries = %d\n", bloom->entries);
164 4545b700 2021-10-15 thomas printf(" ->error = %f\n", bloom->error);
165 4545b700 2021-10-15 thomas printf(" ->bits = %d\n", bloom->bits);
166 4545b700 2021-10-15 thomas printf(" ->bits per elem = %f\n", bloom->bpe);
167 4545b700 2021-10-15 thomas printf(" ->bytes = %d\n", bloom->bytes);
168 4545b700 2021-10-15 thomas printf(" ->hash functions = %d\n", bloom->hashes);
169 4545b700 2021-10-15 thomas }
170 4545b700 2021-10-15 thomas
171 4545b700 2021-10-15 thomas
172 4545b700 2021-10-15 thomas void bloom_free(struct bloom * bloom)
173 4545b700 2021-10-15 thomas {
174 4545b700 2021-10-15 thomas if (bloom->ready) {
175 4545b700 2021-10-15 thomas free(bloom->bf);
176 4545b700 2021-10-15 thomas }
177 4545b700 2021-10-15 thomas bloom->ready = 0;
178 4545b700 2021-10-15 thomas }
179 4545b700 2021-10-15 thomas
180 4545b700 2021-10-15 thomas
181 4545b700 2021-10-15 thomas int bloom_reset(struct bloom * bloom)
182 4545b700 2021-10-15 thomas {
183 4545b700 2021-10-15 thomas if (!bloom->ready) return 1;
184 4545b700 2021-10-15 thomas memset(bloom->bf, 0, bloom->bytes);
185 4545b700 2021-10-15 thomas return 0;
186 4545b700 2021-10-15 thomas }