summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--bloom.c38
-rw-r--r--bloom.h63
-rw-r--r--t/helper/test-bloom.c48
-rwxr-xr-xt/t0095-bloom.sh40
4 files changed, 188 insertions, 1 deletions
diff --git a/bloom.c b/bloom.c
index 40e87632ae..888b67f1ea 100644
--- a/bloom.c
+++ b/bloom.c
@@ -8,6 +8,11 @@ static uint32_t rotate_left(uint32_t value, int32_t count)
return ((value << count) | (value >> ((-count) & mask)));
}
+static inline unsigned char get_bitmask(uint32_t pos)
+{
+ return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
+}
+
/*
* Calculate the murmur3 32-bit hash value for the given data
* using the given seed.
@@ -70,4 +75,35 @@ uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
seed ^= (seed >> 16);
return seed;
-} \ No newline at end of file
+}
+
+void fill_bloom_key(const char *data,
+ size_t len,
+ struct bloom_key *key,
+ const struct bloom_filter_settings *settings)
+{
+ int i;
+ const uint32_t seed0 = 0x293ae76f;
+ const uint32_t seed1 = 0x7e646e2c;
+ const uint32_t hash0 = murmur3_seeded(seed0, data, len);
+ const uint32_t hash1 = murmur3_seeded(seed1, data, len);
+
+ key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
+ for (i = 0; i < settings->num_hashes; i++)
+ key->hashes[i] = hash0 + i * hash1;
+}
+
+void add_key_to_filter(const struct bloom_key *key,
+ struct bloom_filter *filter,
+ const struct bloom_filter_settings *settings)
+{
+ int i;
+ uint64_t mod = filter->len * BITS_PER_WORD;
+
+ for (i = 0; i < settings->num_hashes; i++) {
+ uint64_t hash_mod = key->hashes[i] % mod;
+ uint64_t block_pos = hash_mod / BITS_PER_WORD;
+
+ filter->data[block_pos] |= get_bitmask(hash_mod);
+ }
+}
diff --git a/bloom.h b/bloom.h
index d0fcc5f0aa..b9ce422ca2 100644
--- a/bloom.h
+++ b/bloom.h
@@ -1,6 +1,60 @@
#ifndef BLOOM_H
#define BLOOM_H
+struct bloom_filter_settings {
+ /*
+ * The version of the hashing technique being used.
+ * We currently only support version = 1 which is
+ * the seeded murmur3 hashing technique implemented
+ * in bloom.c.
+ */
+ uint32_t hash_version;
+
+ /*
+ * The number of times a path is hashed, i.e. the
+ * number of bit positions tht cumulatively
+ * determine whether a path is present in the
+ * Bloom filter.
+ */
+ uint32_t num_hashes;
+
+ /*
+ * The minimum number of bits per entry in the Bloom
+ * filter. If the filter contains 'n' entries, then
+ * filter size is the minimum number of 8-bit words
+ * that contain n*b bits.
+ */
+ uint32_t bits_per_entry;
+};
+
+#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 }
+#define BITS_PER_WORD 8
+
+/*
+ * A bloom_filter struct represents a data segment to
+ * use when testing hash values. The 'len' member
+ * dictates how many entries are stored in
+ * 'data'.
+ */
+struct bloom_filter {
+ unsigned char *data;
+ size_t len;
+};
+
+/*
+ * A bloom_key represents the k hash values for a
+ * given string. These can be precomputed and
+ * stored in a bloom_key for re-use when testing
+ * against a bloom_filter. The number of hashes is
+ * given by the Bloom filter settings and is the same
+ * for all Bloom filters and keys interacting with
+ * the loaded version of the commit graph file and
+ * the Bloom data chunks.
+ */
+struct bloom_key {
+ uint32_t *hashes;
+};
+
/*
* Calculate the murmur3 32-bit hash value for the given data
* using the given seed.
@@ -10,4 +64,13 @@
*/
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
+void fill_bloom_key(const char *data,
+ size_t len,
+ struct bloom_key *key,
+ const struct bloom_filter_settings *settings);
+
+void add_key_to_filter(const struct bloom_key *key,
+ struct bloom_filter *filter,
+ const struct bloom_filter_settings *settings);
+
#endif \ No newline at end of file
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index 60ee204368..20460cde77 100644
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c
@@ -2,6 +2,36 @@
#include "bloom.h"
#include "test-tool.h"
+struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+
+static void add_string_to_filter(const char *data, struct bloom_filter *filter) {
+ struct bloom_key key;
+ int i;
+
+ fill_bloom_key(data, strlen(data), &key, &settings);
+ printf("Hashes:");
+ for (i = 0; i < settings.num_hashes; i++){
+ printf("0x%08x|", key.hashes[i]);
+ }
+ printf("\n");
+ add_key_to_filter(&key, filter, &settings);
+}
+
+static void print_bloom_filter(struct bloom_filter *filter) {
+ int i;
+
+ if (!filter) {
+ printf("No filter.\n");
+ return;
+ }
+ printf("Filter_Length:%d\n", (int)filter->len);
+ printf("Filter_Data:");
+ for (i = 0; i < filter->len; i++){
+ printf("%02x|", filter->data[i]);
+ }
+ printf("\n");
+}
+
int cmd__bloom(int argc, const char **argv)
{
if (!strcmp(argv[1], "get_murmur3")) {
@@ -9,5 +39,23 @@ int cmd__bloom(int argc, const char **argv)
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
}
+ if (!strcmp(argv[1], "generate_filter")) {
+ struct bloom_filter filter;
+ int i = 2;
+ filter.len = (settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+ filter.data = xcalloc(filter.len, sizeof(unsigned char));
+
+ if (!argv[2]){
+ die("at least one input string expected");
+ }
+
+ while (argv[i]) {
+ add_string_to_filter(argv[i], &filter);
+ i++;
+ }
+
+ print_bloom_filter(&filter);
+ }
+
return 0;
} \ No newline at end of file
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
index 2dad8c4a94..36a086c7c6 100755
--- a/t/t0095-bloom.sh
+++ b/t/t0095-bloom.sh
@@ -27,4 +27,44 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' '
test_cmp expect actual
'
+test_expect_success 'compute bloom key for empty string' '
+ cat >expect <<-\EOF &&
+ Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004|
+ Filter_Length:2
+ Filter_Data:11|11|
+ EOF
+ test-tool bloom generate_filter "" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for whitespace' '
+ cat >expect <<-\EOF &&
+ Hashes:0xf178874c|0x5f3d6eb6|0xcd025620|0x3ac73d8a|0xa88c24f4|0x16510c5e|0x8415f3c8|
+ Filter_Length:2
+ Filter_Data:51|55|
+ EOF
+ test-tool bloom generate_filter " " >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for test string 1' '
+ cat >expect <<-\EOF &&
+ Hashes:0xb270de9b|0x1bb6f26e|0x84fd0641|0xee431a14|0x57892de7|0xc0cf41ba|0x2a15558d|
+ Filter_Length:2
+ Filter_Data:92|6c|
+ EOF
+ test-tool bloom generate_filter "Hello world!" >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'compute bloom key for test string 2' '
+ cat >expect <<-\EOF &&
+ Hashes:0x20ab385b|0xf5237fe2|0xc99bc769|0x9e140ef0|0x728c5677|0x47049dfe|0x1b7ce585|
+ Filter_Length:2
+ Filter_Data:a5|4a|
+ EOF
+ test-tool bloom generate_filter "file.txt" >actual &&
+ test_cmp expect actual
+'
+
test_done \ No newline at end of file