summaryrefslogtreecommitdiffstats
path: root/ewah/ewah_rlw.c
diff options
context:
space:
mode:
author: Vicent Marti <tanoku@gmail.com> 2013-11-14 13:43:51 +0100
committer: Junio C Hamano <gitster@pobox.com> 2013-12-30 21:17:20 +0100
commit: e1273106f62927e3efdb1cfa107cb1a9f913274c (patch)
tree: 9a23af0dbec1791e1a0d8b3137e614744ee14f19 /ewah/ewah_rlw.c
parent: compat: add endianness helpers (diff)
download: git-e1273106f62927e3efdb1cfa107cb1a9f913274c.tar.xz
git-e1273106f62927e3efdb1cfa107cb1a9f913274c.zip
ewah: compressed bitmap implementation
EWAH is a word-aligned compressed variant of a bitset (i.e. a data structure that acts as a 0-indexed boolean array for many entries).

It uses a 64-bit run-length encoding (RLE) compression scheme, trading some compression for better processing speed. The goal of this word-aligned implementation is not to achieve the best compression, but rather to improve query processing time. As it stands right now, this EWAH implementation will always be more efficient storage-wise than its uncompressed alternative.

EWAH arrays will be used as the on-disk format to store reachability bitmaps for all objects in a repository while keeping reasonable sizes, in the same way that JGit does.

This EWAH implementation is a mostly straightforward port of the original `javaewah` library that JGit currently uses. The library is self-contained and has been embedded whole (4 files) inside the `ewah` folder to ease redistribution.

The library is re-licensed under the GPLv2 with the permission of Daniel Lemire, the original author. The source code for the C version can be found on GitHub:

https://github.com/vmg/libewok

The original Java implementation can also be found on GitHub:

https://github.com/lemire/javaewah

[jc: stripped debug-only code per Peff's $gmane/239768]

Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Helped-by: Ramsay Jones <ramsay@ramsay1.demon.co.uk>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'ewah/ewah_rlw.c')
-rw-r--r--ewah/ewah_rlw.c115
1 files changed, 115 insertions, 0 deletions
diff --git a/ewah/ewah_rlw.c b/ewah/ewah_rlw.c
new file mode 100644
index 0000000000..c723f1aefd
--- /dev/null
+++ b/ewah/ewah_rlw.c
@@ -0,0 +1,115 @@
+/**
+ * Copyright 2013, GitHub, Inc
+ * Copyright 2009-2013, Daniel Lemire, Cliff Moon,
+ * David McIntosh, Robert Becho, Google Inc. and Veronika Zenz
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "git-compat-util.h"
+#include "ewok.h"
+#include "ewok_rlw.h"
+
+/*
+ * Load the running-length word found at it->pointer into it->rlw.
+ *
+ * Returns 0 without touching the iterator when it->pointer is at or
+ * past it->size. Otherwise the word's run bit, running length and
+ * literal-word count are cached in it->rlw, it->pointer is moved past
+ * the word and the literal words it announces, and 1 is returned.
+ */
+static inline int next_word(struct rlw_iterator *it)
+{
+	if (it->pointer < it->size) {
+		it->rlw.word = it->buffer + it->pointer;
+
+		/* Skip the marker word itself plus the literals it announces. */
+		it->pointer += 1 + rlw_get_literal_words(it->rlw.word);
+
+		it->rlw.literal_word_offset = 0;
+		it->rlw.running_bit = rlw_get_run_bit(it->rlw.word);
+		it->rlw.running_len = rlw_get_running_len(it->rlw.word);
+		it->rlw.literal_words = rlw_get_literal_words(it->rlw.word);
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Point the iterator `it` at the word buffer of `from_ewah`, load the
+ * first running-length word and record where its literal words begin.
+ *
+ * The result of next_word() is not checked here.
+ */
+void rlwit_init(struct rlw_iterator *it, struct ewah_bitmap *from_ewah)
+{
+	it->pointer = 0;
+	it->size = from_ewah->buffer_size;
+	it->buffer = from_ewah->buffer;
+
+	next_word(it);
+
+	it->literal_word_start =
+		it->rlw.literal_word_offset + rlwit_literal_words(it);
+}
+
+/*
+ * Drop `x` words from the front of the iterator.
+ *
+ * Words are taken from the current running length first, then from the
+ * current literal words; when the current word is used up (or more
+ * words remain to be dropped) the next running-length word is loaded.
+ * The function returns early if next_word() reports that there is
+ * nothing left to load.
+ */
+void rlwit_discard_first_words(struct rlw_iterator *it, size_t x)
+{
+	while (x != 0) {
+		size_t from_literals;
+
+		/* The run alone covers the request: shorten it and stop. */
+		if (it->rlw.running_len > x) {
+			it->rlw.running_len -= x;
+			return;
+		}
+
+		/* Use up the whole run. */
+		x -= it->rlw.running_len;
+		it->rlw.running_len = 0;
+
+		/* Take as many literal words as are both wanted and available. */
+		from_literals = x;
+		if (x > it->rlw.literal_words)
+			from_literals = it->rlw.literal_words;
+
+		it->literal_word_start += from_literals;
+		it->rlw.literal_words -= from_literals;
+		x -= from_literals;
+
+		/* Request satisfied and the current word still has content. */
+		if (x == 0 && rlwit_word_size(it) != 0)
+			return;
+
+		if (!next_word(it))
+			return;
+
+		it->literal_word_start =
+			rlwit_literal_words(it) + it->rlw.literal_word_offset;
+	}
+}
+
+/*
+ * Move up to `max` words from the iterator `it` into the bitmap `out`.
+ *
+ * For each current word, its run is handed to ewah_add_empty_words()
+ * (with the run bit XORed with `negate`) and its literal words are
+ * handed to ewah_add_dirty_words() (with `negate` passed through);
+ * both counts are clipped so that the total never exceeds `max`. The
+ * words handed over are then discarded from the iterator.
+ *
+ * Returns the number of words handed to `out`.
+ */
+size_t rlwit_discharge(
+	struct rlw_iterator *it, struct ewah_bitmap *out, size_t max, int negate)
+{
+	size_t emitted = 0;
+
+	while (emitted < max && rlwit_word_size(it) > 0) {
+		size_t run_words = it->rlw.running_len;
+		size_t literal_count;
+
+		/* Clip the run to the remaining budget. */
+		if (emitted + run_words > max)
+			run_words = max - emitted;
+
+		ewah_add_empty_words(out, negate ^ it->rlw.running_bit, run_words);
+		emitted += run_words;
+
+		/* Clip the literal words to whatever budget is left after the run. */
+		literal_count = it->rlw.literal_words;
+		if (literal_count + emitted > max)
+			literal_count = max - emitted;
+
+		ewah_add_dirty_words(out,
+			&it->buffer[it->literal_word_start], literal_count, negate);
+
+		emitted += literal_count;
+		rlwit_discard_first_words(it, literal_count + run_words);
+	}
+
+	return emitted;
+}
+
+/*
+ * Drain the iterator `it`: while its current word has a non-zero size,
+ * add that many empty words with bit value 0 to `out` and discard the
+ * same number of words from the iterator.
+ */
+void rlwit_discharge_empty(struct rlw_iterator *it, struct ewah_bitmap *out)
+{
+	for (;;) {
+		if (rlwit_word_size(it) <= 0)
+			break;
+
+		ewah_add_empty_words(out, 0, rlwit_word_size(it));
+		rlwit_discard_first_words(it, rlwit_word_size(it));
+	}
+}