git » nmdb » commit cf06a74

nmdb: Change the cache hash function to Austin Appleby's MurmurHash2

author Alberto Bertogli
2010-04-15 21:13:23 UTC
committer Alberto Bertogli
2010-04-15 22:12:36 UTC
parent 8b4ab9201633c17412bd96cabb5c25c311489628

nmdb: Change the cache hash function to Austin Appleby's MurmurHash2

After testing several hash functions, the one used in the cache was
changed to Austin Appleby's MurmurHash2.

MurmurHash2 is much faster than the one we use (10-25% in our benchmarks),
has nice and clean code, and is in the public domain.

We keep the old Jenkins' one-at-a-time function, and also add FNV and
Paul Hsieh's SuperFastHash, so the tests can be redone in the future if
needed.

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>

nmdb/cache.c +1 -28
nmdb/hash.h +164 -0

diff --git a/nmdb/cache.c b/nmdb/cache.c
index e8fe449..5d1e2c8 100644
--- a/nmdb/cache.c
+++ b/nmdb/cache.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>		/* for malloc() */
 #include <string.h>		/* for memcpy()/memcmp() */
 #include <stdio.h>		/* snprintf() */
+#include "hash.h"		/* hash() */
 #include "cache.h"
 
 
@@ -101,34 +102,6 @@ static void free_entry(struct cache_entry *e)
 	e->vsize = -1;
 }
 
-/*
- * The hash function used is the "One at a time" function, which seems simple,
- * fast and popular. Others for future consideration if speed becomes an issue
- * include:
- *  * FNV Hash (http://www.isthe.com/chongo/tech/comp/fnv/)
- *  * SuperFastHash (http://www.azillionmonkeys.com/qed/hash.html)
- *  * Judy dynamic arrays (http://judy.sf.net)
- *
- * A good comparison can be found at
- * http://eternallyconfuzzled.com/tuts/hashing.html#existing
- */
-
-static uint32_t hash(const unsigned char *key, const size_t ksize)
-{
-	uint32_t h = 0;
-	size_t i;
-
-	for (i = 0; i < ksize; i++) {
-		h += key[i];
-		h += (h << 10);
-		h ^= (h >> 6);
-	}
-	h += (h << 3);
-	h ^= (h >> 11);
-	h += (h << 15);
-	return h;
-}
-
 
 /* Looks up the given key in the chain. Returns NULL if not found, or a
  * pointer to the cache entry if it is. The chain can be empty. */
diff --git a/nmdb/hash.h b/nmdb/hash.h
new file mode 100644
index 0000000..706c5a6
--- /dev/null
+++ b/nmdb/hash.h
@@ -0,0 +1,164 @@
+
+#ifndef _HASH_H
+#define _HASH_H
+
+/*
+ * Hash function used in the cache.
+ *
+ * This is kept here instead of a .c file so the compiler can inline if it
+ * decides it's worth it.
+ *
+ * The hash function used is Austin Appleby's MurmurHash2. It used to be
+ * Jenkins's one-at-a-time, but this one is much faster. However, we keep
+ * the others that were tested for future comparison purposes.
+ */
+#define hash(k, s) murmurhash2(k, s)
+
+/* MurmurHash2, by Austin Appleby. The one we use.
+ * It has been modify to fit into the coding style, to work on uint32_t
+ * instead of ints, and the seed was fixed to a random number because it's not
+ * an issue for us. The author placed it in the public domain, so it's safe.
+ * http://sites.google.com/site/murmurhash/ */
+static uint32_t murmurhash2(const unsigned char *key, size_t len)
+{
+	const uint32_t m = 0x5bd1e995;
+	const int r = 24;
+	const uint32_t seed = 0x34a4b627;
+
+	// Initialize the hash to a 'random' value
+	uint32_t h = seed ^ len;
+
+	// Mix 4 bytes at a time into the hash
+	while (len >= 4) {
+		uint32_t k = *(uint32_t *) key;
+
+		k *= m;
+		k ^= k >> r;
+		k *= m;
+
+		h *= m;
+		h ^= k;
+
+		key += 4;
+		len -= 4;
+	}
+
+	// Handle the last few bytes of the input array
+	switch (len) {
+		case 3: h ^= key[2] << 16;
+		case 2: h ^= key[1] << 8;
+		case 1: h ^= key[0];
+			h *= m;
+	}
+
+	// Do a few final mixes of the hash to ensure the last few
+	// bytes are well-incorporated.
+	h ^= h >> 13;
+	h *= m;
+	h ^= h >> 15;
+
+	return h;
+}
+
+
+/* Unused functions, left for comparison purposes */
+#if 0
+
+/* Paul Hsieh's SuperFastHash, which is really fast, but a tad slower than
+ * MurmurHash2.
+ * http://www.azillionmonkeys.com/qed/hash.html */
+static uint32_t superfast(const unsigned char *data, size_t len)
+{
+	uint32_t h = len, tmp;
+	int rem;
+
+	/* this is the same as (*((const uint16_t *) (d))) on little endians,
+	 * but we keep the generic version since gcc compiles decent code for
+	 * both and there's no noticeable difference in performance */
+	#define get16bits(d) ( \
+		( ( (uint32_t) ( ((const uint8_t *)(d))[1] ) ) << 8 ) \
+		+ (uint32_t) ( ((const uint8_t *)(d))[0] ) )
+
+	if (len <= 0 || data == NULL)
+		return 0;
+
+	rem = len & 3;
+	len >>= 2;
+
+	/* Main loop */
+	for (; len > 0; len--) {
+		h += get16bits(data);
+		tmp = (get16bits(data+2) << 11) ^ h;
+		h = (h << 16) ^ tmp;
+		data += 2*sizeof (uint16_t);
+		h += h >> 11;
+	}
+
+	/* Handle end cases */
+	switch (rem) {
+		case 3: h += get16bits(data);
+			h ^= h << 16;
+			h ^= data[sizeof(uint16_t)] << 18;
+			h += h >> 11;
+			break;
+		case 2: h += get16bits(data);
+			h ^= h << 11;
+			h += h >> 17;
+			break;
+		case 1: h += *data;
+			h ^= h << 10;
+			h += h >> 1;
+	}
+
+	/* Force "avalanching" of final 127 bits */
+	h ^= h << 3;
+	h += h >> 5;
+	h ^= h << 4;
+	h += h >> 17;
+	h ^= h << 25;
+	h += h >> 6;
+
+	return h;
+
+	#undef get16bits
+}
+
+/* Jenkins' one-at-a-time hash, the one we used to use.
+ * http://en.wikipedia.org/wiki/Jenkins_hash_function */
+static uint32_t oneatatime(const unsigned char *key, const size_t ksize)
+{
+	uint32_t h = 0;
+	size_t i;
+
+	for (i = 0; i < ksize; i++) {
+		h += key[i];
+		h += (h << 10);
+		h ^= (h >> 6);
+	}
+	h += (h << 3);
+	h ^= (h >> 11);
+	h += (h << 15);
+	return h;
+}
+
+/* FNV 32 bit hash; slower than the rest.
+ * http://www.isthe.com/chongo/tech/comp/fnv/ */
+static uint32_t fnv_hash(const unsigned char *key, const size_t ksize)
+{
+	const unsigned char *end = key + ksize;
+	uint32_t hval;
+
+	hval = 0x811c9dc5;
+	while (key < end) {
+		hval += (hval<<1) + (hval<<4) + (hval<<7) +
+			(hval<<8) + (hval<<24);
+		hval ^= (uint32_t) *key++;
+	}
+
+	return hval;
+}
+
+#endif
+
+#endif
+