libfiu commit 20cd735

libfiu: Improve/fix handling of deleted elements in the hash table

author Alberto Bertogli
2018-09-30 21:44:29 UTC
committer Alberto Bertogli
2018-10-14 20:03:52 UTC
parent ad14b3c9635f851a12254f7808b6f2182ee41a31

libfiu: Improve/fix handling of deleted elements in the hash table

There are a few problems with the current way deleted items are handled
in the hash table, including:

- We can reuse them during hash_set(), which can lead to bugs such as
  duplicate and/or phantom entries (a toy reproduction follows below).
- The resizing logic is suboptimal because the removed elements are
  counted as valid entries (this causes unnecessary grows and shrinks,
  but it's otherwise harmless).
- Possible infinite loops when deleting entries.

Note the tests in later patches trigger most of these conditions.
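
To make the reuse bug concrete, here is a self-contained toy (not libfiu
code; toy_hash, old_set and old_del are hypothetical names) that mimics
the old "entry->in_use != IN_USE" insertion condition, with a contrived
hash so every key lands in the same probe chain:

#include <stdio.h>
#include <string.h>

enum state { NEVER, IN_USE, REMOVED };
struct slot { enum state st; const char *key; };

#define SIZE 4
static struct slot tbl[SIZE];

/* Contrived hash: every key lands in bucket 0, forcing probe chains. */
static unsigned toy_hash(const char *key) { (void) key; return 0; }

/* Old behavior: take any slot that is not IN_USE (including REMOVED)
 * without first checking whether the key already exists further along
 * the probe chain. */
static void old_set(const char *key)
{
	unsigned pos = toy_hash(key) % SIZE;
	for (int i = 0; i < SIZE; i++) {
		struct slot *s = &tbl[pos];
		if (s->st != IN_USE) {
			s->st = IN_USE;
			s->key = key;
			return;
		} else if (strcmp(s->key, key) == 0) {
			return;	/* already present */
		}
		pos = (pos + 1) % SIZE;
	}
}

static void old_del(const char *key)
{
	unsigned pos = toy_hash(key) % SIZE;
	for (int i = 0; i < SIZE; i++) {
		struct slot *s = &tbl[pos];
		if (s->st == IN_USE && strcmp(s->key, key) == 0) {
			s->st = REMOVED;
			s->key = NULL;
			return;
		}
		pos = (pos + 1) % SIZE;
	}
}

int main(void)
{
	old_set("A");	/* slot 0 */
	old_set("B");	/* probes past "A", lands in slot 1 */
	old_del("A");	/* slot 0 becomes REMOVED */
	old_set("B");	/* reuses slot 0: "B" is now duplicated */
	old_del("B");	/* removes only the first copy... */

	/* ...so the phantom "B" in slot 1 is still reachable: */
	for (int i = 0; i < SIZE; i++)
		if (tbl[i].st == IN_USE)
			printf("slot %d: %s\n", i, tbl[i].key);
	return 0;
}

The final loop prints "slot 1: B": the delete reported success, yet the
key is still found. Refusing to take REMOVED slots during insertion is
what rules this sequence out.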

This patch fixes those bugs by:
- Never reusing removed entries.
- Improving the resize logic to account for removed entries separately
  (a worked example of the new thresholds follows this list).
- Avoiding potential infinite loops by bounding the probe loops (for
  defense in depth).
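
The new resize policy (auto_resize_table() in the diff below) always
resizes to twice the number of live entries. A minimal sketch of its two
thresholds, omitting the MIN_SIZE guard for brevity; the numbers in the
comments are illustrative, not taken from the patch:

#include <stdbool.h>
#include <stddef.h>

static bool needs_resize(size_t table_size, size_t nentries,
		size_t nremoved)
{
	/* Slots never taken by either a live or a removed entry. */
	float usable = (float) (table_size - nentries - nremoved)
		/ table_size;
	/* Slots holding live entries. */
	float used = (float) nentries / table_size;
	return usable < 0.3 || used < 0.3;
}

/* Example: needs_resize(64, 16, 30) is true (usable = 18/64 ~ 0.28).
 * Resizing to nentries * 2 = 32 rebuilds the table and drops every
 * tombstone, so needs_resize(32, 16, 0) is false: usable and used are
 * both 0.50. That is why "resize to 2 * nentries" restores the
 * "at least 30% usable" property both when growing and shrinking. */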

libfiu/hash.c +82 -34

diff --git a/libfiu/hash.c b/libfiu/hash.c
index e8b037d..45a9b9b 100644
--- a/libfiu/hash.c
+++ b/libfiu/hash.c
@@ -80,6 +80,7 @@ struct hash {
 	struct entry *entries;
 	size_t table_size;
 	size_t nentries;
+	size_t nremoved;
 	void (*destructor)(void *);
 };
 
@@ -109,6 +110,7 @@ struct hash *hash_create(void (*destructor)(void *))
 
 	h->table_size = MIN_SIZE;
 	h->nentries = 0;
+	h->nremoved = 0;
 
 	if (destructor == NULL)
 		destructor = dumb_destructor;
@@ -142,7 +144,7 @@ void *hash_get(struct hash *h, const char *key)
 
 	pos = murmurhash2(key, strlen(key)) % h->table_size;
 
-	for (;;) {
+	for (int i = 0; i < h->table_size; i++) {
 		entry = h->entries + pos;
 		if (entry->in_use == NEVER) {
 			/* We got to an entry that was never used, no match. */
@@ -151,12 +153,16 @@ void *hash_get(struct hash *h, const char *key)
 				strcmp(key, entry->key) == 0) {
 			/* The key matches. */
 			return entry->value;
-		} else {
-			/* We use linear probing for now */
-			pos = (pos + 1) % h->table_size;
 		}
+
+		/* The key doesn't match this entry, continue with linear probing. */
+		pos = (pos + 1) % h->table_size;
 	}
 
+	/* We went through the entire table and did not find the key.
+	 * Note this is a pathological case that we don't expect to happen
+	 * under normal operation, since we resize the table so there are
+	 * always some never-used slots. */
 	return NULL;
 }
 
@@ -170,25 +176,33 @@ static bool _hash_set(struct hash *h, char *key, void *value)
 
 	pos = murmurhash2(key, strlen(key)) % h->table_size;
 
-	for (;;) {
+	for (int i = 0; i < h->table_size; i++) {
 		entry = h->entries + pos;
-		if (entry->in_use != IN_USE) {
+		if (entry->in_use == NEVER) {
+			/* Use only fresh entries; reusing removed ones
+			 * introduces significant complications when combined
+			 * with linear probing. */
 			entry->in_use = IN_USE;
 			entry->key = key;
 			entry->value = value;
 			h->nentries++;
 			return true;
-		} else if (strcmp(key, entry->key) == 0) {
+		} else if (entry->in_use == IN_USE &&
+				strcmp(key, entry->key) == 0) {
 			/* The key matches, override the value. */
 			h->destructor(entry->value);
 			entry->value = value;
 			return true;
-		} else {
-			/* The key doesn't match, linear probing. */
-			pos = (pos + 1) % h->table_size;
 		}
+
+		/* The key doesn't match this entry, continue with linear probing. */
+		pos = (pos + 1) % h->table_size;
 	}
 
+	/* We went through the entire table and found neither the key nor
+	 * a never-used entry to take. Note this is a pathological case that
+	 * we don't expect to happen under normal operation, since we resize
+	 * the table so there are always some never-used slots. */
 	return false;
 }
 
@@ -198,9 +212,10 @@ static bool resize_table(struct hash *h, size_t new_size)
 	struct entry *old_entries, *e;
 	size_t old_size;
 
+	/* Do not resize below minimum size; however, continue anyway to clear
+	 * up removed entries. */
 	if (new_size < MIN_SIZE) {
-		/* Do not resize below minimum size */
-		return true;
+		new_size = MIN_SIZE;
 	}
 
 	old_entries = h->entries;
@@ -213,6 +228,7 @@ static bool resize_table(struct hash *h, size_t new_size)
 	memset(h->entries, 0, sizeof(struct entry) * new_size);
 	h->table_size = new_size;
 	h->nentries = 0;
+	h->nremoved = 0;
 
 	/* Insert the old entries into the new table. We use the internal
 	 * version _hash_set() to avoid copying the keys again. */
@@ -227,26 +243,53 @@ static bool resize_table(struct hash *h, size_t new_size)
 	return true;
 }
 
+static bool auto_resize_table(struct hash *h) {
+	/* Slots in the table are taken by both valid and removed entries.
+	 * Since we don't reuse removed slots, we need to also take them into
+	 * consideration and shrink if we are accumulating too many.
+	 *
+	 * This function always resizes to <number of entries> * 2, which
+	 * preserves the "at least 30% usable" property both when growing and
+	 * shrinking, and makes calculations simpler.
+	 */
+
+	/* Maintain at least 30% usable entries. */
+	float usable_ratio = (float) (h->table_size - h->nentries - h->nremoved)
+		/ h->table_size;
+	if (usable_ratio < 0.3) {
+		return resize_table(h, h->nentries * 2);
+	}
+
+	/* If we have less than 30% used (by valid entries), compact. */
+	float entries_ratio = (float) h->nentries / h->table_size;
+	if (h->table_size > MIN_SIZE && entries_ratio < 0.3) {
+		return resize_table(h, h->nentries * 2);
+	}
+
+	return true;
+}
+
+
 bool hash_set(struct hash *h, const char *key, void *value)
 {
-	if ((float) h->nentries / h->table_size > 0.7) {
-		/* If we're over 70% full, grow the table by 30% */
-		if (!resize_table(h, h->table_size * 1.3))
-			return false;
+	bool r = _hash_set(h, strdup(key), value);
+
+	if (!auto_resize_table(h)) {
+		return false;
 	}
 
-	return _hash_set(h, strdup(key), value);
+	return r;
 }
 
-
 bool hash_del(struct hash *h, const char *key)
 {
 	size_t pos;
 	struct entry *entry;
+	bool found = false;
 
 	pos = murmurhash2(key, strlen(key)) % h->table_size;
 
-	for (;;) {
+	for (int i = 0; i < h->table_size; i++) {
 		entry = h->entries + pos;
 		if (entry->in_use == NEVER) {
 			/* We got to a never-used entry, not found. */
@@ -254,29 +297,34 @@ bool hash_del(struct hash *h, const char *key)
 		} else if (entry->in_use == IN_USE &&
 				strcmp(key, entry->key) == 0) {
 			/* The key matches, remove it. */
-			free(entry->key);
-			h->destructor(entry->value);
-			entry->key = NULL;
-			entry->value = NULL;
-			entry->in_use = REMOVED;
+			found = true;
 			break;
-		} else {
-			/* The key doesn't match, linear probing. */
-			pos = (pos + 1) % h->table_size;
 		}
+
+		/* The key doesn't match this entry, continue with linear probing. */
+		pos = (pos + 1) % h->table_size;
 	}
 
-	if (h->table_size > MIN_SIZE &&
-			(float) h->nentries / h->table_size < 0.5) {
-		/* If we're over 50% free, shrink. */
-		if (!resize_table(h, h->table_size * 0.8))
-			return false;
+	if (!found) {
+		return false;
 	}
 
-	return true;
+	/* Remove the entry */
+	free(entry->key);
+	h->destructor(entry->value);
+	entry->key = NULL;
+	entry->value = NULL;
+	entry->in_use = REMOVED;
+	h->nentries--;
+	h->nremoved++;
 
+	if (!auto_resize_table(h))
+		return false;
+
+	return true;
 }
 
+
 /* Generic, simple cache.
  *
  * It is implemented using a hash table and manipulating it directly when
@@ -436,7 +484,7 @@ bool cache_del(struct cache *c, const char *key)
 		free(e->key);
 		e->key = NULL;
 		e->value = NULL;
-		e->in_use = REMOVED;
+		e->in_use = NEVER;
 		pthread_rwlock_unlock(&c->lock);
 		return true;
 	}
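
For context, a minimal usage sketch of the fixed table; the signatures
match this file, while hash.h and hash_free() are assumed from the rest
of libfiu:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "hash.h"	/* libfiu's internal hash header (assumed) */

int main(void)
{
	/* free() is invoked on each value when it is replaced or
	 * removed. */
	struct hash *h = hash_create(free);

	hash_set(h, "enospc", strdup("v1"));
	hash_del(h, "enospc");
	hash_set(h, "enospc", strdup("v2"));

	/* With this patch, the del + set sequence leaves exactly one
	 * live entry, so the lookup sees the latest value ("v2"). */
	printf("%s\n", (char *) hash_get(h, "enospc"));

	hash_free(h);	/* assumed table destructor */
	return 0;
}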