原文:Exercise 37: Hashmaps
哈希表(HashMap、HashTable以及Dictionary)广泛用于许多动态编程语言来储存键值对的数据。哈希表通过在键上执行“哈希”运算产生整数,之后使用它来寻找相应的桶来获取或储存值。它是非常快速的使用数据结构,因为它适用于任何数据并且易于实现。
HashMap
HashTable
Dictionary
下面是哈希表(也叫作字典)的一个使用示例:
fruit_weights = {'Apples': 10, 'Oranges': 100, 'Grapes': 1.0} for key, value in fruit_weights.items(): print key, "=", value
几乎所有现代语言都具备这种特性,所以许多人写完代码都不知道它实际上如何工作。通过在C中创建Hashmap数据结构,我会向你展示它的工作原理。我会从头文件开始,来谈论整个数据结构。
Hashmap
#ifndef _lcthw_Hashmap_h #define _lcthw_Hashmap_h #include <stdint.h> #include <lcthw/darray.h> #define DEFAULT_NUMBER_OF_BUCKETS 100 typedef int (*Hashmap_compare)(void *a, void *b); typedef uint32_t (*Hashmap_hash)(void *key); typedef struct Hashmap { DArray *buckets; Hashmap_compare compare; Hashmap_hash hash; } Hashmap; typedef struct HashmapNode { void *key; void *data; uint32_t hash; } HashmapNode; typedef int (*Hashmap_traverse_cb)(HashmapNode *node); Hashmap *Hashmap_create(Hashmap_compare compare, Hashmap_hash); void Hashmap_destroy(Hashmap *map); int Hashmap_set(Hashmap *map, void *key, void *data); void *Hashmap_get(Hashmap *map, void *key); int Hashmap_traverse(Hashmap *map, Hashmap_traverse_cb traverse_cb); void *Hashmap_delete(Hashmap *map, void *key); #endif
这个结构就是Hashmap,含有许多HashmapNode节点。观察Hashmap你会看到它类似这样:
HashmapNode
DArray *buckets
一个动态数组,设置为100个桶的固定大小。每个桶会含有一个DArray,来实际存档HashmapNode对。
DArray
Hashmap_compare compare
这是一个比较函数,被Hashmap用于实际用过键寻找元素。它应该和其它的比较函数类似,并且默认设置为bstrcmp来比较字符串。
bstrcmp
Hashmap_hash
这是哈希函数,它用于接收键,处理它的内容,之后产生一个uint32_t索引数值。之后你会看到默认的实现。
uint32_t
这些告诉了你数据如何存储,但是用作buckets的DArray还没有创建。要记住它具有二层结构;
buckets
HashMapNode由下面三个元素组成:
HashMapNode
void *key
键值对的键。
void *value
键值对的值。
uint32_t hash
计算出的哈希值,它用于使查找该节点更加迅速,只要判断键是否相等。
有文件的剩余部分没有新的东西,所以我现在可以向你展示hashmap.c的实现了:
hashmap.c
#undef NDEBUG #include <stdint.h> #include <lcthw/hashmap.h> #include <lcthw/dbg.h> #include <lcthw/bstrlib.h> static int default_compare(void *a, void *b) { return bstrcmp((bstring)a, (bstring)b); } /** * Simple Bob Jenkins's hash algorithm taken from the * wikipedia description. */ static uint32_t default_hash(void *a) { size_t len = blength((bstring)a); char *key = bdata((bstring)a); uint32_t hash = 0; uint32_t i = 0; for(hash = i = 0; i < len; ++i) { hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6); } hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } Hashmap *Hashmap_create(Hashmap_compare compare, Hashmap_hash hash) { Hashmap *map = calloc(1, sizeof(Hashmap)); check_mem(map); map->compare = compare == NULL ? default_compare : compare; map->hash = hash == NULL ? default_hash : hash; map->buckets = DArray_create(sizeof(DArray *), DEFAULT_NUMBER_OF_BUCKETS); map->buckets->end = map->buckets->max; // fake out expanding it check_mem(map->buckets); return map; error: if(map) { Hashmap_destroy(map); } return NULL; } void Hashmap_destroy(Hashmap *map) { int i = 0; int j = 0; if(map) { if(map->buckets) { for(i = 0; i < DArray_count(map->buckets); i++) { DArray *bucket = DArray_get(map->buckets, i); if(bucket) { for(j = 0; j < DArray_count(bucket); j++) { free(DArray_get(bucket, j)); } DArray_destroy(bucket); } } DArray_destroy(map->buckets); } free(map); } } static inline HashmapNode *Hashmap_node_create(int hash, void *key, void *data) { HashmapNode *node = calloc(1, sizeof(HashmapNode)); check_mem(node); node->key = key; node->data = data; node->hash = hash; return node; error: return NULL; } static inline DArray *Hashmap_find_bucket(Hashmap *map, void *key, int create, uint32_t *hash_out) { uint32_t hash = map->hash(key); int bucket_n = hash % DEFAULT_NUMBER_OF_BUCKETS; check(bucket_n >= 0, "Invalid bucket found: %d", bucket_n); *hash_out = hash; // store it for the return so the caller can use it DArray *bucket = DArray_get(map->buckets, bucket_n); if(!bucket && create) { // new bucket, set it up bucket = DArray_create(sizeof(void *), DEFAULT_NUMBER_OF_BUCKETS); check_mem(bucket); DArray_set(map->buckets, bucket_n, bucket); } return bucket; error: return NULL; } int Hashmap_set(Hashmap *map, void *key, void *data) { uint32_t hash = 0; DArray *bucket = Hashmap_find_bucket(map, key, 1, &hash); check(bucket, "Error can't create bucket."); HashmapNode *node = Hashmap_node_create(hash, key, data); check_mem(node); DArray_push(bucket, node); return 0; error: return -1; } static inline int Hashmap_get_node(Hashmap *map, uint32_t hash, DArray *bucket, void *key) { int i = 0; for(i = 0; i < DArray_end(bucket); i++) { debug("TRY: %d", i); HashmapNode *node = DArray_get(bucket, i); if(node->hash == hash && map->compare(node->key, key) == 0) { return i; } } return -1; } void *Hashmap_get(Hashmap *map, void *key) { uint32_t hash = 0; DArray *bucket = Hashmap_find_bucket(map, key, 0, &hash); if(!bucket) return NULL; int i = Hashmap_get_node(map, hash, bucket, key); if(i == -1) return NULL; HashmapNode *node = DArray_get(bucket, i); check(node != NULL, "Failed to get node from bucket when it should exist."); return node->data; error: // fallthrough return NULL; } int Hashmap_traverse(Hashmap *map, Hashmap_traverse_cb traverse_cb) { int i = 0; int j = 0; int rc = 0; for(i = 0; i < DArray_count(map->buckets); i++) { DArray *bucket = DArray_get(map->buckets, i); if(bucket) { for(j = 0; j < DArray_count(bucket); j++) { HashmapNode *node = DArray_get(bucket, j); rc = traverse_cb(node); if(rc != 0) return rc; } } } return 0; } void *Hashmap_delete(Hashmap *map, void *key) { uint32_t hash = 0; DArray *bucket = Hashmap_find_bucket(map, key, 0, &hash); if(!bucket) return NULL; int i = Hashmap_get_node(map, hash, bucket, key); if(i == -1) return NULL; HashmapNode *node = DArray_get(bucket, i); void *data = node->data; free(node); HashmapNode *ending = DArray_pop(bucket); if(ending != node) { // alright looks like it's not the last one, swap it DArray_set(bucket, i, ending); } return data; }
这个实现中并没有什么复杂的东西,但是default_hash和Hashmap_find_bucket需要一些解释。当你使用Hashmap_create时,你可以传入任何定制的比较和哈希函数。但是如果你没有则会使用default_compare和default_hash函数。
default_hash
Hashmap_find_bucket
Hashmap_create
default_compare
需要观察的第一件事,是default_hash的行为。这是一个简单的哈希函数,叫做“Jenkins hash”,以Bob Jenkins的名字命名。我从维基百科上获得了这个算法。它仅仅遍历键(bstring)的每个字节来计算哈希,以便得出uint32_t的结果。它使用一些加法和异或运算来实现。
bstring
哈希函数有很多中,它们具有不同的特性,然而一旦你选择了一种,就需要一种方法来使用它找到正确的桶。Hashmap_find_bucket像这样实现它:
map->hash(key)
hash % DEFAULT_NUMBER_OF_BUCKETS
create
hash_out
其它函数都使用Hashmap_find_bucket来完成工作:
hash
key
你需要学习的唯一一个其他函数是Hashmap_travers,它仅仅遍历每个桶,对于任何含有值的桶,在每个值上调用traverse_cb。这就是扫描整个Hashmap的办法。
Hashmap_travers
traverse_cb
最后你需要编写单元测试,对于所有这些操作:
#include "minunit.h" #include <lcthw/hashmap.h> #include <assert.h> #include <lcthw/bstrlib.h> Hashmap *map = NULL; static int traverse_called = 0; struct tagbstring test1 = bsStatic("test data 1"); struct tagbstring test2 = bsStatic("test data 2"); struct tagbstring test3 = bsStatic("xest data 3"); struct tagbstring expect1 = bsStatic("THE VALUE 1"); struct tagbstring expect2 = bsStatic("THE VALUE 2"); struct tagbstring expect3 = bsStatic("THE VALUE 3"); static int traverse_good_cb(HashmapNode *node) { debug("KEY: %s", bdata((bstring)node->key)); traverse_called++; return 0; } static int traverse_fail_cb(HashmapNode *node) { debug("KEY: %s", bdata((bstring)node->key)); traverse_called++; if(traverse_called == 2) { return 1; } else { return 0; } } char *test_create() { map = Hashmap_create(NULL, NULL); mu_assert(map != NULL, "Failed to create map."); return NULL; } char *test_destroy() { Hashmap_destroy(map); return NULL; } char *test_get_set() { int rc = Hashmap_set(map, &test1, &expect1); mu_assert(rc == 0, "Failed to set &test1"); bstring result = Hashmap_get(map, &test1); mu_assert(result == &expect1, "Wrong value for test1."); rc = Hashmap_set(map, &test2, &expect2); mu_assert(rc == 0, "Failed to set test2"); result = Hashmap_get(map, &test2); mu_assert(result == &expect2, "Wrong value for test2."); rc = Hashmap_set(map, &test3, &expect3); mu_assert(rc == 0, "Failed to set test3"); result = Hashmap_get(map, &test3); mu_assert(result == &expect3, "Wrong value for test3."); return NULL; } char *test_traverse() { int rc = Hashmap_traverse(map, traverse_good_cb); mu_assert(rc == 0, "Failed to traverse."); mu_assert(traverse_called == 3, "Wrong count traverse."); traverse_called = 0; rc = Hashmap_traverse(map, traverse_fail_cb); mu_assert(rc == 1, "Failed to traverse."); mu_assert(traverse_called == 2, "Wrong count traverse for fail."); return NULL; } char *test_delete() { bstring deleted = (bstring)Hashmap_delete(map, &test1); mu_assert(deleted != NULL, "Got NULL on delete."); mu_assert(deleted == &expect1, "Should get test1"); bstring result = Hashmap_get(map, &test1); mu_assert(result == NULL, "Should delete."); deleted = (bstring)Hashmap_delete(map, &test2); mu_assert(deleted != NULL, "Got NULL on delete."); mu_assert(deleted == &expect2, "Should get test2"); result = Hashmap_get(map, &test2); mu_assert(result == NULL, "Should delete."); deleted = (bstring)Hashmap_delete(map, &test3); mu_assert(deleted != NULL, "Got NULL on delete."); mu_assert(deleted == &expect3, "Should get test3"); result = Hashmap_get(map, &test3); mu_assert(result == NULL, "Should delete."); return NULL; } char *all_tests() { mu_suite_start(); mu_run_test(test_create); mu_run_test(test_get_set); mu_run_test(test_traverse); mu_run_test(test_delete); mu_run_test(test_destroy); return NULL; } RUN_TESTS(all_tests);
需要学习的唯一一件事情就是我在单元测试的顶端使用了bstring的特性来创建静态字符串用于测试。我使用tagbstring和bsStatic在7~13行创建他们。
tagbstring
bsStatic
这是一个非常简单的Hashmap实现,就像书中的大多数其他数据结构那样。我的目标不是让你以非常快的速度来掌握数据结构。通常这些讨论起来非常复杂,并且会让你偏离真正的基础和实用的数据结构。我的目标是提供一个易于理解的起始点,然后再改进或理解它们如何实现。
对于这和练习,下面是你能够用于改进这个实现的方法:
像往常一样,你需要浏览每个函数,并且使之健壮。Hashmap也可以使用一些调试设置,来执行不变量检查。
Copyright© 2013-2020
All Rights Reserved 京ICP备2023019179号-8