在处理电脑中文件名的存储问题时,为了节约存储空间,需要完成字符串的去重操作,前提得实现字符串的快速查找,之前的做法是采用平衡二叉树存储,但是操作比较复杂,而且树结点中指针及其他额外的变量占用了较多的空间,在网上查了许多字符串哈希算法,最后选了个最简单性能较好的BKDR哈希算法,使用哈希链表处理冲突,这种操作很简单,效率也比之前的平衡二叉树要高,但是这种方法失去了字符串的顺序性,虽然这种顺序性目前没有被利用,但是以后对字符串进行压缩时可能会利用这种顺序。
目前的哈希表长是固定的还在试验,装填因子维持在0.65左右,冲突链最长没超过10。
// strHash.h #pragma once #include <string.h> #include <stdlib.h> typedef struct STR_NODE { struct STR_NODE* next; unsigned char nodeCite; /* 结点引用数超过255后设置此结点不能删除 */ unsigned char strLength; /* 字符串长度大于255时设置此长度变量不可用 */ unsigned char tag; char str[1]; static const unsigned char TAG_CANNOTERASE = 0B00000001; /* 结点不可被删除 */ static const unsigned char TAG_CANNOTUSELENGTH = 0B00000010; /* 长度变量不可用 */ static const unsigned char TAG_CONTAINNOANSI = 0B00000100; /* 含非ANSI字符 */ static const unsigned char TAG_CHECKED = 0B00001000; /* 此结点已被查询过 */ static const unsigned char TAG_CHECKEDANDOK = 0B00010000; /* 此结点已被查询过且符合要求 */ /* 应调用此函数获取准确的字符串长度 */ size_t _strLength() { return tag & TAG_CANNOTUSELENGTH ? strlen(str) : strLength; } /* 新建结点 */ static STR_NODE* newNode(const char* str, size_t strLength, unsigned char tag) { PSTR_NODE newNode = (PSTR_NODE)malloc(sizeof(STR_NODE) + strLength); if (!newNode) exit(1); newNode->next = NULL; newNode->nodeCite = 1; newNode->strLength = strLength; newNode->tag = tag; memcpy(newNode->str, str, strLength); newNode->str[strLength] = 0; return newNode; } }STR_NODE, *PSTR_NODE; /* 哈希表头结点 */ typedef struct { PSTR_NODE first; }STR_HASH_TABLE_HEAD_NODE; class StrHashTable { public: StrHashTable(size_t size); ~StrHashTable(); PSTR_NODE insert(const char* str, size_t strLength, unsigned char tag); void erase(const char* str); private: static const size_t DEFAULT_SIZE_BIG = 218357; /* 默认主文件名哈希表长 */ static const size_t DEFAULT_SIZE_SMALL = 1627; /* 默认后缀名哈希表长 */ size_t size; STR_HASH_TABLE_HEAD_NODE* table; size_t BKDRHash(const char* str); PSTR_NODE deleteNode(PSTR_NODE& node); /* 返回删除操作后待删除结点前驱的新后继 */ }; // strHash.cpp #include "strHash.h" #include <iostream> StrHashTable::StrHashTable(size_t size) { this->size = size; table = (STR_HASH_TABLE_HEAD_NODE*)malloc((size + 1) * sizeof(STR_HASH_TABLE_HEAD_NODE)); if (!table) exit(1); for (size_t i = 0; i <= size; ++i) table[i].first = NULL; } StrHashTable::~StrHashTable() { if (table) { for (size_t i = 0; i < size; ++i) { PSTR_NODE p = table[i].first; PSTR_NODE temp = NULL; while (p) { temp = p; p = p->next; free(temp); temp = NULL; } } free(table); table = NULL; } } PSTR_NODE StrHashTable::insert(const char* str, size_t strLength, unsigned char tag) { if (!str) return NULL; size_t addr = BKDRHash(str) % size; PSTR_NODE p = table[addr].first; while (p) { if (strcmp(str, p->str) == 0) break; p = p->next; } if (p) { if (p->tag & STR_NODE::TAG_CANNOTERASE); else if (p->nodeCite == 255) p->tag |= STR_NODE::TAG_CANNOTERASE; else p->nodeCite++; } else { p = STR_NODE::newNode(str, strLength, tag); p->next = table[addr].first; table[addr].first = p; } return p; } void StrHashTable::erase(const char* str) { if (!str) return; /* 找到结点 */ size_t addr = BKDRHash(str) % size; PSTR_NODE p = table[addr].first; while (p) { if (strcmp(str, p->str) == 0) break; p = p->next; } if (!p) return; /* 是头结点 */ if (table[addr].first == p) table[addr].first = deleteNode(p); /* 是中间结点 */ else { PSTR_NODE pre = table[addr].first; while (pre->next != p) pre = pre->next; pre->next = deleteNode(p); } } size_t StrHashTable::BKDRHash(const char* str) { register size_t key = 0; size_t c = 0; while (c = (size_t)*str++) key = key * 131 + c; return key; } PSTR_NODE StrHashTable::deleteNode(PSTR_NODE& node) { if (node->tag & STR_NODE::TAG_CANNOTERASE) return node; /* 不可删除 */ node->nodeCite--; if (node->nodeCite == 0) { PSTR_NODE retNode = node->next; free(node); node = NULL; return retNode; } return node; }