Hashing (Part 4): The .NET Framework Hashtable Class
2012-09-02 16:30:09
A hash table is one of the most important data structures: it retrieves data in O(1) time, but its space overhead is usually large, trading space for time. It is best suited to workloads where reads are frequent and writes are rare.
1. Storage Structure
In .NET, a hash table stores its data in an array whose length is a prime number; when the number of entries exceeds the load factor times the array's capacity, the array is expanded.
Let's first look at the constructor of the Hashtable class and some of its important fields:
public class Hashtable : IDictionary, ISerializable, IDeserializationCallback, ICloneable {
//... other members ...
private struct bucket {
public Object key; //the stored key; if key == buckets (used as a sentinel) or null, the entry has been removed
public Object val; //the stored value
public int hash_coll; //the low 31 bits hold the hash value; the highest bit (bit 31) marks whether a collision has ever occurred at this slot
//i.e. if, during insertion, the probe reaches a slot whose collision bit is set,
//the probe sequence must continue from here, because usable entries may still be found further along
}
private bucket[] buckets; //the hash buckets that hold the data
private int count; //number of entries actually stored in the table
private int occupancy; //number of collision bits that have been set in the table
private int loadsize;
private float loadFactor; //load factor: when the number of entries in the table exceeds loadFactor * (size of the bucket array),
//the bucket array must be expanded to a prime at least twice its current size
//... other members ...
//capacity is the expected number of entries the table can hold; note: it does NOT equal the actual size of the bucket array
public Hashtable(int capacity, float loadFactor) {
if (capacity < 0)
throw new ArgumentOutOfRangeException("capacity", Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
if (!(loadFactor >= 0.1f && loadFactor <= 1.0f))
throw new ArgumentOutOfRangeException("loadFactor", Environment.GetResourceString("ArgumentOutOfRange_HashtableLoadFactor", .1, 1.0));
this.loadFactor = 0.72f * loadFactor;
double rawsize = capacity / this.loadFactor; //compute the real size of the bucket array from the load factor
if (rawsize > Int32.MaxValue)
throw new ArgumentException(Environment.GetResourceString("Arg_HTCapacityOverflow"));
//if rawsize > 11, the bucket array size is a prime not less than rawsize, obtained from GetPrime
int hashsize = (rawsize > 11) ? HashHelpers.GetPrime((int)rawsize) : 11;
buckets = new bucket[hashsize];
loadsize = (int)(this.loadFactor * hashsize);
isWriterInProgress = false;
BCLDebug.Assert( loadsize < hashsize, "Invalid hashtable loadsize!");
}
//... other members ...
}
In the code above, the buckets that hold the data form an array of a struct type. At initialization the array size is a prime not less than rawsize, where rawsize is derived from the requested capacity using the 0.72 factor; that prime is either obtained from the GetPrime method (described below) or defaults to the minimum value of 11.
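To make the sizing arithmetic concrete, here is a minimal sketch that traces the constructor by hand for new Hashtable(10, 1.0f); the class and variable names are ours, not part of the framework:

    using System;

    class SizingTrace
    {
        static void Main()
        {
            // new Hashtable(capacity: 10, loadFactor: 1.0f), traced step by step:
            float effective = 0.72f * 1.0f;              // this.loadFactor == 0.72
            double rawsize = 10 / effective;             // ~= 13.89
            // rawsize > 11, so the constructor calls HashHelpers.GetPrime(13),
            // which returns 17 from the prime table shown below
            int hashsize = 17;
            int loadsize = (int)(0.72f * hashsize);      // == 12: the 13th Add will trigger an expansion
            Console.WriteLine("rawsize={0}, hashsize={1}, loadsize={2}", rawsize, hashsize, loadsize);
        }
    }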
Next, let's analyze the GetPrime method:
internal static class HashHelpers
{
internal static readonly int[] primes = {
3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919,
1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591,
17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437,
187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263,
1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369};
internal static bool IsPrime(int candidate)
{
if ((candidate & 1) != 0)
{
int limit = (int)Math.Sqrt (candidate);
for (int divisor = 3; divisor <= limit; divisor+=2)
{
if ((candidate % divisor) == 0)
return false;
}
return true;
}
return (candidate == 2);
}
//return a prime not less than min (for values covered by the table, this is the smallest table entry >= min)
internal static int GetPrime(int min)
{
if (min < 0)
throw new ArgumentException(Environment.GetResourceString("Arg_HTCapacityOverflow"));
//values up to 7199369 (the last table entry) are resolved by table lookup
for (int i = 0; i < primes.Length; i++)
{
int prime = primes[i];
if (prime >= min) return prime;
}
//larger values are resolved by computation (trial division)
for (int i = (min | 1); i < Int32.MaxValue;i+=2)
{
if (IsPrime(i))
return i;
}
return min;
}
}
This method returns a prime not less than its argument, in one of two ways: for values no greater than 7199369 the answer comes straight from a table lookup (the table being the static primes array of HashHelpers), which yields the smallest table entry that is >= min rather than the smallest such prime overall; for larger values the answer is computed by trial division over odd candidates. Readers are encouraged to trace through that part of the code above.
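A small sketch of the table-lookup gap just mentioned; since HashHelpers is internal, the table branch is re-declared locally under our own names, with only a shortened copy of the prime table:

    using System;

    static class GetPrimeSketch
    {
        // Shortened local copy of the prime table (HashHelpers cannot be called directly).
        static readonly int[] primes = { 3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107 };

        static int GetPrimeFromTable(int min)
        {
            foreach (int p in primes)
                if (p >= min) return p;
            throw new ArgumentOutOfRangeException("min", "outside the shortened table");
        }

        static void Main()
        {
            Console.WriteLine(GetPrimeFromTable(12)); // 17 -- not 13: the table is sparse, its entries grow roughly geometrically
            Console.WriteLine(GetPrimeFromTable(90)); // 107 -- not 97
        }
    }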
Next, let's look at how the Hashtable grows its bucket array:
//resize the bucket array
private void rehash( int newsize ) {
occupancy=0;
bucket[] newBuckets = new bucket[newsize]; //创建新的hash桶
//every entry in the old buckets is placed into the newly created newBuckets, its position recomputed from its stored hash value
//entries that were removed earlier but still linger in the old buckets (as sentinels) are not carried over
int nb;
for (nb = 0; nb < buckets.Length; nb++){
bucket oldb = buckets[nb];
if ((oldb.key != null) && (oldb.key != buckets)){
putEntry(newBuckets, oldb.key, oldb.val, oldb.hash_coll & 0x7FFFFFFF);
}
}
Thread.BeginCriticalRegion();
isWriterInProgress = true;
buckets = newBuckets;
loadsize = (int)(loadFactor * newsize); //recompute, from the load factor, the upper bound on the number of entries; once the count exceeds this bound the bucket array must grow again
UpdateVersion();
isWriterInProgress = false;
Thread.EndCriticalRegion();
BCLDebug.Assert(loadsize < newsize, "Our current implementaion means this is not possible.");
return;
}
//place (key, nvalue, hashcode) directly into the given bucket array
private void putEntry (bucket[] newBuckets, Object key, Object nvalue, int hashcode)
{
BCLDebug.Assert(hashcode >= 0, "hashcode >= 0");
uint seed = (uint) hashcode;
uint incr = (uint)(1 + (((seed >> 5) + 1) % ((uint)newBuckets.Length - 1)));
int bucketNumber = (int) (seed % (uint)newBuckets.Length);
do {
if ((newBuckets[bucketNumber].key == null) || (newBuckets[bucketNumber].key == buckets)) { //found an empty slot
newBuckets[bucketNumber].val = nvalue;
newBuckets[bucketNumber].key = key;
newBuckets[bucketNumber].hash_coll |= hashcode;
return;
}
if( newBuckets[bucketNumber].hash_coll >= 0 ) { //on a collision, set the highest bit (bit 31) of hash_coll here to signal that entries may live beyond this slot
newBuckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
occupancy++;
}
bucketNumber = (int)(((long)bucketNumber + incr) % (uint)newBuckets.Length); //advance by the increment incr
} while (true);
}
Note: .NET's tuning treats 0.72 as the most suitable effective load factor, and during a rehash every surviving entry is copied into the new bucket array.
2. The Hash Algorithm
The formula is h(key, n) = h1(key) + n*h2(key), where key is the key being hashed and n is the number of re-probes forced by collisions.
Here h1(key) is the object's own hash code, and h2(key) = 1 + (((h1(key) >> 5) + 1) % (hashsize - 1)), where hashsize is the length of the bucket array that stores the data; every probe position is taken modulo hashsize.
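A minimal sketch of the resulting probe sequence (the class and variable names are ours; h1 is taken from the key's GetHashCode with the sign bit stripped, as the Hashtable code does):

    using System;

    static class ProbeSequenceSketch
    {
        static void Main()
        {
            uint hashsize = 11;                                          // bucket array length (a prime)
            uint h1 = (uint)("example-key".GetHashCode() & 0x7FFFFFFF);  // h1(key): the object's hash code, sign bit stripped
            uint h2 = 1 + (((h1 >> 5) + 1) % (hashsize - 1));            // h2(key): the collision increment

            // h(key, n) = (h1 + n*h2) mod hashsize; because 1 <= h2 < hashsize and hashsize is prime,
            // the sequence visits every bucket before it repeats.
            for (uint n = 0; n < 5; n++)
                Console.WriteLine("probe {0}: bucket {1}", n, (h1 + n * h2) % hashsize);
        }
    }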
3. Insertion and Deletion
Compute the seed value and the increment used as the probe step when a collision occurs;
from the hash value, compute the bucket the entry should go into;
if that slot is empty: when its collision bit is clear, insert the data directly; when its collision bit is set, remember the slot and keep scanning forward to make sure the same key is not already present further along;
if the slot already holds data, set its collision bit, advance the hash position by the increment, and test the next bucket.
public virtual void Add(Object key, Object value) {
Insert(key, value, true);
}
private void Insert (Object key, Object nvalue, bool add) {
if (key == null) {
throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
}
if (count >= loadsize) {
expand();
}
else if(occupancy > loadsize && count > 100) {
rehash();
}
uint seed;
uint incr;
uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
int ntry = 0;
int emptySlotNumber = -1;
int bucketNumber = (int) (seed % (uint)buckets.Length);
do {
//emptySlotNumber records the first reusable slot found on the probe chain: one whose collision bit is set but whose entry has been removed
//to guarantee the same key does not already exist further along the chain, the scan has to continue
if (emptySlotNumber == -1 && (buckets[bucketNumber].key == buckets) && (buckets[bucketNumber].hash_coll < 0))//(((buckets[bucketNumber].hash_coll & unchecked(0x80000000))!=0)))
emptySlotNumber = bucketNumber;
//buckets[bucketNumber].key == null means a never-used slot has been found
//buckets[bucketNumber].key == buckets with the collision bit clear means the probe chain ends here, so a usable slot has been found either here or earlier
if ((buckets[bucketNumber].key == null) ||
(buckets[bucketNumber].key == buckets && ((buckets[bucketNumber].hash_coll & unchecked(0x80000000))==0))) {
//reuse the first slot found that had collisions in the past but is free now
if (emptySlotNumber != -1) // Reuse slot
bucketNumber = emptySlotNumber;
//fill the data into the bucket, then return
Thread.BeginCriticalRegion();
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
buckets[bucketNumber].key = key;
buckets[bucketNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
Thread.EndCriticalRegion();
return;
}
//the same key already exists in the bucket: depending on the add parameter, either throw an exception or update the existing entry
if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) &&
KeyEquals (buckets[bucketNumber].key, key)) {
if (add) {
throw new ArgumentException(Environment.GetResourceString("Argument_AddingDuplicate__", buckets[bucketNumber].key, key));
}
Thread.BeginCriticalRegion();
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
UpdateVersion();
isWriterInProgress = false;
Thread.EndCriticalRegion();
return;
}
if (emptySlotNumber == -1) {
if( buckets[bucketNumber].hash_coll >= 0 ) {
buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
occupancy++;
}
}
bucketNumber = (int)(((long)bucketNumber + incr) % (uint)buckets.Length); //advance bucketNumber by incr
} while (++ntry < buckets.Length);
if (emptySlotNumber != -1)
{
Thread.BeginCriticalRegion();
isWriterInProgress = true;
buckets[emptySlotNumber].val = nvalue;
buckets[emptySlotNumber].key = key;
buckets[emptySlotNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
Thread.EndCriticalRegion();
return;
}
BCLDebug.Assert(false, "hash table insert failed! Load factor too high, or our double hashing function is incorrect.");
throw new InvalidOperationException(Environment.GetResourceString("InvalidOperation_HashInsertFailed"));
}
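Hence the add flag: Hashtable.Add calls Insert with add == true and throws on a duplicate key, while the indexer setter (see section 4 below) passes false and overwrites the existing value. For example:

    using System;
    using System.Collections;

    class AddVersusIndexer
    {
        static void Main()
        {
            Hashtable table = new Hashtable();
            table.Add("color", "red");         // Insert(key, value, true)
            table["color"] = "blue";           // Insert(key, value, false): silently replaces the value

            try
            {
                table.Add("color", "green");   // duplicate key with add == true
            }
            catch (ArgumentException)
            {
                Console.WriteLine("Add on an existing key throws, as in the KeyEquals branch above");
            }
        }
    }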
The code for the remove operation follows:
//remove the entry with the given key
//deletion is done lazily: for a slot on a collision chain it is enough to keep the collision flag (bit 31) set, which avoids the cost of rebuilding the chain
public virtual void Remove(Object key) {
if (key == null) {
throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
}
uint seed;
uint incr;
uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
int ntry = 0;
bucket b;
int bn = (int) (seed % (uint)buckets.Length);
do {
b = buckets[bn];
if (((b.hash_coll & 0x7FFFFFFF) == hashcode) &&
KeyEquals (b.key, key)) { //check whether this is the entry with the key to be removed
Thread.BeginCriticalRegion();
isWriterInProgress = true;
// Clear hash_coll field, then key, then value
buckets[bn].hash_coll &= unchecked((int)0x80000000); //clear hash_coll except for the collision flag (bit 31)
if (buckets[bn].hash_coll != 0) {
buckets[bn].key = buckets;
}
else {
buckets[bn].key = null;
}
buckets[bn].val = null;
count--;
UpdateVersion();
isWriterInProgress = false;
Thread.EndCriticalRegion();
return;
}
bn = (int)(((long)bn + incr) % (uint)buckets.Length); //advance by the increment incr
} while (b.hash_coll < 0 && ++ntry < buckets.Length);
}
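From the caller's side this lazy deletion is invisible: the slot simply reads as empty while the collision chain stays walkable. A short usage example (standard documented Hashtable behavior):

    using System;
    using System.Collections;

    class RemoveExample
    {
        static void Main()
        {
            Hashtable table = new Hashtable();
            table["a"] = 1;
            table["b"] = 2;

            table.Remove("a");                          // slot content cleared; any collision bit is left in place
            Console.WriteLine(table["a"] == null);      // True: a missing key simply reads as null
            Console.WriteLine(table.ContainsKey("a"));  // False
            Console.WriteLine(table["b"]);              // 2: other entries are untouched
            table.Remove("missing");                    // removing an absent key is a no-op, no exception
        }
    }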
4. Looking Up a Value by Key
Compute the seed value and the increment used as the probe step when a collision occurs;
take a snapshot of the current bucket array;
from the hash value, locate the first candidate bucket; note that a lock-free style of synchronization is used to keep reads consistent;
if an equal key is found, return the result; otherwise advance the hash position by the increment and keep probing.
public virtual Object this[Object key]
{
get
{
if (key == null)
{
throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
}
uint seed;
uint incr;
//take a snapshot of the current bucket array
//even if the system rehashes while the value is being fetched, this read keeps operating on the snapshot of the table
bucket[] lbuckets = buckets;
uint hashcode = InitHash(key, lbuckets.Length, out seed, out incr);
int ntry = 0;
bucket b;
int bucketNumber = (int)(seed % (uint)lbuckets.Length);
do
{
int currentversion;
//a lock-free style read with spinning
//the CLR memory model guarantees this lock-free read behaves correctly
int spinCount = 0;
do
{
currentversion = version;
b = lbuckets[bucketNumber];
if ((++spinCount) % 8 == 0)
{
Thread.Sleep(1);
}
} while (isWriterInProgress || (currentversion != version));
//check whether b holds the requested key
if (b.key == null)
{
return null;
}
if (((b.hash_coll & 0x7FFFFFFF) == hashcode) &&
KeyEquals(b.key, key))
return b.val;
bucketNumber = (int)(((long)bucketNumber + incr) % (uint)lbuckets.Length);
} while (b.hash_coll < 0 && ++ntry < lbuckets.Length);
return null;
}
set
{
Insert(key, value, false);
}
}
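This retry loop on version and isWriterInProgress is what allows a Hashtable to be read by many threads while a single thread writes, without the readers taking a lock (multiple concurrent writers still need external synchronization or Hashtable.Synchronized). A sketch of that usage pattern, with illustrative thread bodies and counts:

    using System;
    using System.Collections;
    using System.Threading;

    class OneWriterManyReaders
    {
        static void Main()
        {
            Hashtable table = new Hashtable();

            // The single writer: Insert/rehash set isWriterInProgress and bump version around each change.
            Thread writer = new Thread(() =>
            {
                for (int i = 0; i < 10000; i++)
                    table[i] = i * i;
            });

            // A reader: the get accessor above spins until it sees a stable version,
            // so each read returns either null (not yet written) or a consistent value.
            Thread reader = new Thread(() =>
            {
                int seen = 0;
                for (int i = 0; i < 10000; i++)
                {
                    if (table[i] != null) seen++;   // null just means the writer has not reached i yet
                }
                Console.WriteLine("reader observed " + seen + " entries");
            });

            writer.Start();
            reader.Start();
            writer.Join();
            reader.Join();
            Console.WriteLine("entries written: " + table.Count);
        }
    }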