Spiga

散列(四):.NET Framework Hashtable类

2012-09-02 16:30:09

  Hash是各种数据结构中最为重要的数据结构之一;Hash具有O(1)的数据检索效率;Hash的空间开销却通常很大,以空间换取时间;适用于读取操作频繁,写入操作很少的操作类型。

一、存储结构

  .NET中hash以数组的方式保存,数组的大小为素数,当数组中存在的数据项个数大于数组容量的扩张因子倍数时,数组进行扩张。
  我们先看一下Hashtable类中的构造函数和一些重要数据项,如以下代码:

public class Hashtable : IDictionary, ISerializable, IDeserializationCallback, ICloneable {
	//其他部分
	private struct bucket {
		public Object key;  //实际保存的键对象,如果key==buckets或者null,认为该项数据已经被清除
		public Object val;  //实际保存的值对象
		public int hash_coll;   //hash值最高位(31bit)表示在该位上是否发生过冲突
								//即按照hash算法,在插入数据时如果计算到该位置发现hash_coll(31bit)==1,那么认为该位置曾经发生过冲突,
								//从当前位置使用hash算法计算后可能会继续发现可用的数据
	}
	private  bucket[] buckets;   //保存数据的hash桶
	private  int count; //hash表中实际包含数据项个数   
	private  int occupancy;  //在hashtable中标记的冲突位个数       
	private  int loadsize;
	private  float loadFactor;  //装载因子,如果 当前hash表中的实际数据项个数*装载因子>hash表中保存数据项的桶的大小
								//那么当前保存数据项桶需要扩容至少当前容量2倍的素数大小
	//其他部分

	//capacity为hash表中可以保存的数据项的期望值,注意:这个值并不等于保存数据项的hash桶的实际大小
	public Hashtable(int capacity, float loadFactor) {
		if (capacity < 0)
			throw new ArgumentOutOfRangeException("capacity", Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
		if (!(loadFactor >= 0.1f && loadFactor <= 1.0f))
			throw new ArgumentOutOfRangeException("loadFactor", Environment.GetResourceString("ArgumentOutOfRange_HashtableLoadFactor", .1, 1.0));
		this.loadFactor = 0.72f * loadFactor;
		double rawsize = capacity / this.loadFactor;    //根据填充因子,计算hash桶的真实大小值
		if (rawsize > Int32.MaxValue)
			throw new ArgumentException(Environment.GetResourceString("Arg_HTCapacityOverflow"));
		//如果rawsize>11, hash桶的大小为不小于rawsize值的最小素数
		int hashsize = (rawsize > 11) ? HashHelpers.GetPrime((int)rawsize) : 11;
		buckets = new bucket[hashsize];
		loadsize = (int)(this.loadFactor * hashsize);
		isWriterInProgress = false;
		BCLDebug.Assert( loadsize < hashsize, "Invalid hashtable loadsize!");
	}
	//其他部分
}

在上面代码中保存数据hash桶为一个结构类型的数组;初始化使的大小为不小于rawsize的最小素数,其中rawsize的值会用0.72这个数来优化,而这个素数要不通过GetPrime方法来获得(下面将介绍这个方法),要不为最小值11。
  接下来我们来分许GetPrime方法,代码如下:

internal static class HashHelpers
{
	internal static readonly int[] primes = {
		3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919,
		1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591,
		17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437,
		187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263,
		1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369};

	internal static bool IsPrime(int candidate) 
	{
		if ((candidate & 1) != 0) 
		{
			int limit = (int)Math.Sqrt (candidate);
			for (int divisor = 3; divisor <= limit; divisor+=2)
			{
				if ((candidate % divisor) == 0)
					return false;
			}
			return true;
		}
		return (candidate == 2);
	}

	//找出不小于min的最小素数
	internal static int GetPrime(int min) 
	{
		if (min < 0)
			throw new ArgumentException(Environment.GetResourceString("Arg_HTCapacityOverflow"));

		//对以小于799369的素数通过查表获得
		for (int i = 0; i < primes.Length; i++) 
		{
			int prime = primes[i];
			if (prime >= min) return prime;
		}

		//对于较大的素数通过计算获得
		for (int i = (min | 1); i < Int32.MaxValue;i+=2) 
		{
			if (IsPrime(i))
				return i;
		}
		return min;
	}
}

这个方法返回不小于参数的最小素数。该方法提供两种方式返回结果:如果是小于799369的素数,直接通过查表来过得,此处的表指HashHelpers类的静态变量primes;如果是大于799369则通过计算来过得,请读者在上面代码中仔细分析此算法的实现。  
  再接下来我们看一下Hashtable类扩容的操作,代码如下:

//修改hash桶的大小
private void rehash( int newsize ) {
	occupancy=0;
	bucket[] newBuckets = new bucket[newsize];  //创建新的hash桶
	//对原有桶中每个数据重新计算其hash值,并将其放入到新创建的newBuckets中
	//对于之前已经被清除但是仍然残留在桶中的数据,不会进行填充操作
	int nb;
	for (nb = 0; nb < buckets.Length; nb++){
		bucket oldb = buckets[nb];
		if ((oldb.key != null) && (oldb.key != buckets)){
			putEntry(newBuckets, oldb.key, oldb.val, oldb.hash_coll & 0x7FFFFFFF);
		}
	}
	Thread.BeginCriticalRegion();            
	isWriterInProgress = true;
	buckets = newBuckets;
	loadsize = (int)(loadFactor * newsize);     //根据填充因子,计算当前hash可装数据项个数的上限,当数据项个数大于这个上限时,hash桶需要扩容
	UpdateVersion();
	isWriterInProgress = false;            
	Thread.EndCriticalRegion();            
	BCLDebug.Assert(loadsize < newsize, "Our current implementaion means this is not possible.");
	return;
}

//直接将(key, nvalue, hashcode)放入到hash桶中
private void putEntry (bucket[] newBuckets, Object key, Object nvalue, int hashcode)
{
	BCLDebug.Assert(hashcode >= 0, "hashcode >= 0");  

	uint seed = (uint) hashcode;
	uint incr = (uint)(1 + (((seed >> 5) + 1) % ((uint)newBuckets.Length - 1)));
	int bucketNumber = (int) (seed % (uint)newBuckets.Length);            
	do {

		if ((newBuckets[bucketNumber].key == null) || (newBuckets[bucketNumber].key == buckets)) {      //找到了一个空位置
			newBuckets[bucketNumber].val = nvalue;
			newBuckets[bucketNumber].key = key;
			newBuckets[bucketNumber].hash_coll |= hashcode;
			return;
		}
		
		if( newBuckets[bucketNumber].hash_coll >= 0 ) {     //如果冲突需要将该位置的hash_coll的32bit设置为1,表示被冲突位置后继有数
		newBuckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
			occupancy++;
		}
		bucketNumber = (int)(((long)bucketNumber + incr) % (uint)newBuckets.Length);      //增加incr参长             
	} while (true);
}

  注意:.NET优化认为0.72最为合适,所有数据将会被复制。

二、hash算法

  公式:h(key, n) = h1(key) + n*h2(key)。Key为要hash的关键字,N为由于发生冲突而需要重新hash的次数。
其中:h1(key) = “具体对象的HashCode数值”,h2(key) = 1 + (((h1(key) >> 5) + 1) % (hashsize - 1));Hashsize为保存hash数据的桶数组的大小。

三、插入删除算法

  计算seed值和遇到冲突时的hash值增量步长;
  根据hash值计算应放入的桶的位置;
  如果当前桶位置为空,如果当前桶位置冲突位为空,直接插入数据,如果当前桶位置冲突位被置位,记录下当前位置,继续向后扫描是否有重复的key插入;
  如果当前桶位置已经存在数据,将当前冲突位设置为1,hash值增加增量,重新判断hash桶是否可用。

public virtual void Add(Object key, Object value) {
	Insert(key, value, true);
}

private void Insert (Object key, Object nvalue, bool add) {
	if (key == null) {
		throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
	}
	if (count >= loadsize) {
		expand();
	}
	else if(occupancy > loadsize && count > 100) {
		rehash();
	}
	
	uint seed;
	uint incr;
	uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
	int  ntry = 0;
	int emptySlotNumber = -1; 
	int bucketNumber = (int) (seed % (uint)buckets.Length);
	do {
		//emptySlotNumber被设置为第一个曾经发过冲突的可用的桶位置的下标
		//为了保存整个hash桶中不存在冲突,需要继续检索下去
		if (emptySlotNumber == -1 && (buckets[bucketNumber].key == buckets) && (buckets[bucketNumber].hash_coll < 0))//(((buckets[bucketNumber].hash_coll & unchecked(0x80000000))!=0)))
			emptySlotNumber = bucketNumber;
		//当buckets[bucketNumber].key == null时表示找到了一个可用的位置
		//buckets[bucketNumber].key == buckets且buckets[bucketNumber]的冲突检查位没有被设置时,标识冲突检查已经结束,当前或者之前已经找到了一个可用的位置
		if ((buckets[bucketNumber].key == null) || 
			(buckets[bucketNumber].key == buckets && ((buckets[bucketNumber].hash_coll & unchecked(0x80000000))==0))) {

		   //使用第一个找到的虽然曾经发生过碰撞,但是目前可用的位置
			if (emptySlotNumber != -1) // Reuse slot
				bucketNumber = emptySlotNumber;
		  //添填数据到桶中,然后直接返回
			Thread.BeginCriticalRegion();            
			isWriterInProgress = true;                    
			buckets[bucketNumber].val = nvalue;
			buckets[bucketNumber].key  = key;
			buckets[bucketNumber].hash_coll |= (int) hashcode;
			count++;
			UpdateVersion();
			isWriterInProgress = false;                                        
			Thread.EndCriticalRegion();                    
			return;
		}
	   //在桶中找到了相同的key,更加add参数决定抛出异常,或者更新已经存在的数据
		if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) && 
			KeyEquals (buckets[bucketNumber].key, key)) {
			if (add) {
				throw new ArgumentException(Environment.GetResourceString("Argument_AddingDuplicate__", buckets[bucketNumber].key, key));
			}
			Thread.BeginCriticalRegion();          
			isWriterInProgress = true;                    
			buckets[bucketNumber].val = nvalue;
			UpdateVersion();                    
			isWriterInProgress = false;        
			Thread.EndCriticalRegion();                    
			return;
		}

		  if (emptySlotNumber == -1) {
			if( buckets[bucketNumber].hash_coll >= 0 ) {
				buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
				occupancy++;
			}
		}
		bucketNumber = (int)(((long)bucketNumber + incr) % (uint)buckets.Length);  //bucketNumber的大小增加incr             
	} while (++ntry < buckets.Length);

	if (emptySlotNumber != -1)
	{
		Thread.BeginCriticalRegion();           
		isWriterInProgress = true;                    
		buckets[emptySlotNumber].val = nvalue;
		buckets[emptySlotNumber].key  = key;
		buckets[emptySlotNumber].hash_coll |= (int) hashcode;
		count++;
		UpdateVersion();                
		isWriterInProgress = false;                     
		Thread.EndCriticalRegion();                
		return;
	}
	BCLDebug.Assert(false, "hash table insert failed!  Load factor too high, or our double hashing function is incorrect.");
	throw new InvalidOperationException(Environment.GetResourceString("InvalidOperation_HashInsertFailed"));
}

  下面是删除操作的代码:

//删除指定key对象
//函数采用了快速删除的方法,对于冲突链上的删除,仅仅通过将hash冲突标识设置成1就可快速完成,省去了重新生成hash冲突的开销
public virtual void Remove(Object key) {
	if (key == null) {
		throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
	}
	uint seed;
	uint incr;
	uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
	int ntry = 0;
	
	bucket b;
	int bn = (int) (seed % (uint)buckets.Length); 
	do {
		b = buckets[bn];
		if (((b.hash_coll & 0x7FFFFFFF) == hashcode) && 
			KeyEquals (b.key, key)) {   //判断是否找到要删除的key的数据项
			Thread.BeginCriticalRegion();                    
			isWriterInProgress = true;
			// Clear hash_coll field, then key, then value
			buckets[bn].hash_coll &= unchecked((int)0x80000000);    //清楚hash_coll项,仅将冲突标识位(31bit)设置为1
			if (buckets[bn].hash_coll != 0) {
				buckets[bn].key = buckets;
			} 
			else {
				buckets[bn].key = null;
			}
			buckets[bn].val = null;  
			count--;
			UpdateVersion();
			isWriterInProgress = false;                    
			Thread.EndCriticalRegion();                    
			return;
		}
		bn = (int)(((long)bn + incr) % (uint)buckets.Length);   //增开incr参长                            
	} while (b.hash_coll < 0 && ++ntry < buckets.Length);
}

四、通过key读取value算法

  计算seed值和遇到冲突时的hash值增量步长;
  获取当前hash桶的快照;
  根据hash值获取第一个可能的桶,注意这里面采用了lockfree类似的方法保证访问同步;
  如果找到相等的key则返回结构,否则hashcode += 增量步长,重新查找。

public virtual Object this[Object key]
{
	get
	{
		if (key == null)
		{
			throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
		}
		uint seed;
		uint incr;
		//拿到当前hash的快照
		//即使在获取value值的过程中系统调用rehash,当前取值操作仍然针对表的数据快照进行。
		bucket[] lbuckets = buckets;
		uint hashcode = InitHash(key, lbuckets.Length, out seed, out incr);
		int ntry = 0;

		bucket b;
		int bucketNumber = (int)(seed % (uint)lbuckets.Length);
		do
		{
			int currentversion;
			//类似与lockfree的带有自旋特性的取值操作
			//CLR内存模型保证lockfree可以正常进行
			int spinCount = 0;
			do
			{
				currentversion = version;
				b = lbuckets[bucketNumber];

				if ((++spinCount) % 8 == 0)
				{
					Thread.Sleep(1);
				}
			} while (isWriterInProgress || (currentversion != version));
			//判断b是否包含了所需要的key
			if (b.key == null)
			{
				return null;
			}
			if (((b.hash_coll & 0x7FFFFFFF) == hashcode) &&
				KeyEquals(b.key, key))
				return b.val;
			bucketNumber = (int)(((long)bucketNumber + incr) % (uint)lbuckets.Length);
		} while (b.hash_coll < 0 && ++ntry < lbuckets.Length);
		return null;
	}
	set
	{
		Insert(key, value, false);
	}
}