Spiga

串(四):.NET Framework String类的实现(下)

2012-04-25 15:38:22

一、SubString方法的实现

如果理解了上一篇文章中介绍的FastAllocateString方法与wstrcpy方法,理解SubString方法的实现就会比较容易。此方法的两个重载都会先调用参数校验函数InternalSubStringWithChecks;如果校验通过、没有抛出异常,就会继续调用内部实现的核心方法InternalSubString,由它取出[startIndex, startIndex + length)区间内(共length个字符)的子串。具体代码如下:

//Returns the tail of this string: everything from startIndex through the last character.
public String Substring (int startIndex) {
	int tailLength = Length - startIndex;
	return this.Substring (startIndex, tailLength);
}

//Returns the substring of length characters that begins at startIndex.
//Validation and the actual copy are delegated to InternalSubStringWithChecks,
//which is called with fAlwaysCopy set to false.
public String Substring (int startIndex, int length) {
	const bool forceCopy = false;
	String slice = InternalSubStringWithChecks(startIndex, length, forceCopy);
	return slice;
}

//Validates startIndex and length up front, then hands off to InternalSubString.
//Throws ArgumentOutOfRangeException when startIndex is negative or greater than the string length,
//when length is negative, or when startIndex + length would run past the end of the string.
//A zero length returns String.Empty without calling InternalSubString.
internal String InternalSubStringWithChecks (int startIndex, int length, bool fAlwaysCopy) {
	int available = Length;

	//The checks run in a fixed order so that the first violated rule decides the reported argument.
	if (0 > startIndex) {
		throw new ArgumentOutOfRangeException("startIndex", Environment.GetResourceString("ArgumentOutOfRange_StartIndex"));
	}
	if (available < startIndex) {
		throw new ArgumentOutOfRangeException("startIndex", Environment.GetResourceString("ArgumentOutOfRange_StartIndexLargerThanLength"));
	}
	if (0 > length) {
		throw new ArgumentOutOfRangeException("length", Environment.GetResourceString("ArgumentOutOfRange_NegativeLength"));
	}
	//Both operands are non-negative here, so the subtraction cannot overflow.
	if (length > available - startIndex) {
		throw new ArgumentOutOfRangeException("length", Environment.GetResourceString("ArgumentOutOfRange_IndexLength"));
	}

	return (length == 0)
		? String.Empty
		: InternalSubString(startIndex, length, fAlwaysCopy);
}

//Core substring routine: returns the length characters that start at startIndex,
//i.e. the half-open range [startIndex, startIndex + length).
//The arguments are only checked by debug assertions in this method.
//When the range covers the whole string and fAlwaysCopy is false, this instance itself is returned;
//otherwise a new string is allocated and the characters are copied into it.
unsafe string InternalSubString(int startIndex, int length, bool fAlwaysCopy) {
	BCLDebug.Assert( startIndex >= 0 && startIndex <= this.Length, "StartIndex is out of range!");
	BCLDebug.Assert( length >= 0 && startIndex <= this.Length - length, "length is out of range!");

	bool coversWholeString = (startIndex == 0) && (length == this.Length);
	if (!coversWholeString || fAlwaysCopy) {
		String copy = FastAllocateString(length);

		//Pin both strings while the raw character copy runs.
		fixed (char* target = &copy.m_firstChar, origin = &this.m_firstChar) {
			wstrcpy(target, origin + startIndex, length);
		}

		return copy;
	}

	return this;
}

二、Split方法的实现

String类中Split方法提供了6种不同的重载。我选其中一个重载的实现来介绍Split方法,其他重载要么实现代码相似,要么间接调用此重载。

//Splits this string at every character contained in separator and returns at most count pieces.
//  separator - characters to split on; MakeSeparatorList decides what a null or empty array means
//  count     - maximum number of pieces to return; 0 yields an empty array, 1 yields the whole string
//  options   - StringSplitOptions.RemoveEmptyEntries drops empty pieces, None keeps them
//Throws ArgumentOutOfRangeException for a negative count and ArgumentException for an
//options value outside the range None..RemoveEmptyEntries.
public String[] Split(char[] separator, int count, StringSplitOptions options) {
	if (count < 0) {
		throw new ArgumentOutOfRangeException("count",
			Environment.GetResourceString("ArgumentOutOfRange_NegativeCount"));
	}
	if (options < StringSplitOptions.None || options > StringSplitOptions.RemoveEmptyEntries) {
		throw new ArgumentException(Environment.GetResourceString("Arg_EnumIllegalVal", (int)options));
	}

	bool dropEmptyPieces = (options == StringSplitOptions.RemoveEmptyEntries);

	//Nothing can be returned when no pieces are wanted, or when the only piece would be empty and is dropped.
	if (count == 0) {
		return new String[0];
	}
	if (dropEmptyPieces && this.Length == 0) {
		return new String[0];
	}

	//Collect the index of every separator occurrence; there can be at most Length of them.
	int[] separatorPositions = new int[Length];
	int separatorsFound = MakeSeparatorList(separator, ref separatorPositions);

	//No separator present, or the caller asked for a single piece: the result is this string alone.
	if (count == 1 || separatorsFound == 0) {
		return new String[] { this };
	}

	return dropEmptyPieces
		? InternalSplitOmitEmptyEntries(separatorPositions, null, separatorsFound, count)
		: InternalSplitKeepEmptyEntries(separatorPositions, null, separatorsFound, count);
}

从上面方法可以看出,其核心代码中有3个辅助方法MakeSeparatorList、InternalSplitOmitEmptyEntries和InternalSplitKeepEmptyEntries。

MakeSeparatorList方法的功能是:在原字符串中查找separator中的分割符(separator为null或为空时以空白字符作为分割符),把每个分割符出现的下标依次记录到sepList中,并返回找到的分割符个数。实现也比较容易:

//Scans this string for separator characters and records the index of each hit in sepList.
//  separator - characters that count as separators; when null or empty, any character for which
//              Char.IsWhiteSpace returns true is treated as a separator
//  sepList   - receives the hit indices in ascending order; scanning stops once it is full
//Returns the number of indices written to sepList.
private unsafe int MakeSeparatorList(char[] separator, ref int[] sepList) {
	int hits = 0;
	bool splitOnWhiteSpace = (separator == null) || (separator.Length == 0);

	if (splitOnWhiteSpace) {
		fixed (char* text = &this.m_firstChar) {
			int pos = 0;
			while (pos < Length && hits < sepList.Length) {
				if (Char.IsWhiteSpace(text[pos])) {
					sepList[hits] = pos;
					hits++;
				}
				pos++;
			}
		}
	}
	else {
		int capacity = sepList.Length;
		int separatorCount = separator.Length;

		fixed (char* chars = &this.m_firstChar, separators = separator) {
			for (int pos = 0; pos < Length && hits < capacity; pos++) {
				char current = chars[pos];
				//Record the position on the first matching separator; later ones need not be tried.
				for (int k = 0; k < separatorCount; k++) {
					if (separators[k] == current) {
						sepList[hits] = pos;
						hits++;
						break;
					}
				}
			}
		}
	}

	return hits;
}

InternalSplitOmitEmptyEntries方法和InternalSplitKeepEmptyEntries方法功能相似,前者在分割后省略空项,后者则保留空项。具体实现如下:

//Splits this string at the separator positions in sepList, keeping empty pieces.
//  sepList     - indices at which separators start; entries are read in order
//  lengthList  - per-separator lengths; when null every separator is taken to be one character long
//  numReplaces - number of valid entries in sepList
//  count       - maximum number of pieces to return (asserted to be >= 2)
//Returns an array with one more element than the number of separators actually consumed.
private String[] InternalSplitKeepEmptyEntries(Int32 [] sepList, Int32 [] lengthList, Int32 numReplaces, int count) {
	BCLDebug.Assert( count >= 2, "Count>=2");

	//The final slot holds the unsplit remainder, so at most count - 1 separators are consumed.
	int separatorBudget = count - 1;
	int usedSeparators = (numReplaces < separatorBudget) ? numReplaces : separatorBudget;

	String[] pieces = new String[usedSeparators + 1];
	int cursor = 0;
	int filled = 0;

	for (int s = 0; s < usedSeparators && cursor < Length; s++) {
		//Take the text between the current position and the s-th separator, then step over the separator.
		int separatorAt = sepList[s];
		pieces[filled] = Substring(cursor, separatorAt - cursor);
		filled++;

		int separatorLength = (lengthList == null) ? 1 : lengthList[s];
		cursor = separatorAt + separatorLength;
	}

	if (cursor < Length && usedSeparators >= 0) {
		//Everything after the last consumed separator becomes the final piece.
		pieces[filled] = Substring(cursor);
	}
	else if (filled == usedSeparators) {
		//The string ended right after a separator: the final piece is empty.
		pieces[filled] = String.Empty;
	}

	return pieces;
}

//Splits this string at the separator positions in sepList, dropping empty pieces,
//and returns at most count pieces.
//  sepList     - indices at which separators start; entries are read in order
//  lengthList  - per-separator lengths; when null every separator is taken to be one character long
//  numReplaces - number of valid entries in sepList
//  count       - maximum number of pieces to return (asserted to be >= 2)
private String[] InternalSplitOmitEmptyEntries(Int32[] sepList, Int32[] lengthList, Int32 numReplaces, int count) {
	BCLDebug.Assert( count >= 2, "Count>=2");

	//Upper bound on the number of pieces (separator count + 1, capped at count); size the working array to it
	int maxItems = (numReplaces < count) ? (numReplaces+1): count ;
	String[] splitStrings = new String[maxItems];

	int currIndex = 0;
	int arrIndex = 0;

	for(int i=0; i< numReplaces && currIndex < Length; i++) {
		if( sepList[i]-currIndex > 0) { 
			//The i-th separator lies beyond the current position: store the non-empty text in between
			splitStrings[arrIndex++] = Substring(currIndex, sepList[i]-currIndex );                        
		}
		//Step over the separator whether or not a piece was stored
		currIndex=sepList[i] + ((lengthList == null) ? 1 : lengthList[i]);
		if( arrIndex == count -1 )  {
			//Only one result slot is left: skip separators that directly follow the current position
			//(note that ++i advances the outer loop index inside the condition), then stop splitting
			while( i < numReplaces - 1 && currIndex == sepList[++i]) { 
				currIndex += ((lengthList == null) ? 1 : lengthList[i]);
			}
			break;
		}
	}

	BCLDebug.Assert( arrIndex < maxItems, "arrIndex < maxItems");                              
	//Store whatever remains after the last processed separator as the final piece
	if (currIndex< Length) {            
		splitStrings[arrIndex++] = Substring(currIndex);
	}

	//If fewer than maxItems pieces were produced, copy them into an array of exactly that size.
	//This costs one extra allocation and copy, and the working array becomes garbage.
	String[] stringArray = splitStrings;
	if( arrIndex!= maxItems) { 
		stringArray = new String[arrIndex];
		for( int j = 0; j < arrIndex; j++) {
			stringArray[j] = splitStrings[j];
		}   
	}
	return stringArray;
}

三、Join方法的实现

Join方法与Split方法功能相反,它通过连接符将数组中的字符串拼接成一个字符串。我们来看它的具体实现:

//Concatenates every element of value, inserting separator between consecutive elements.
//Throws ArgumentNullException when value is null.
public static String Join (String separator, String[] value) {
	if (null == value) {
		throw new ArgumentNullException("value");
	}

	int elementCount = value.Length;
	return Join(separator, value, 0, elementCount);
}

//Concatenates count elements of value, starting at startIndex, with separator between them.
//  separator  - text placed between elements; null is treated as String.Empty
//  value      - source array; null elements contribute no characters
//  startIndex - index of the first element to join
//  count      - number of elements to join; 0 yields String.Empty
//Throws ArgumentNullException when value is null, ArgumentOutOfRangeException when startIndex or
//count is negative or the range runs past the end of value, and OutOfMemoryException when the
//computed result length is negative after overflow.
public unsafe static String Join(String separator, String[] value, int startIndex, int count) {
	String glue = (separator == null) ? String.Empty : separator;

	if (value == null) {
		throw new ArgumentNullException("value");
	}
	if (startIndex < 0) {
		throw new ArgumentOutOfRangeException("startIndex", Environment.GetResourceString("ArgumentOutOfRange_StartIndex"));
	}
	if (count < 0) {
		throw new ArgumentOutOfRangeException("count", Environment.GetResourceString("ArgumentOutOfRange_NegativeCount"));
	}
	if (startIndex > value.Length - count) {
		throw new ArgumentOutOfRangeException("startIndex", Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
	}
	if (count == 0) {
		return String.Empty;
	}

	//Index of the last element that takes part in the join.
	int lastIndex = startIndex + count - 1;

	//First pass: add up the lengths of all elements, then the separators between them.
	int totalLength = 0;
	for (int i = startIndex; i <= lastIndex; i++) {
		String piece = value[i];
		if (piece != null) {
			totalLength += piece.Length;
		}
	}
	totalLength += (count - 1) * glue.Length;

	//A negative total means the additions above wrapped around.
	if (totalLength < 0 || (totalLength + 1) < 0) {
		throw new OutOfMemoryException();
	}
	if (totalLength == 0) {
		return String.Empty;
	}

	//Second pass: allocate the result once and append the pieces straight into it.
	string joined = FastAllocateString( totalLength );
	fixed (char* joinedChars = &joined.m_firstChar) {
		//NOTE(review): per the original author's notes, UnSafeCharBuffer writes into the supplied
		//buffer without copying it and treats it as initially empty - confirm against its source.
		UnSafeCharBuffer writer = new UnSafeCharBuffer( joinedChars, totalLength);

		for (int i = startIndex; i <= lastIndex; i++) {
			if (i != startIndex) {
				writer.AppendString( glue );
			}
			writer.AppendString( value[i] );
		}
		BCLDebug.Assert(*(joinedChars + writer.Length) == '\0', "String must be null-terminated!");
	}

	return joined;
}

四、Compare方法的实现

Compare方法的比较分为有语言性的与非语言性的两种。如果是有语言性的比较,会根据文化区域(当前文化区域或固定文化区域)的设置进行比较;如果是非语言性的比较,则进行序数比较。该方法重载较多,本篇文章也只挑其中一个重载来介绍它的实现:

//Compares strA and strB using the rules selected by comparisonType.
//  CurrentCulture / CurrentCultureIgnoreCase     - linguistic comparison through CultureInfo.CurrentCulture
//  InvariantCulture / InvariantCultureIgnoreCase - linguistic comparison through CultureInfo.InvariantCulture
//  Ordinal                                       - ordinal comparison through CompareOrdinalHelper
//  OrdinalIgnoreCase                             - native ordinal compare when both strings are ASCII,
//                                                  otherwise TextInfo.CompareOrdinalIgnoreCase
//Returns 0 when both arguments are the same reference (including both null), -1 when only strA is
//null, 1 when only strB is null, and otherwise the result of the selected comparison.
//Throws ArgumentException when comparisonType is outside the range CurrentCulture..OrdinalIgnoreCase.
public static int Compare(String strA, String strB, StringComparison comparisonType) {
	if( comparisonType < StringComparison.CurrentCulture || comparisonType > StringComparison.OrdinalIgnoreCase) {
		throw new ArgumentException(Environment.GetResourceString("NotSupported_StringComparison"), "comparisonType");
	}
	//Reference equality short-circuits the comparison and also covers the null/null case.
	if ((Object)strA == (Object)strB) {
		return 0;
	}
	//A null string orders before any non-null string.
	if( strA == null) {
		return -1;
	}
	if( strB == null) {
		return 1;
	}

	//Dispatch on the requested comparison type.
	switch (comparisonType) {
		case StringComparison.CurrentCulture:
			return CultureInfo.CurrentCulture.CompareInfo.Compare(strA, strB, CompareOptions.None);
		case StringComparison.CurrentCultureIgnoreCase:
			return CultureInfo.CurrentCulture.CompareInfo.Compare(strA, strB, CompareOptions.IgnoreCase);
		case StringComparison.InvariantCulture:
			return CultureInfo.InvariantCulture.CompareInfo.Compare(strA, strB, CompareOptions.None);
		case StringComparison.InvariantCultureIgnoreCase:
			//Fixed: this case previously referenced an undefined identifier "c" instead of CultureInfo.
			return CultureInfo.InvariantCulture.CompareInfo.Compare(strA, strB, CompareOptions.IgnoreCase);
		case StringComparison.Ordinal:
			return CompareOrdinalHelper(strA, strB);
		case StringComparison.OrdinalIgnoreCase:
			if (strA.IsAscii() && strB.IsAscii()) {
				return (String.nativeCompareOrdinal(strA, strB, true));
			}
			return TextInfo.CompareOrdinalIgnoreCase(strA, strB);
		default:
			//Not expected to be reached because of the range check at the top; kept as a safety net.
			throw new NotSupportedException(Environment.GetResourceString("NotSupported_StringComparison"));
	}
}

从代码中我们可以得知,有语言性的比较会调用CultureInfo类和CompareInfo类,具体的实现这里就不介绍了,可以参见对应的cs文件。这里重点看一下序数比较方法CompareOrdinalHelper。为了提高效率,该方法以长度10为分界,分成两种不同的方式来进行比较,具体代码如下:

//Ordinal comparison of strA and strB (both asserted to be non-null).
//Characters are compared two at a time by reading each pair as a single int.
//While at least 10 characters of the shared length remain, an unrolled loop checks five pairs per
//iteration; once a mismatching pair is found its offset is recorded and the result is returned directly.
//The remaining (fewer than 10) characters go through a simpler pair-by-pair loop.
//Returns the difference of the first mismatching characters, or, when no mismatch is found within
//the shared length, strA.Length - strB.Length.
private unsafe static int CompareOrdinalHelper(String strA, String strB)
{
	BCLDebug.Assert(strA != null && strB != null, "strings cannot be null!");
	//Only the characters both strings have in common are compared
	int length = Math.Min(strA.Length, strB.Length);
	//Offset (in chars) of the mismatching pair inside the current 10-char window; -1 = none found
	int diffOffset = -1;        

	fixed(char* ap = strA) fixed(char* bp = strB)
	{
		char* a = ap;
		char* b = bp;

		//Fast path: advance 10 characters per iteration, comparing them as five int-sized pairs
		while (length >= 10)
		{
			if (*(int*)a != *(int*)b) { 
				diffOffset = 0; 
				break;
			}           
			if (*(int*)(a+2) != *(int*)(b+2)) {
				diffOffset = 2;
				break;
			}
			if (*(int*)(a+4) != *(int*)(b+4)) {
				diffOffset = 4;                    
				break;
			}
			if (*(int*)(a+6) != *(int*)(b+6)) {
				diffOffset = 6;
				break;
			}
			if (*(int*)(a+8) != *(int*)(b+8)) {
				diffOffset = 8;
				break;
			}
			a += 10; 
			b += 10; 
			length -= 10;
		}
		if( diffOffset != -1) {
			//Move both pointers to the pair that differed (a and b were not advanced on break)
			a += diffOffset;
			b += diffOffset;
			int order;
			//The first char of the pair decides if it differs...
			if ( (order = (int)*a - (int)*b) != 0) {
            return order;
			}
			//...otherwise the second char of the pair must be the one that differs
			BCLDebug.Assert( *(a+1) != *(b+1), "This byte must be different if we reach here!");
			return ((int)*(a+1) - (int)*(b+1));                
		}
		//Fewer than 10 characters remain: compare the rest one pair at a time
		//NOTE(review): when the remaining length is odd, the last int read covers one char beyond
		//the shared length - presumably this relies on the strings' terminating null; confirm.
		while (length > 0) {
			if (*(int*)a != *(int*)b) {
				break;
			}
			a += 2; 
			b += 2; 
			length -= 2;
		}
		//length > 0 here means the loop above stopped on a mismatching pair
		if( length > 0) { 
			int c;
			if ( (c = (int)*a - (int)*b) != 0) {
				//The first char of the pair differs
				return c;
			}
			//The first chars are equal, so the second char of the pair differs
			BCLDebug.Assert( *(a+1) != *(b+1), "This byte must be different if we reach here!");
			return ((int)*(a+1) - (int)*(b+1));                                    
		}
		//No mismatch within the shared length: the result is the difference of the two lengths
		return strA.Length - strB.Length;                        
	}
}

五、Trim方法的实现

该方法删除原字符串首尾出现的、由trimChars指定的字符,内部通过辅助方法TrimHelper来实现核心逻辑:

//Characters used by Trim, TrimStart and TrimEnd when no trim characters are supplied.
internal static readonly char[] WhitespaceChars = new char[] {
	'\u0009', '\u000A', '\u000B', '\u000C', '\u000D', '\u0020', '\u0085',
	'\u00A0', '\u1680',
	'\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005',
	'\u2006', '\u2007', '\u2008', '\u2009', '\u200A', '\u200B',
	'\u2028', '\u2029',
	'\u3000', '\uFEFF'
};

//Removes the characters listed in trimChars from both ends of this string.
//A null or empty trimChars falls back to WhitespaceChars.
public String Trim(params char[] trimChars) {
	bool useDefaults = (trimChars == null) || (trimChars.Length == 0);
	char[] charsToTrim = useDefaults ? WhitespaceChars : trimChars;
	return TrimHelper(charsToTrim, TrimBoth);
}

//Removes the characters listed in trimChars from the start of this string.
//A null or empty trimChars falls back to WhitespaceChars.
public String TrimStart(params char[] trimChars) {
	bool useDefaults = (trimChars == null) || (trimChars.Length == 0);
	char[] charsToTrim = useDefaults ? WhitespaceChars : trimChars;
	return TrimHelper(charsToTrim, TrimHead);
}

//Removes the characters listed in trimChars from the end of this string.
//A null or empty trimChars falls back to WhitespaceChars.
public String TrimEnd(params char[] trimChars) {
	bool useDefaults = (trimChars == null) || (trimChars.Length == 0);
	char[] charsToTrim = useDefaults ? WhitespaceChars : trimChars;
	return TrimHelper(charsToTrim, TrimTail);
}

//Removes the characters listed in WhitespaceChars from both ends of this string.
public String Trim() {
	char[] defaults = WhitespaceChars;
	return TrimHelper(defaults, TrimBoth);
}

//Strips characters contained in trimChars from the head and/or tail of this string.
//  trimChars - the set of characters to remove
//  trimType  - TrimTail skips the head scan, TrimHead skips the tail scan; any other value scans both ends
//Returns this instance when nothing is removed, String.Empty when nothing is left,
//and otherwise the substring between the first and last kept characters.
private String TrimHelper(char[] trimChars, int trimType) {
	int first = 0;
	int last = this.Length - 1;

	if (trimType != TrimTail) {
		//Advance first to the first character that is not in trimChars.
		while (first < this.Length) {
			char current = this[first];
			bool isTrimChar = false;
			foreach (char candidate in trimChars) {
				if (candidate == current) {
					isTrimChar = true;
					break;
				}
			}
			if (!isTrimChar) {
				break;
			}
			first++;
		}
	}

	if (trimType != TrimHead) {
		//Walk last back to the last character that is not in trimChars, never crossing first.
		last = Length - 1;
		while (last >= first) {
			char current = this[last];
			bool isTrimChar = false;
			foreach (char candidate in trimChars) {
				if (candidate == current) {
					isTrimChar = true;
					break;
				}
			}
			if (!isTrimChar) {
				break;
			}
			last--;
		}
	}

	int keptLength = last - first + 1;

	//Nothing was trimmed: hand back the same instance.
	if (keptLength == this.Length) {
		return this;
	}
	if (keptLength == 0) {
		return String.Empty;
	}
	//Return the characters from first through last.
	return InternalSubString(first, keptLength, false);
}

String类的实现就谈到这里了,还有一些方法这次没有谈到,比如IndexOf、Format、PadLeft、PadRight。像IndexOf方法,String类会调用CompareInfo类的IndexOf方法,具体请参见CompareInfo类;Format方法则是通过StringBuilder类的AppendFormat方法来实现的。其他方法就不一一介绍了,像PadLeft、PadRight这些未讲到的方法基本都是通过非托管C++代码来实现的。