CharUnicodeInfo.cs source code in C# .NET

                        Code:
                         / 4.0 / 4.0 / DEVDIV_TFS / Dev10 / Releases / RTMRel / ndp / clr / src / BCL / System / Globalization / CharUnicodeInfo.cs / 1305376 / CharUnicodeInfo.cs
                        
                        
                            // ==++== 
//
//   Copyright (c) Microsoft Corporation.  All rights reserved.
//
// ==--== 
////////////////////////////////////////////////////////////////////////////
// 
//  Class:    CharUnicodeInfo
//
//  Purpose:  This class implements a set of methods for retrieving 
//            character type information.  Character type information is
//            independent of culture and region.
//
//  Date:     August 12, 1998 
//
//////////////////////////////////////////////////////////////////////////// 
 
namespace System.Globalization {
 
    //This class has only static members and therefore doesn't need to be serialized.

    using System;
    using System.Threading; 
    using System.Runtime.InteropServices;
    using System.Runtime.CompilerServices; 
    using System.Runtime.Versioning; 
    using System.Reflection;
    using System.Diagnostics.Contracts; 


    public static class CharUnicodeInfo
    { 
        //--------------------------------------------------------------------//
        //                        Internal Information                        // 
        //-------------------------------------------------------------------// 

        // 
        // Constants and cached native pointers used to access the Unicode category data tables in charinfo.nlp.
        //
        // Inclusive bounds of the UTF-16 high (lead) and low (trail) surrogate ranges.
        internal const char  HIGH_SURROGATE_START  = '\ud800';
        internal const char  HIGH_SURROGATE_END    = '\udbff';
        internal const char  LOW_SURROGATE_START   = '\udc00';
        internal const char  LOW_SURROGATE_END     = '\udfff';

        // Byte position, inside one two-byte entry of s_pCategoriesValue, of the value to read:
        // byte 0 is the Unicode category, byte 1 is the bidi category (see InternalGetCategoryValue).
        internal const int UNICODE_CATEGORY_OFFSET = 0;
        internal const int BIDI_CATEGORY_OFFSET = 1;

        // The base pointer of the data table.
        unsafe static byte* s_pDataTable;

        // The native pointer to the 12:4:4 index table of the Unicode category data,
        // and the value table that the index table's final level points into.
        unsafe static ushort* s_pCategoryLevel1Index;
        unsafe static byte* s_pCategoriesValue;

        // The native pointer to the 12:4:4 index table of the Unicode numeric data.
        // The value of this index table is an index into the real value table stored in s_pNumericValues.
        unsafe static ushort* s_pNumericLevel1Index;

        // The numeric value table, which is indexed through s_pNumericLevel1Index.
        // Every item contains the numeric value (a double) of a character.
        // It would naturally be declared as:
        // unsafe static double* s_pNumericValues;
        // but it is kept as byte* to get around the IA64 alignment issue.  Our double data is aligned on an 8-byte
        // boundary, but the loader loads the embedded table starting at a 4-byte boundary.  This causes an alignment
        // issue since double is 8 bytes.
        unsafe static byte* s_pNumericValues;

        // The digit value table, which is indexed through s_pNumericLevel1Index.  It shares the same indices as s_pNumericValues.
        // Every item contains the decimal digit value and the digit value of a character.
        unsafe static DigitValues* s_pDigitValues;

        // Name of the resource that holds the data table.
        internal const String UNICODE_INFO_FILE_NAME = "charinfo.nlp";
        // The starting codepoint for Unicode plane 1.  Plane 1 contains 0x010000 ~ 0x01ffff.
        internal const int UNICODE_PLANE01_START = 0x10000;
 

        //
        // This is the header for the native data table that we load from UNICODE_INFO_FILE_NAME.
        // 
        // Explicit layout is used here since a syntax like char[16] cannot be used in sequential layout.
        // Overlay for the header found at the very start of the data table.  The offset fields
        // are byte offsets that the static constructor adds to the table's base pointer.
        [StructLayout(LayoutKind.Explicit)]
        internal unsafe struct UnicodeDataHeader {
            // First character of the table name.  The native field is 16 WCHARs (0x20 bytes) wide.
            [FieldOffset(0)]
            internal char TableName;    // WCHAR[16]
            // First WORD of the version.  The native field is 4 WORDs (8 bytes) wide.
            [FieldOffset(0x20)]
            internal ushort version;    // WORD[4]
            // Offset of the category index table (s_pCategoryLevel1Index).
            [FieldOffset(0x28)]
            internal uint OffsetToCategoriesIndex; // DWORD
            // Offset of the category value table (s_pCategoriesValue).
            [FieldOffset(0x2c)]
            internal uint OffsetToCategoriesValue; // DWORD
            // Offset of the numeric index table (s_pNumericLevel1Index).
            // NOTE: "Numberic" is a historical misspelling; the name is referenced by the static constructor.
            [FieldOffset(0x30)]
            internal uint OffsetToNumbericIndex; // DWORD
            // Offset of the digit value table (s_pDigitValues).
            [FieldOffset(0x34)]
            internal uint OffsetToDigitValue; // DWORD
            // Offset of the numeric value table (s_pNumericValues).
            [FieldOffset(0x38)]
            internal uint OffsetToNumbericValue; // DWORD

        }
 
        // NOTE: It's important to specify pack size here, since the size of the structure is 2 bytes.  Otherwise, 
        // the default pack size will be 4.
 
        // One entry of the digit value table (s_pDigitValues): two signed bytes per entry.
        [StructLayout(LayoutKind.Sequential, Pack=2)]
        internal struct DigitValues {
            // Value returned by InternalGetDecimalDigitValue.
            internal sbyte decimalDigit;
            // Value returned by InternalGetDigitValue.
            internal sbyte digit;
        }
 
 
        // Type initializer.  Locates the data table once and caches a pointer to each of its
        // sub-tables, so that later lookups never have to look at the header again.
        //
        [System.Security.SecuritySafeCritical]  // auto-generated
        [ResourceExposure(ResourceScope.None)]
        [ResourceConsumption(ResourceScope.Process, ResourceScope.Process)]
        unsafe static CharUnicodeInfo() {
            byte* pTable = GlobalizationAssembly.GetGlobalizationResourceBytePtr(typeof(CharUnicodeInfo).Assembly, UNICODE_INFO_FILE_NAME);
            UnicodeDataHeader* pHeader = (UnicodeDataHeader*)pTable;

            s_pDataTable = pTable;

            // Each sub-table is located by a byte offset, stored in the header, from the start of the table.
            s_pCategoryLevel1Index = (ushort*)(pTable + pHeader->OffsetToCategoriesIndex);
            s_pCategoriesValue = pTable + pHeader->OffsetToCategoriesValue;
            s_pNumericLevel1Index = (ushort*)(pTable + pHeader->OffsetToNumbericIndex);
            s_pNumericValues = pTable + pHeader->OffsetToNumbericValue;
            s_pDigitValues = (DigitValues*)(pTable + pHeader->OffsetToDigitValue);

            // Go to the native side so that its CharacterInfoTable pointer is initialized from the same table.
            nativeInitTable(pTable);
        }


        ////////////////////////////////////////////////////////////////////////
        //
        // Actions:
        // Convert the BMP character or surrogate pair starting at index to a UTF32 value.
        // This is similar to Char.ConvertToUtf32, but the difference is that
        // it does not throw exceptions when invalid surrogate characters are passed in:
        // a char that does not start a valid pair is returned as its own value.
        //
        // WARNING: since it doesn't throw an exception it CAN return a value
        //          in the surrogate range D800-DFFF, which are not legal unicode values.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static int InternalConvertToUtf32(String s, int index) {
            Contract.Assert(s != null, "s != null");
            Contract.Assert(index >= 0 && index < s.Length, "index < s.Length");

            char lead = s[index];
            // A pair needs a following char and a high surrogate in the lead position.
            if (index >= s.Length - 1 || lead < HIGH_SURROGATE_START || lead > HIGH_SURROGATE_END) {
                return (int)lead;
            }

            char trail = s[index + 1];
            if (trail < LOW_SURROGATE_START || trail > LOW_SURROGATE_END) {
                // High surrogate without a matching low surrogate: hand back the lone code unit.
                return (int)lead;
            }

            // Combine the 10 payload bits of each half and rebase onto plane 1.
            int highBits = lead - HIGH_SURROGATE_START;
            int lowBits = trail - LOW_SURROGATE_START;
            return (highBits * 0x400) + lowBits + UNICODE_PLANE01_START;
        }
 
        ////////////////////////////////////////////////////////////////////////
        //
        // Convert a character or a surrogate pair starting at index of string s
        // to UTF32 value.
        //
        //  Parameters:
        //      s       The string
        //      index   The starting index.  It can point to a BMP character or
        //              a surrogate pair.
        //      charLength  [out]   Number of chars consumed: 2 when index points
        //              to a valid surrogate pair, 1 in every other case.
        //
        // WARNING: since it doesn't throw an exception it CAN return a value
        //          in the surrogate range D800-DFFF, which are not legal unicode values.
        //
        //  Returns:
        //      The UTF32 value
        //
        ////////////////////////////////////////////////////////////////////////

        internal static int InternalConvertToUtf32(String s, int index, out int charLength) {
            Contract.Assert(s != null, "s != null");
            Contract.Assert(s.Length > 0, "s.Length > 0");
            Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

            // Assume a single code unit until a complete pair has been recognized.
            charLength = 1;

            char lead = s[index];
            if (index >= s.Length - 1 || lead < HIGH_SURROGATE_START || lead > HIGH_SURROGATE_END) {
                return (int)lead;
            }

            char trail = s[index + 1];
            if (trail < LOW_SURROGATE_START || trail > LOW_SURROGATE_END) {
                return (int)lead;
            }

            // Valid pair: two chars are consumed.
            charLength = 2;
            int highBits = lead - HIGH_SURROGATE_START;
            int lowBits = trail - LOW_SURROGATE_START;
            return (highBits * 0x400) + lowBits + UNICODE_PLANE01_START;
        }
 
        ////////////////////////////////////////////////////////////////////////
        //
        //  IsWhiteSpace
        //
        //  Determines if the character at index in s is a white space character,
        //  judged purely by its Unicode category: SpaceSeparator, LineSeparator
        //  and ParagraphSeparator count as white space, every other category does not.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static bool IsWhiteSpace(String s, int index)
        {
            Contract.Assert(s != null, "s!=null");
            Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

            UnicodeCategory category = GetUnicodeCategory(s, index);
            // In Unicode 3.0, U+2028 is the only character in the category "LineSeparator",
            // and U+2029 is the only character in the category "ParagraphSeparator".
            return category == UnicodeCategory.SpaceSeparator
                || category == UnicodeCategory.LineSeparator
                || category == UnicodeCategory.ParagraphSeparator;
        }
 

        // Determines if c is a white space character, judged purely by its Unicode category:
        // SpaceSeparator, LineSeparator and ParagraphSeparator count as white space,
        // every other category does not.
        internal static bool IsWhiteSpace(char c)
        {
            UnicodeCategory category = GetUnicodeCategory(c);
            // In Unicode 3.0, U+2028 is the only character in the category "LineSeparator",
            // and U+2029 is the only character in the category "ParagraphSeparator".
            return category == UnicodeCategory.SpaceSeparator
                || category == UnicodeCategory.LineSeparator
                || category == UnicodeCategory.ParagraphSeparator;
        }

        // 
        // This is called by the public char and string, index versions 
        //
        // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character 
        //
        // Looks up the numeric value of the code point ch by walking the three levels of the
        // numeric index table and then reading the selected entry of s_pNumericValues.
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static double InternalGetNumericValue(int ch) {
            Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
            // Level 1: the bits of ch above bit 7 select the base of a level 2 block.
            ushort level2Base = s_pNumericLevel1Index[ch >> 8];
            // Level 2: bits 4 - 7 of ch select a WORD offset, which is the base of a level 3 block.
            // Note that & has lower precedence than addition, so the parentheses are required.
            ushort level3Base = s_pNumericLevel1Index[level2Base + ((ch >> 4) & 0x000f)];
            // Level 3 entries are single bytes, each one an index into the value table.
            byte* pLevel3 = (byte*)(s_pNumericLevel1Index + level3Base);
            // Bits 0 - 3 of ch select the entry.
            byte valueIndex = pLevel3[ch & 0x000f];
#if WIN64
            // To get around the IA64 alignment issue.  Our double data is aligned on an 8-byte boundary, but the loader
            // loads the embedded table starting at a 4-byte boundary, so the double may not be 8-byte aligned.
            byte* pSourcePtr = s_pNumericValues + (valueIndex * sizeof(double));
            if (((long)pSourcePtr % 8) != 0) {
                // We are not aligned on an 8-byte boundary.  Do a copy.
                double ret;
                byte* retPtr = (byte*)&ret;
                Buffer.memcpyimpl(pSourcePtr, retPtr, sizeof(double));
                return ret;
            }
#endif
            return ((double*)s_pNumericValues)[valueIndex];
        }

        //
        // This is called by the public char and string, index versions 
        //
        // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character 
        // 
        // Returns a pointer to the DigitValues entry for the code point ch.  The entry is found
        // through the numeric index table, whose final index is shared with s_pNumericValues.
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static DigitValues* InternalGetDigitValues(int ch) {
            Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
            // Level 1: the bits of ch above bit 7 select the base of a level 2 block.
            ushort level2Base = s_pNumericLevel1Index[ch >> 8];
            // Level 2: bits 4 - 7 of ch select a WORD offset, which is the base of a level 3 block.
            // Note that & has lower precedence than addition, so the parentheses are required.
            ushort level3Base = s_pNumericLevel1Index[level2Base + ((ch >> 4) & 0x000f)];
            // Level 3 entries are single bytes; bits 0 - 3 of ch select the entry.
            byte* pLevel3 = (byte*)(s_pNumericLevel1Index + level3Base);
            byte valueIndex = pLevel3[ch & 0x000f];
            return s_pDigitValues + valueIndex;
        }

 
        // Reads the decimalDigit member of the digit table entry for the code point ch.
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static sbyte InternalGetDecimalDigitValue(int ch) {
            DigitValues* pEntry = InternalGetDigitValues(ch);
            return pEntry->decimalDigit;
        }
 
        // Reads the digit member of the digit table entry for the code point ch.
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static sbyte InternalGetDigitValue(int ch) {
            DigitValues* pEntry = InternalGetDigitValues(ch);
            return pEntry->digit;
        }

 
        ////////////////////////////////////////////////////////////////////////
        //
        //Returns the numeric value associated with a character. If the character is a fraction,
        // the return value will not be an integer. If the character does not have a numeric value, the return value is -1.
        //
        //Returns:
        //  the numeric value for the specified Unicode character.  If the character does not have a numeric value, the return value is -1.
        //Arguments:
        //      ch  a Unicode character
        //      (the String overload takes a string s and an index into it instead)
        //Exceptions (String overload only):
        //      ArgumentNullException        s is null
        //      ArgumentOutOfRangeException  index is outside of s
        //
        ////////////////////////////////////////////////////////////////////////


        [System.Security.SecuritySafeCritical]  // auto-generated
        public static double GetNumericValue(char ch) {
            // A single char is looked up as-is; no surrogate pairing is possible here.
            int codePoint = ch;
            return InternalGetNumericValue(codePoint);
        }
 

        // Numeric value of the character (or surrogate pair) that starts at index in s.
        // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
        // when index does not fall inside s.
        [System.Security.SecuritySafeCritical]  // auto-generated
        public static double GetNumericValue(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index < 0 || index >= s.Length) {
                throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
            }
            Contract.EndContractBlock();

            int codePoint = InternalConvertToUtf32(s, index);
            return InternalGetNumericValue(codePoint);
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //Returns the decimal digit value associated with a character.
        //
        // The value should be from 0 ~ 9.
        // If the character does not have a decimal digit value, the return value is -1.
        // From Unicode.org: Decimal Digits. Digits that can be used to form decimal-radix numbers.
        //Returns:
        //  the decimal digit value for the specified Unicode character.  If the character does not have a decimal digit value, the return value is -1.
        //Arguments:
        //      ch  a Unicode character
        //      (the String overload takes a string s and an index into it instead)
        //Exceptions (String overload only):
        //      ArgumentNullException        s is null
        //      ArgumentOutOfRangeException  index is outside of s
        //
        ////////////////////////////////////////////////////////////////////////


        public static int GetDecimalDigitValue(char ch) {
            // The table stores the value as a signed byte; it widens to int on return.
            sbyte decimalDigit = InternalGetDecimalDigitValue(ch);
            return decimalDigit;
        }
 

        // Decimal digit value of the character (or surrogate pair) that starts at index in s.
        // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
        // when index does not fall inside s.
        public static int GetDecimalDigitValue(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index < 0 || index >= s.Length) {
                throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
            }
            Contract.EndContractBlock();

            int codePoint = InternalConvertToUtf32(s, index);
            return InternalGetDecimalDigitValue(codePoint);
        }
 
        ////////////////////////////////////////////////////////////////////////
        //
        //Action: Returns the digit value associated with a character.
        // If the character does not have a digit value, the return value is -1.
        // From Unicode.org: If the character represents a digit, not necessarily a decimal digit,
        // the value is here. This covers digits which do not form decimal radix forms, such as the compatibility superscript digits.
        //
        // An example is: U+2460 CIRCLED DIGIT ONE. This character has digit value 1, but does not have an associated decimal digit value.
        //
        //Returns:
        //  the digit value for the specified Unicode character.  If the character does not have a digit value, the return value is -1.
        //Arguments:
        //      ch  a Unicode character
        //      (the String overload takes a string s and an index into it instead)
        //Exceptions (String overload only):
        //      ArgumentNullException        s is null
        //      ArgumentOutOfRangeException  index is outside of s
        //
        ////////////////////////////////////////////////////////////////////////


        public static int GetDigitValue(char ch) {
            // The table stores the value as a signed byte; it widens to int on return.
            sbyte digit = InternalGetDigitValue(ch);
            return digit;
        }


        // Digit value of the character (or surrogate pair) that starts at index in s.
        // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
        // when index does not fall inside s.
        public static int GetDigitValue(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index < 0 || index >= s.Length) {
                throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
            }
            Contract.EndContractBlock();

            int codePoint = InternalConvertToUtf32(s, index);
            return InternalGetDigitValue(codePoint);
        }
 
        // Unicode category of a single char.  No surrogate pairing is possible here, so a
        // surrogate half is looked up under its own value.
        [System.Security.SecuritySafeCritical]  // auto-generated
        public static UnicodeCategory GetUnicodeCategory(char ch)
        {
            int codePoint = ch;
            return InternalGetUnicodeCategory(codePoint);
        }

        // Unicode category of the character (or surrogate pair) that starts at index in s.
        // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
        // when index does not fall inside s.
        [System.Security.SecuritySafeCritical]  // auto-generated
        public static UnicodeCategory GetUnicodeCategory(String s, int index)
        {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index < 0 || index >= s.Length) {
                throw new ArgumentOutOfRangeException("index");
            }
            Contract.EndContractBlock();

            return InternalGetUnicodeCategory(s, index);
        }
 
        // Unicode category of the code point ch: the first byte (UNICODE_CATEGORY_OFFSET) of
        // its category table entry, reinterpreted as a UnicodeCategory.
        internal unsafe static UnicodeCategory InternalGetUnicodeCategory(int ch) {
            byte rawCategory = InternalGetCategoryValue(ch, UNICODE_CATEGORY_OFFSET);
            return (UnicodeCategory)rawCategory;
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //Action: Returns one byte of the category table entry for a code point.
        //Returns:
        //  the raw byte stored at the given offset of the entry; callers cast it
        //  to UnicodeCategory or BidiCategory.
        //Arguments:
        //  ch      a Unicode code point (0 ~ 0x10ffff)
        //  offset  UNICODE_CATEGORY_OFFSET or BIDI_CATEGORY_OFFSET
        //Exceptions:
        //  None
        //
        //Note that this API will return values for D800-DFFF surrogate halves.
        //
        ////////////////////////////////////////////////////////////////////////

        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static byte InternalGetCategoryValue(int ch, int offset) {
            Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
            // Level 1: the bits of ch above bit 7 select the base of a level 2 block.
            ushort level2Base = s_pCategoryLevel1Index[ch >> 8];
            // Level 2: bits 4 - 7 of ch select a WORD offset, which is the base of a level 3 block.
            // Note that & has lower precedence than addition, so the parentheses are required.
            ushort level3Base = s_pCategoryLevel1Index[level2Base + ((ch >> 4) & 0x000f)];
            // Level 3 entries are single bytes; bits 0 - 3 of ch select the entry.
            byte* pLevel3 = (byte*)(s_pCategoryLevel1Index + level3Base);
            byte valueIndex = pLevel3[ch & 0x000f];
            // Every value entry is two bytes wide; offset picks the byte inside the entry.
            byte rawValue = s_pCategoriesValue[(valueIndex * 2) + offset];
            //
            // Make sure that OtherNotAssigned is the last category in UnicodeCategory.
            // If that changes, change the following (disabled) assertion as well.
            //
            //Contract.Assert(uc >= 0 && uc <= UnicodeCategory.OtherNotAssigned, "Table returns incorrect Unicode category");
            return rawValue;
        }
 
//      internal static BidiCategory GetBidiCategory(char ch) { 
//          return ((BidiCategory)InternalGetCategoryValue(c, BIDI_CATEGORY_OFFSET));
//      } 

        // Bidi category of the character (or surrogate pair) that starts at index in s.
        // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
        // when index does not fall inside s.
        internal static BidiCategory GetBidiCategory(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index < 0 || index >= s.Length) {
                throw new ArgumentOutOfRangeException("index");
            }
            Contract.EndContractBlock();

            int codePoint = InternalConvertToUtf32(s, index);
            byte rawBidi = InternalGetCategoryValue(codePoint, BIDI_CATEGORY_OFFSET);
            return (BidiCategory)rawBidi;
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //Action: Returns the Unicode Category property for the character (or
        //  surrogate pair) that starts at index in value.
        //Returns:
        //  a value in the UnicodeCategory enum
        //Arguments:
        //  value  a Unicode String; must not be null
        //  index  Index into value; must satisfy 0 <= index < value.Length
        //Exceptions:
        //  None are thrown deliberately; the arguments are only checked by
        //  assertions, so callers are expected to validate them first.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static UnicodeCategory InternalGetUnicodeCategory(String value, int index) {
            Contract.Assert(value != null, "value can not be null");
            // Check the lower bound too, matching the out-charLength overload and InternalConvertToUtf32.
            Contract.Assert(index >= 0 && index < value.Length, "index >= 0 && index < value.Length");

            return (InternalGetUnicodeCategory(InternalConvertToUtf32(value, index)));
        }
 
        ////////////////////////////////////////////////////////////////////////
        //
        // Get the Unicode category of the character starting at index.  charLength receives the
        // number of chars that make up the character: 1 for a single char (including an unpaired
        // surrogate), 2 for a valid surrogate pair.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static UnicodeCategory InternalGetUnicodeCategory(String str, int index, out int charLength) {
            Contract.Assert(str != null, "str can not be null");
            Contract.Assert(str.Length > 0, "str.Length > 0");
            Contract.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");

            int codePoint = InternalConvertToUtf32(str, index, out charLength);
            return InternalGetUnicodeCategory(codePoint);
        }

        // True when uc is one of the three mark categories: NonSpacingMark,
        // SpacingCombiningMark or EnclosingMark.
        internal static bool IsCombiningCategory(UnicodeCategory uc) {
            Contract.Assert(uc >= 0, "uc >= 0");
            switch (uc) {
                case UnicodeCategory.NonSpacingMark:
                case UnicodeCategory.SpacingCombiningMark:
                case UnicodeCategory.EnclosingMark:
                    return true;
                default:
                    return false;
            }
        }

        // Implemented by the runtime (InternalCall); there is no managed body.
        // The static constructor passes the base pointer of the data table so that the
        // native side's table pointer gets initialized as well.
        [System.Security.SecurityCritical]  // auto-generated
        [ResourceExposure(ResourceScope.Process)]
        [MethodImplAttribute(MethodImplOptions.InternalCall)] 
        private unsafe static extern void nativeInitTable(byte* bytePtr);
 
    } 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
// ==++== 
//
//   Copyright (c) Microsoft Corporation.  All rights reserved.
//
// ==--== 
////////////////////////////////////////////////////////////////////////////
// 
//  Class:    CharUnicodeInfo
//
//  Purpose:  This class implements a set of methods for retrieving 
//            character type information.  Character type information is
//            independent of culture and region.
//
//  Date:     August 12, 1998 
//
//////////////////////////////////////////////////////////////////////////// 
 
namespace System.Globalization {
 
    //This class has only static members and therefore doesn't need to be serialized.

    using System;
    using System.Threading; 
    using System.Runtime.InteropServices;
    using System.Runtime.CompilerServices; 
    using System.Runtime.Versioning; 
    using System.Reflection;
    using System.Diagnostics.Contracts; 


    public static class CharUnicodeInfo
    { 
        //--------------------------------------------------------------------//
        //                        Internal Information                        // 
        //-------------------------------------------------------------------// 

        // 
        // Constants and cached native pointers used to access the Unicode category data tables in charinfo.nlp.
        //
        // Inclusive bounds of the UTF-16 high (lead) and low (trail) surrogate ranges.
        internal const char  HIGH_SURROGATE_START  = '\ud800';
        internal const char  HIGH_SURROGATE_END    = '\udbff';
        internal const char  LOW_SURROGATE_START   = '\udc00';
        internal const char  LOW_SURROGATE_END     = '\udfff';

        // Byte position, inside one entry of s_pCategoriesValue, of the value to read
        // (presumably 0 = Unicode category, 1 = bidi category, going by the names).
        internal const int UNICODE_CATEGORY_OFFSET = 0;
        internal const int BIDI_CATEGORY_OFFSET = 1;

        // The base pointer of the data table.
        unsafe static byte* s_pDataTable;

        // The native pointer to the 12:4:4 index table of the Unicode category data,
        // and the value table that the index table's final level points into.
        unsafe static ushort* s_pCategoryLevel1Index;
        unsafe static byte* s_pCategoriesValue;

        // The native pointer to the 12:4:4 index table of the Unicode numeric data.
        // The value of this index table is an index into the real value table stored in s_pNumericValues.
        unsafe static ushort* s_pNumericLevel1Index;

        // The numeric value table, which is indexed through s_pNumericLevel1Index.
        // Every item contains the numeric value (a double) of a character.
        // It would naturally be declared as:
        // unsafe static double* s_pNumericValues;
        // but it is kept as byte* to get around the IA64 alignment issue.  Our double data is aligned on an 8-byte
        // boundary, but the loader loads the embedded table starting at a 4-byte boundary.  This causes an alignment
        // issue since double is 8 bytes.
        unsafe static byte* s_pNumericValues;

        // The digit value table, which is indexed through s_pNumericLevel1Index.  It shares the same indices as s_pNumericValues.
        // Every item contains the decimal digit value and the digit value of a character.
        unsafe static DigitValues* s_pDigitValues;

        // Name of the resource that holds the data table.
        internal const String UNICODE_INFO_FILE_NAME = "charinfo.nlp";
        // The starting codepoint for Unicode plane 1.  Plane 1 contains 0x010000 ~ 0x01ffff.
        internal const int UNICODE_PLANE01_START = 0x10000;
 

        //
        // This is the header for the native data table that we load from UNICODE_INFO_FILE_NAME.
        // 
        // Explicit layout is used here since a syntax like char[16] cannot be used in sequential layout.
        // Overlay for the header found at the very start of the data table.  The offset fields
        // are byte offsets that the static constructor adds to the table's base pointer.
        [StructLayout(LayoutKind.Explicit)]
        internal unsafe struct UnicodeDataHeader {
            // First character of the table name.  The native field is 16 WCHARs (0x20 bytes) wide.
            [FieldOffset(0)]
            internal char TableName;    // WCHAR[16]
            // First WORD of the version.  The native field is 4 WORDs (8 bytes) wide.
            [FieldOffset(0x20)]
            internal ushort version;    // WORD[4]
            // Offset of the category index table (s_pCategoryLevel1Index).
            [FieldOffset(0x28)]
            internal uint OffsetToCategoriesIndex; // DWORD
            // Offset of the category value table (s_pCategoriesValue).
            [FieldOffset(0x2c)]
            internal uint OffsetToCategoriesValue; // DWORD
            // Offset of the numeric index table (s_pNumericLevel1Index).
            // NOTE: "Numberic" is a historical misspelling; the name is referenced by the static constructor.
            [FieldOffset(0x30)]
            internal uint OffsetToNumbericIndex; // DWORD
            // Offset of the digit value table (s_pDigitValues).
            [FieldOffset(0x34)]
            internal uint OffsetToDigitValue; // DWORD
            // Offset of the numeric value table (s_pNumericValues).
            [FieldOffset(0x38)]
            internal uint OffsetToNumbericValue; // DWORD

        }
 
        // NOTE: It's important to specify pack size here, since the size of the structure is 2 bytes.  Otherwise, 
        // the default pack size will be 4.
 
        // One entry of the digit value table (s_pDigitValues): two signed bytes per entry.
        [StructLayout(LayoutKind.Sequential, Pack=2)]
        internal struct DigitValues {
            // Decimal digit value of the character.
            internal sbyte decimalDigit;
            // Digit value of the character.
            internal sbyte digit;
        }
 
 
        // Type initializer.  Locates the data table once and caches a pointer to each of its
        // sub-tables, so that later lookups never have to look at the header again.
        //
        [System.Security.SecuritySafeCritical]  // auto-generated
        [ResourceExposure(ResourceScope.None)]
        [ResourceConsumption(ResourceScope.Process, ResourceScope.Process)]
        unsafe static CharUnicodeInfo() {
            byte* pTable = GlobalizationAssembly.GetGlobalizationResourceBytePtr(typeof(CharUnicodeInfo).Assembly, UNICODE_INFO_FILE_NAME);
            UnicodeDataHeader* pHeader = (UnicodeDataHeader*)pTable;

            s_pDataTable = pTable;

            // Each sub-table is located by a byte offset, stored in the header, from the start of the table.
            s_pCategoryLevel1Index = (ushort*)(pTable + pHeader->OffsetToCategoriesIndex);
            s_pCategoriesValue = pTable + pHeader->OffsetToCategoriesValue;
            s_pNumericLevel1Index = (ushort*)(pTable + pHeader->OffsetToNumbericIndex);
            s_pNumericValues = pTable + pHeader->OffsetToNumbericValue;
            s_pDigitValues = (DigitValues*)(pTable + pHeader->OffsetToDigitValue);

            // Go to the native side so that its CharacterInfoTable pointer is initialized from the same table.
            nativeInitTable(pTable);
        }


        ////////////////////////////////////////////////////////////////////////
        //
        // Actions:
        // Convert the BMP character or surrogate pair starting at index to a UTF32 value.
        // This is similar to Char.ConvertToUtf32, but the difference is that
        // it does not throw exceptions when invalid surrogate characters are passed in:
        // a char that does not start a valid pair is returned as its own value.
        //
        // WARNING: since it doesn't throw an exception it CAN return a value
        //          in the surrogate range D800-DFFF, which are not legal unicode values.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static int InternalConvertToUtf32(String s, int index) {
            Contract.Assert(s != null, "s != null");
            Contract.Assert(index >= 0 && index < s.Length, "index < s.Length");

            char lead = s[index];
            // A pair needs a following char and a high surrogate in the lead position.
            if (index >= s.Length - 1 || lead < HIGH_SURROGATE_START || lead > HIGH_SURROGATE_END) {
                return (int)lead;
            }

            char trail = s[index + 1];
            if (trail < LOW_SURROGATE_START || trail > LOW_SURROGATE_END) {
                // High surrogate without a matching low surrogate: hand back the lone code unit.
                return (int)lead;
            }

            // Combine the 10 payload bits of each half and rebase onto plane 1.
            int highBits = lead - HIGH_SURROGATE_START;
            int lowBits = trail - LOW_SURROGATE_START;
            return (highBits * 0x400) + lowBits + UNICODE_PLANE01_START;
        }
 
        ////////////////////////////////////////////////////////////////////////
        // 
        // Convert a character or a surrogate pair starting at index of string s 
        // to UTF32 value.
        // 
        //  Parameters:
        //      s       The string
        //      index   The starting index.  It can point to a BMP character or
        //              a surrogate pair.  It must be within the string
        //              (0 <= index < s.Length).
        //      charLength  [out]   If the index points to a BMP char, charLength 
        //              will be 1.  If the index points to a surrogate pair, 
        //              charLength will be 2.
        // 
        // WARNING: since it doesn't throw an exception it CAN return a value
        //          in the surrogate range D800-DFFF, which are not legal unicode values.
        //
        //  Returns: 
        //      The UTF32 value
        // 
        //////////////////////////////////////////////////////////////////////// 

        internal static int InternalConvertToUtf32(String s, int index, out int charLength) {
            Contract.Assert(s != null, "s != null");
            Contract.Assert(s.Length > 0, "s.Length > 0");
            Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

            // Assume a single UTF-16 code unit until a well-formed surrogate pair is seen.
            charLength = 1;

            // A pair needs a following char, so only look ahead when one exists.
            if (index < s.Length - 1) {
                int lead = s[index] - HIGH_SURROGATE_START;
                int trail = s[index + 1] - LOW_SURROGATE_START;
                bool leadIsHighSurrogate = lead >= 0 && lead <= 0x3ff;
                bool trailIsLowSurrogate = trail >= 0 && trail <= 0x3ff;

                if (leadIsHighSurrogate && trailIsLowSurrogate) {
                    // Both halves are valid: the character occupies two chars.
                    charLength = 2;
                    return (lead * 0x400) + trail + UNICODE_PLANE01_START;
                }
            }

            // BMP character or an unpaired surrogate: return the code unit as-is.
            return s[index];
        }
 
        ////////////////////////////////////////////////////////////////////////
        //
        //  IsWhiteSpace
        // 
        //  Determines if the given character is a white space character.
        // 
        //////////////////////////////////////////////////////////////////////// 

        internal static bool IsWhiteSpace(String s, int index)
        {
            Contract.Assert(s != null, "s!=null");
            Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

            // White space here means exactly the three Unicode separator categories.
            // In Unicode 3.0, U+2028 is the only character in the category "LineSeparator",
            // and U+2029 is the only character in the category "ParagraphSeparator".
            UnicodeCategory category = GetUnicodeCategory(s, index);
            return category == UnicodeCategory.SpaceSeparator
                || category == UnicodeCategory.LineSeparator
                || category == UnicodeCategory.ParagraphSeparator;
        }
 

        internal static bool IsWhiteSpace(char c)
        {
            // A char counts as white space when its category is one of the three separator categories.
            // In Unicode 3.0, U+2028 is the only character in the category "LineSeparator",
            // and U+2029 is the only character in the category "ParagraphSeparator".
            switch (GetUnicodeCategory(c)) {
                case UnicodeCategory.SpaceSeparator:
                case UnicodeCategory.LineSeparator:
                case UnicodeCategory.ParagraphSeparator:
                    return true;
                default:
                    return false;
            }
        }

        // 
        // This is called by the public char and string, index versions 
        //
        // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character 
        //
        // Looks up the numeric value of the code point ch by walking the numeric index table
        // (two ushort lookups followed by one byte lookup) and using the resulting byte as an
        // index into s_pNumericValues viewed as an array of doubles.
        // ch is expected to be in 0..0x10ffff; this is only checked by the assertion below.
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static double InternalGetNumericValue(int ch) {
            Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range."); 
            // Get the level 2 item from the bits of ch above the low 8 bits (ch >> 8).
            ushort index = s_pNumericLevel1Index[ch >> 8]; 
            // Get the level 2 WORD offset from the 4 - 7 bit of ch.  This provides the base offset of the level 3 table. 
            // The byte read from the level 3 table is an index of a double item in s_pNumericValues.
            // Note that & has lower precedence than addition, so don't forget the parentheses. 
            index = s_pNumericLevel1Index[index + ((ch >> 4) & 0x000f)];
            // The level 3 table holds byte entries, so view the ushort slot at that offset as a byte pointer.
            byte* pBytePtr = (byte*)&(s_pNumericLevel1Index[index]);
            // Get the result from the 0 - 3 bit of ch.
#if WIN64 
            // To get around the IA64 alignment issue.  Our double data is aligned on an 8-byte boundary, but the loader loads the embedded table starting
            // at a 4-byte boundary.  This causes an alignment issue since double is 8 bytes. 
            byte* pSourcePtr = &(s_pNumericValues[pBytePtr[(ch & 0x000f)] * sizeof(double)]); 
            if (((long)pSourcePtr % 8) != 0) {
                // We are not aligned on an 8-byte boundary.  Copy the bytes into a local double instead of reading in place. 
                double ret;
                byte* retPtr = (byte*)&ret;
                Buffer.memcpyimpl(pSourcePtr, retPtr, sizeof(double));
                return (ret); 
            }
            // Aligned: read the double directly from the table.
            return (((double*)s_pNumericValues)[pBytePtr[(ch & 0x000f)]]); 
#else 
            // No alignment workaround in this build: read the double directly from the table.
            return (((double*)s_pNumericValues)[pBytePtr[(ch & 0x000f)]]);
#endif 
        }

        //
        // This is called by the public char and string, index versions 
        //
        // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character 
        // 
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static DigitValues* InternalGetDigitValues(int ch) {
            Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

            // Split the code point into the three keys used by the index tables.
            int upperBits = ch >> 8;                // everything above the low 8 bits
            int middleNibble = (ch >> 4) & 0x000f;  // bits 4 - 7
            int lowerNibble = ch & 0x000f;          // bits 0 - 3

            // First lookup: base offset (in WORDs) of the level 2 table for this code point.
            int level2Base = s_pNumericLevel1Index[upperBits];
            // Second lookup: offset (in WORDs) of the level 3 table.
            int level3Base = s_pNumericLevel1Index[level2Base + middleNibble];
            // The level 3 table is made of bytes, so address it through a byte pointer.
            byte* level3Table = (byte*)(s_pNumericLevel1Index + level3Base);

            // The level 3 byte selects the entry in the digit value table.
            return s_pDigitValues + level3Table[lowerNibble];
        }

 
        // Returns the decimalDigit field of the digit table entry found for the code point ch.
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static sbyte InternalGetDecimalDigitValue(int ch) {
            DigitValues* entry = InternalGetDigitValues(ch);
            return entry->decimalDigit;
        }
 
        // Returns the digit field of the digit table entry found for the code point ch.
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static sbyte InternalGetDigitValue(int ch) {
            DigitValues* entry = InternalGetDigitValues(ch);
            return entry->digit;
        }

 
        //////////////////////////////////////////////////////////////////////// 
        //
        //Returns the numeric value associated with the character c. If the character is a fraction, 
        // the return value will not be an integer. If the character does not have a numeric value, the return value is -1.
        //
        //Returns:
        //  the numeric value for the specified Unicode character.  If the character does not have a numeric value, the return value is -1. 
        //Arguments:
        //      ch  a Unicode character 
        //Exceptions: 
        //      ArgumentNullException
        //      ArgumentOutOfRangeException 
        //
        ////////////////////////////////////////////////////////////////////////

 
        [System.Security.SecuritySafeCritical]  // auto-generated
        public static double GetNumericValue(char ch) {
            // A single char is looked up directly by its UTF-16 code unit value.
            int codePoint = ch;
            return InternalGetNumericValue(codePoint);
        }
 

        [System.Security.SecuritySafeCritical]  // auto-generated
        public static double GetNumericValue(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index >= s.Length || index < 0) {
                throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
            }
            Contract.EndContractBlock();

            // Resolve the char (or surrogate pair) at index to one code point, then look it up.
            int codePoint = InternalConvertToUtf32(s, index);
            return InternalGetNumericValue(codePoint);
        }

        //////////////////////////////////////////////////////////////////////// 
        // 
        //Returns the decimal digit value associated with the character c.
        // 
        // The value should be from 0 ~ 9.
        // If the character does not have a numeric value, the return value is -1.
        // From Unicode.org: Decimal Digits. Digits that can be used to form decimal-radix numbers.
        //Returns: 
        //  the decimal digit value for the specified Unicode character.  If the character does not have a decimal digit value, the return value is -1.
        //Arguments: 
        //      ch  a Unicode character 
        //Exceptions:
        //      ArgumentNullException 
        //      ArgumentOutOfRangeException
        //
        ////////////////////////////////////////////////////////////////////////
 

        public static int GetDecimalDigitValue(char ch) {
            // The internal lookup yields an sbyte; widen it to the public int return type.
            int decimalDigit = InternalGetDecimalDigitValue(ch);
            return decimalDigit;
        }
 

        public static int GetDecimalDigitValue(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index >= s.Length || index < 0) {
                throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
            }
            Contract.EndContractBlock();

            // Resolve the char (or surrogate pair) at index to one code point, then look it up.
            int codePoint = InternalConvertToUtf32(s, index);
            return InternalGetDecimalDigitValue(codePoint);
        }
 
        ////////////////////////////////////////////////////////////////////////
        // 
        //Action: Returns the digit value associated with the character c. 
        // If the character does not have a numeric value, the return value is -1.
        // From Unicode.org: If the character represents a digit, not necessarily a decimal digit, 
        // the value is here. This covers digits which do not form decimal radix forms, such as the compatibility superscript digits.
        //
        // An example is: U+2460 CIRCLED DIGIT ONE. This character has digit value 1, but does not have an associated decimal digit value.
        // 
        //Returns:
        //  the digit value for the specified Unicode character.  If the character does not have a digit value, the return value is -1. 
        //Arguments: 
        //      ch  a Unicode character
        //Exceptions: 
        //      ArgumentNullException
        //      ArgumentOutOfRangeException
        //
        //////////////////////////////////////////////////////////////////////// 

 
        public static int GetDigitValue(char ch) {
            // The internal lookup yields an sbyte; widen it to the public int return type.
            int digit = InternalGetDigitValue(ch);
            return digit;
        }


        public static int GetDigitValue(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            if (index >= s.Length || index < 0) {
                throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
            }
            Contract.EndContractBlock();

            // Resolve the char (or surrogate pair) at index to one code point, then look it up.
            int codePoint = InternalConvertToUtf32(s, index);
            return InternalGetDigitValue(codePoint);
        }
 
        [System.Security.SecuritySafeCritical]  // auto-generated
        public static UnicodeCategory GetUnicodeCategory(char ch)
        {
            // A single char is looked up directly by its UTF-16 code unit value.
            int codePoint = ch;
            return InternalGetUnicodeCategory(codePoint);
        }

        [System.Security.SecuritySafeCritical]  // auto-generated
        public static UnicodeCategory GetUnicodeCategory(String s, int index)
        {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            // A single unsigned comparison rejects both negative and too-large indexes.
            if ((uint)index >= (uint)s.Length) {
                throw new ArgumentOutOfRangeException("index");
            }
            Contract.EndContractBlock();

            UnicodeCategory category = InternalGetUnicodeCategory(s, index);
            return category;
        }
 
        internal unsafe static UnicodeCategory InternalGetUnicodeCategory(int ch) {
            // Read the category byte stored at the Unicode-category offset and expose it as the enum.
            byte rawCategory = InternalGetCategoryValue(ch, UNICODE_CATEGORY_OFFSET);
            return (UnicodeCategory)rawCategory;
        }

        //////////////////////////////////////////////////////////////////////// 
        //
        //Action: Returns the Unicode Category property for the character c.
        //Returns:
        //  a value in UnicodeCategory enum
        //Arguments:
        //  ch  a Unicode character 
        //Exceptions: 
        //  None
        // 
        //Note that this API will return values for D800-DFFF surrogate halves.
        //
        ////////////////////////////////////////////////////////////////////////
 
        [System.Security.SecuritySafeCritical]  // auto-generated
        internal unsafe static byte InternalGetCategoryValue(int ch, int offset) {
            Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

            // Split the code point into the three keys used by the index tables.
            int upperBits = ch >> 8;                // everything above the low 8 bits
            int middleNibble = (ch >> 4) & 0x000f;  // bits 4 - 7
            int lowerNibble = ch & 0x000f;          // bits 0 - 3

            // First lookup: base offset (in WORDs) of the level 2 table for this code point.
            int level2Base = s_pCategoryLevel1Index[upperBits];
            // Second lookup: offset (in WORDs) of the level 3 table.
            int level3Base = s_pCategoryLevel1Index[level2Base + middleNibble];
            // The level 3 table is made of bytes, so address it through a byte pointer.
            byte* level3Table = (byte*)(s_pCategoryLevel1Index + level3Base);

            // Each entry of the value table is a pair of bytes; offset picks the byte within the pair.
            int valueIndex = level3Table[lowerNibble];
            byte categoryValue = s_pCategoriesValue[valueIndex * 2 + offset];

            //
            // Make sure that OtherNotAssigned is the last category in UnicodeCategory.
            // If that changes, change the following (currently disabled) assertion as well.
            //
            //Contract.Assert(uc >= 0 && uc <= UnicodeCategory.OtherNotAssigned, "Table returns incorrect Unicode category");
            return categoryValue;
        }
 
//      internal static BidiCategory GetBidiCategory(char ch) { 
//          return ((BidiCategory)InternalGetCategoryValue(c, BIDI_CATEGORY_OFFSET));
//      } 

        internal static BidiCategory GetBidiCategory(String s, int index) {
            if (s == null) {
                throw new ArgumentNullException("s");
            }
            // A single unsigned comparison rejects both negative and too-large indexes.
            if ((uint)index >= (uint)s.Length) {
                throw new ArgumentOutOfRangeException("index");
            }
            Contract.EndContractBlock();

            // Resolve the char (or surrogate pair) to a code point, then read the byte at the bidi offset.
            int codePoint = InternalConvertToUtf32(s, index);
            byte rawCategory = InternalGetCategoryValue(codePoint, BIDI_CATEGORY_OFFSET);
            return (BidiCategory)rawCategory;
        }

        ////////////////////////////////////////////////////////////////////////
        // 
        //Action: Returns the Unicode Category property for the character (or surrogate pair) starting at index in value.
        //Returns:
        //  a value in UnicodeCategory enum
        //Arguments:
        //  value  a Unicode String 
        //  index  Index for the specified string.
        //Exceptions:
        //  None
        // 
        ////////////////////////////////////////////////////////////////////////
 
        internal static UnicodeCategory InternalGetUnicodeCategory(String value, int index) {
            Contract.Assert(value != null, "value can not be null");
            Contract.Assert(index < value.Length, "index < value.Length");

            // Resolve the char (or surrogate pair) at index to one code point, then classify it.
            int codePoint = InternalConvertToUtf32(value, index);
            return InternalGetUnicodeCategory(codePoint);
        }
 
        ////////////////////////////////////////////////////////////////////////
        // 
        // Get the Unicode category of the character starting at index.  If the character is in BMP, charLength will return 1. 
        // If the character is a valid surrogate pair, charLength will return 2.
        // 
        ////////////////////////////////////////////////////////////////////////

        internal static UnicodeCategory InternalGetUnicodeCategory(String str, int index, out int charLength) {
            Contract.Assert(str != null, "str can not be null");
            Contract.Assert(str.Length > 0, "str.Length > 0");
            Contract.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");

            // The conversion reports through charLength how many chars (1 or 2) the character occupies.
            int codePoint = InternalConvertToUtf32(str, index, out charLength);
            return InternalGetUnicodeCategory(codePoint);
        }

        // Returns true when uc is one of the three mark categories
        // (NonSpacingMark, SpacingCombiningMark, EnclosingMark); false for every other category.
        internal static bool IsCombiningCategory(UnicodeCategory uc) {
            Contract.Assert(uc >= 0, "uc >= 0");
            switch (uc) {
                case UnicodeCategory.NonSpacingMark:
                case UnicodeCategory.SpacingCombiningMark:
                case UnicodeCategory.EnclosingMark:
                    return true;
                default:
                    return false;
            }
        }

        // Implemented inside the runtime (InternalCall); there is no managed body.
        // The table initialization code in this class calls it with the base pointer of the data table
        // so that the native side can set up its own pointer to the same table.
        // NOTE(review): what the native side does with the pointer is not visible from this file - confirm against the runtime source.
        [System.Security.SecurityCritical]  // auto-generated
        [ResourceExposure(ResourceScope.Process)]
        [MethodImplAttribute(MethodImplOptions.InternalCall)] 
        private unsafe static extern void nativeInitTable(byte* bytePtr);
 
    } 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
Link Menu

Network programming in C#, Network Programming in VB.NET, Network Programming in .NET
This book is available now!
Buy at Amazon US or
Buy at Amazon UK
CharUnicodeInfo.cs source code in C# .NET

Source code for the .NET framework in C#

Code:

Link Menu