Tokenizer.cs — source code from the .NET Framework base class library (BCL), in C#.

Code:

ndp / clr / src / BCL / System / Security / Util / Tokenizer.cs

                            // ==++== 
//
//   Copyright (c) Microsoft Corporation.  All rights reserved.
//
// ==--== 
/*============================================================
** 
** CLASS:    Tokenizer.cs 
**
** 
** PURPOSE:  Tokenize "Elementary XML", that is, XML without
**           attributes or DTDs, in other words, XML with
**           elements only.
** 
**
===========================================================*/ 
namespace System.Security.Util 
{
    using System.Text; 
    using System;
    using System.IO;
    using System.Globalization;
 
    internal sealed class Tokenizer
    { 
        // There are five externally knowable token types: bras, kets, 
        // slashes, cstrs, and equals.
 
        internal const byte bra = 0;
        internal const byte ket = 1;
        internal const byte slash = 2;
        internal const byte cstr = 3; 
        internal const byte equals = 4;
        internal const byte quest = 5; 
        internal const byte bang = 6; 
        internal const byte dash = 7;
 
        // these are the assorted text characters that the tokenizer must
        // understand to do its job

        internal const int intOpenBracket = (int) '<'; 
        internal const int intCloseBracket = (int) '>';
        internal const int intSlash = (int) '/'; 
        internal const int intEquals = (int) '='; 
        internal const int intQuote = (int) '\"';
        internal const int intQuest = (int) '?'; 
        internal const int intBang = (int) '!';
        internal const int intDash = (int) '-';
        internal const int intTab = (int) '\t';
        internal const int intCR = (int) '\r'; 
        internal const int intLF = (int) '\n';
        internal const int intSpace = (int) ' '; 
 
        // this tells us where we will be getting our input from
        // and what the encoding is (if any) 

        private enum TokenSource
        {
            UnicodeByteArray,  // this is little-endian unicode (there are other encodings) 
            UTF8ByteArray,
            ASCIIByteArray, 
            CharArray, 
            String,
            NestedStrings, 
            Other
        }

        // byte streams can come in 3 different flavors 

        internal enum ByteTokenEncoding 
        { 
            UnicodeTokens,  // this is little-endian unicode (there are other encodings)
            UTF8Tokens, 
            ByteTokens
        }

        public int  LineNo; 

        // these variables represent the input state of the of the tokenizer 
 
        private int _inProcessingTag;
        private byte[] _inBytes; 
        private char[] _inChars;
        private String _inString;
        private int _inIndex;
        private int _inSize; 
        private int _inSavedCharacter;
        private TokenSource _inTokenSource; 
        private ITokenReader _inTokenReader; 

        // these variables are used to build and deliver tokenizer output strings 

        private StringMaker _maker = null;

        private String[] _searchStrings; 
        private String[] _replaceStrings;
 
        private int _inNestedIndex; 
        private int _inNestedSize;
        private String _inNestedString; 


        //===============================================================
        // Constructor uses given ICharInputStream 
        //
 
        internal void BasicInitialization() 
        {
            LineNo = 1 ; 
            _inProcessingTag = 0;
            _inSavedCharacter = -1;
            _inIndex = 0;
            _inSize = 0; 
            _inNestedSize = 0;
            _inNestedIndex = 0; 
            _inTokenSource = TokenSource.Other; 
            _maker = System.SharedStatics.GetSharedStringMaker();
        } 

        public void Recycle()
        {
            System.SharedStatics.ReleaseSharedStringMaker(ref _maker); // will set _maker to null 
        }
 
        internal Tokenizer (String input) 
        {
            BasicInitialization(); 
            _inString = input;
            _inSize = input.Length;
            _inTokenSource = TokenSource.String;
        } 

        internal Tokenizer (String input, String[] searchStrings, String[] replaceStrings) 
        { 
            BasicInitialization();
            _inString = input; 
            _inSize = _inString.Length;
            _inTokenSource = TokenSource.NestedStrings;
            _searchStrings = searchStrings;
            _replaceStrings = replaceStrings; 

#if DEBUG 
            BCLDebug.Assert(searchStrings.Length == replaceStrings.Length, "different number of search/replace strings"); 
            BCLDebug.Assert(searchStrings.Length != 0, "no search replace strings, shouldn't be using this ctor");
 
            for (int istr=0; istr= 3 , "XML Slug too small");
                BCLDebug.Assert( str[0] == '{', "XML Slug doesn't start with '{'" ); 
                BCLDebug.Assert( str[str.Length-1] == '}', "XML Slug doesn't end with '}'" ); 

                str = replaceStrings[istr]; 
                BCLDebug.Assert( str != null, "XML Replacement null");
                BCLDebug.Assert( str.Length >= 1, "XML Replacement empty");
            }
#endif 
        }
 
        internal Tokenizer (byte[] array, ByteTokenEncoding encoding, int startIndex) 
           {
            BasicInitialization(); 
            _inBytes = array;
            _inSize = array.Length;
            _inIndex = startIndex;
 
            switch (encoding)
            { 
            case ByteTokenEncoding.UnicodeTokens: 
                _inTokenSource = TokenSource.UnicodeByteArray;
                break; 

            case ByteTokenEncoding.UTF8Tokens:
                _inTokenSource = TokenSource.UTF8ByteArray;
                break; 

            case ByteTokenEncoding.ByteTokens: 
                _inTokenSource = TokenSource.ASCIIByteArray; 
                break;
 
            default:
                throw new ArgumentException(String.Format(CultureInfo.CurrentCulture, Environment.GetResourceString("Arg_EnumIllegalVal"), (int)encoding));
            }
           } 

        internal Tokenizer (char[] array) 
        { 
            BasicInitialization();
            _inChars = array; 
            _inSize = array.Length;
            _inTokenSource = TokenSource.CharArray;
        }
 
        internal Tokenizer (StreamReader input)
        { 
            BasicInitialization(); 
            _inTokenReader = new StreamTokenReader(input);
        } 

        internal void ChangeFormat( System.Text.Encoding encoding )
        {
            if (encoding == null) 
            {
                return; 
            } 

            BCLDebug.Assert( _inSavedCharacter == -1, "There was a lookahead character at the stream change point, that means the parser is changing encodings too late" ); 

            switch (_inTokenSource)
            {
                case TokenSource.UnicodeByteArray: 
                case TokenSource.UTF8ByteArray:
                case TokenSource.ASCIIByteArray: 
                    // these are the ones we can change on the fly 

                    if (encoding == System.Text.Encoding.Unicode) 
                    {
                        _inTokenSource = TokenSource.UnicodeByteArray;
                        return;
                    } 

                    if (encoding == System.Text.Encoding.UTF8) 
                    { 
                        _inTokenSource = TokenSource.UTF8ByteArray;
                        return; 
                    }

                    if (encoding == System.Text.Encoding.ASCII)
                    { 
                        _inTokenSource = TokenSource.ASCIIByteArray;
                        return; 
                    } 
                    break;
 
                case TokenSource.String:
                case TokenSource.CharArray:
                case TokenSource.NestedStrings:
                    // these are already unicode and encoding changes are moot 
                    // they can't be further decoded
                    return; 
            } 

            // if we're here it means we don't know how to change 
            // to the desired encoding with the memory that we have
            // we'll have to do this the hard way -- that means
            // creating a suitable stream from what we've got
 
            // this is thankfully the rare case as UTF8 and unicode
            // dominate the scene 
 
            Stream stream = null;
 
            switch (_inTokenSource)
            {
                case TokenSource.UnicodeByteArray:
                case TokenSource.UTF8ByteArray: 
                case TokenSource.ASCIIByteArray:
                    stream = new MemoryStream(_inBytes, _inIndex, _inSize - _inIndex); 
                    break; 

                case TokenSource.CharArray: 
                case TokenSource.String:
                case TokenSource.NestedStrings:
                    BCLDebug.Assert(false, "attempting to change encoding on a non-changable source, should have been prevented earlier" );
                    return; 

                default: 
                    StreamTokenReader reader = _inTokenReader as StreamTokenReader; 

                    if (reader == null) 
                    {
                        BCLDebug.Assert(false, "A new input source type has been added to the Tokenizer but it doesn't support encoding changes");
                        return;
                    } 

                    stream = reader._in.BaseStream; 
 
                    BCLDebug.Assert( reader._in.CurrentEncoding != null, "Tokenizer's StreamReader does not have an encoding" );
 
                    String fakeReadString = new String(' ', reader.NumCharEncountered);
                    stream.Position = reader._in.CurrentEncoding.GetByteCount( fakeReadString );
                    break;
            } 

            BCLDebug.Assert(stream != null, "The XML stream with new encoding was not properly initialized for kind of input we had"); 
 
            // we now have an initialized memory stream based on whatever source we had before
            _inTokenReader = new StreamTokenReader( new StreamReader( stream, encoding ) ); 
            _inTokenSource = TokenSource.Other;
        }

        internal void GetTokens( TokenizerStream stream, int maxNum, bool endAfterKet ) 
        {
            while (maxNum == -1 || stream.GetTokenCount() < maxNum) 
            { 
                int i = -1;
                byte ch; 
                int cb = 0;
                bool inLiteral = false;
                bool inQuotedString = false;
 
                StringMaker m = _maker;
 
                m._outStringBuilder = null; 
                m._outIndex = 0;
 
            BEGINNING:

                if (_inSavedCharacter != -1)
                { 
                       i = _inSavedCharacter;
                       _inSavedCharacter = -1; 
                } 
                else switch (_inTokenSource)
                { 
                case TokenSource.UnicodeByteArray:
                    if (_inIndex + 1 >= _inSize)
                    {
                        stream.AddToken( -1 ); 
                        return;
                    } 
 
                     i = (int)((_inBytes[_inIndex+1]<<8) + _inBytes[_inIndex]);
                      _inIndex += 2; 
                    break;

                 case TokenSource.UTF8ByteArray:
                    if (_inIndex >= _inSize) 
                    {
                        stream.AddToken( -1 ); 
                        return; 
                    }
 
                    i = (int)(_inBytes[_inIndex++]);

                    // single byte -- case, early out as we're done already
                    if ((i & 0x80) == 0x00) 
                        break;
 
                    // to decode the lead byte switch on the high nibble 
                    // shifted down so the switch gets dense integers
                    switch ((i & 0xf0) >>4) 
                    {
                    case 0x8:   // 1000  (together these 4 make 10xxxxx)
                    case 0x9:   // 1001
                    case 0xa:   // 1010 
                    case 0xb:   // 1011
                        // trail byte is an error 
                        throw new XmlSyntaxException( LineNo ); 

                    case 0xc:   // 1100  (these two make 110xxxxx) 
                    case 0xd:   // 1101
                        // two byte encoding (1 trail byte)
                        i &= 0x1f;
                        cb = 2; 
                        break;
 
                    case 0xe:   // 1110 (this gets us 1110xxxx) 
                        // three byte encoding (2 trail bytes)
                        i &= 0x0f; 
                        cb = 3;
                        break;

                    case 0xf:   // 1111 (and finally 1111xxxx) 
                        // 4 byte encoding is an error
                        throw new XmlSyntaxException( LineNo ); 
                    } 

                    // at least one trail byte, fetch it 
                    if (_inIndex >= _inSize)
                        throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" ));

                    ch = _inBytes[_inIndex++]; 

                    // must be trail byte encoding 
                    if ((ch & 0xc0) != 0x80) 
                        throw new XmlSyntaxException( LineNo );
 
                    i = (i<<6) | (ch & 0x3f);

                    // done now if 2 byte encoding, otherwise go for 3
                    if (cb == 2) 
                        break;
 
                    if (_inIndex >= _inSize) 
                        throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" ));
 
                    ch = _inBytes[_inIndex++];

                    // must be trail byte encoding
                    if ((ch & 0xc0) != 0x80) 
                        throw new XmlSyntaxException( LineNo );
 
                    i = (i<<6) | (ch & 0x3f); 
                    break;
 
                case TokenSource.ASCIIByteArray:
                    if (_inIndex >= _inSize)
                    {
                        stream.AddToken( -1 ); 
                        return;
                    } 
 
                    i = (int)(_inBytes[_inIndex++]);
                    break; 

                case TokenSource.CharArray:
                    if (_inIndex >= _inSize)
                    { 
                        stream.AddToken( -1 );
                        return; 
                    } 

                     i = (int)(_inChars[_inIndex++]); 
                    break;

                case TokenSource.String:
                    if (_inIndex >= _inSize) 
                    {
                        stream.AddToken( -1 ); 
                        return; 
                    }
 
                    i = (int)(_inString[_inIndex++]);
                    break;

                case TokenSource.NestedStrings: 
                    if (_inNestedSize != 0)
                    { 
                        if (_inNestedIndex < _inNestedSize) 
                        {
                            i = _inNestedString[_inNestedIndex++]; 
                            break;
                        }

                        _inNestedSize = 0; 
                    }
 
                    if (_inIndex >= _inSize) 
                    {
                        stream.AddToken( -1 ); 
                        return;
                    }

                    i = (int)(_inString[_inIndex++]); 

                    if (i != '{') 
                        break; 

                    for (int istr=0; istr < _searchStrings.Length; istr++) 
                    {
                        if (0==String.Compare(_searchStrings[istr], 0, _inString, _inIndex-1, _searchStrings[istr].Length, StringComparison.Ordinal))
                        {
                            _inNestedString = _replaceStrings[istr]; 
                            _inNestedSize = _inNestedString.Length;
                            _inNestedIndex = 1; 
                            i = _inNestedString[0]; 
                            _inIndex += _searchStrings[istr].Length - 1;
                            break; 
                        }
                    }

                    break; 

                default: 
                    i = _inTokenReader.Read(); 
                    if (i == -1)
                    { 
                        stream.AddToken( -1 );
                        return;
                    }
                    break; 
                }
 
                if (!inLiteral) 
                {
                    switch (i) 
                    {
                    // skip whitespace
                    case intSpace:
                    case intTab: 
                    case intCR:
                        goto BEGINNING; 
 
                    // count linefeeds
                    case intLF: 
                        LineNo++;
                        goto BEGINNING;

                    case intOpenBracket: 
                        _inProcessingTag++;
                        stream.AddToken( bra ); 
                        continue; 

                    case intCloseBracket: 
                        _inProcessingTag--;
                        stream.AddToken( ket );
                        if (endAfterKet)
                            return; 
                        continue;
 
                    case intEquals: 
                        stream.AddToken( equals );
                        continue; 

                    case intSlash:
                        if (_inProcessingTag != 0)
                        { 
                            stream.AddToken( slash );
                            continue; 
                        } 
                        break;
 
                    case intQuest:
                        if (_inProcessingTag != 0)
                        {
                            stream.AddToken( quest ); 
                            continue;
                        } 
                        break; 

                    case intBang: 
                        if (_inProcessingTag != 0)
                        {
                            stream.AddToken( bang );
                            continue; 
                        }
                        break; 
 
                    case intDash:
                        if (_inProcessingTag != 0) 
                        {
                            stream.AddToken( dash );
                            continue;
                        } 
                        break;
 
                    case intQuote: 
                        inLiteral = true;
                        inQuotedString = true; 
                        goto BEGINNING;
                    }
                }
                else 
                   {
                    switch (i) 
                    { 
                    case intOpenBracket:
                        if (!inQuotedString) 
                        {
                            _inSavedCharacter = i;
                            stream.AddToken( cstr );
                            stream.AddString( this.GetStringToken() ); 
                            continue;
                        } 
                        break; 

                    case intCloseBracket: 
                    case intEquals:
                    case intSlash:
                        if (!inQuotedString && _inProcessingTag != 0)
                        { 
                            _inSavedCharacter = i;
                            stream.AddToken( cstr ); 
                            stream.AddString( this.GetStringToken() ); 
                            continue;
                        } 
                        break;

                    case intQuote:
                        if (inQuotedString) 
                        {
                            stream.AddToken( cstr ); 
                            stream.AddString( this.GetStringToken() ); 
                            continue;
                        } 
                        break;

                    case intTab:
                    case intCR: 
                    case intSpace:
                        if (!inQuotedString) 
                        { 
                            stream.AddToken( cstr );
                            stream.AddString( this.GetStringToken() ); 
                            continue;
                        }
                        break;
 
                    // count linefeeds
                    case intLF: 
                        LineNo++; 

                        if (!inQuotedString) 
                        {
                            stream.AddToken( cstr );
                            stream.AddString( this.GetStringToken() );
                            continue; 
                        }
                        break; 
                    } 
                }
 
                inLiteral = true;

                // add character  to the string
                if (m._outIndex < StringMaker.outMaxSize) 
                {
                    // easy case 
                    m._outChars[m._outIndex++] = (char)i; 
                }
                else 
                {
                    if (m._outStringBuilder == null)
                    {
                       // OK, first check if we have to init the StringBuilder 
                        m._outStringBuilder = new StringBuilder();
                    } 
 
                    // OK, copy from _outChars to _outStringBuilder
                    m._outStringBuilder.Append(m._outChars, 0, StringMaker.outMaxSize); 

                    // reset _outChars pointer
                    m._outChars[0] = (char)i;
                    m._outIndex = 1; 
                }
 
                goto BEGINNING; 
            }
        } 

        [Serializable]
        internal sealed class StringMaker
        { 
            String[] aStrings;
            uint cStringsMax; 
            uint cStringsUsed; 

            public StringBuilder _outStringBuilder; 
            public char[] _outChars;
            public int _outIndex;

            public const int outMaxSize = 512; 

            static uint HashString(String str) 
            { 
                uint hash = 0;
 
                int l = str.Length;

                // rotate in string character
                for (int i=0; i < l; i++) 
                {
                    hash = (hash << 3) ^ (uint)str[i] ^ (hash >> 29); 
                } 

                return hash; 
            }

            static uint HashCharArray(char[] a, int l)
            { 
                uint hash = 0;
 
                // rotate in a character 
                for (int i=0; i < l; i++)
                { 
                    hash = (hash << 3) ^ (uint)a[i] ^ (hash >> 29);
                }

                return hash; 
            }
 
            public StringMaker() 
            {
                cStringsMax = 2048; 
                cStringsUsed = 0;
                aStrings = new String[cStringsMax];
                _outChars = new char[outMaxSize];
            } 

            bool CompareStringAndChars(String str, char [] a, int l) 
            { 
                if (str.Length != l)
                    return false; 

                for (int i=0; i (cStringsMax / 4) * 3) 
                { 
                    // we need to rehash
 
                    uint cNewMax = cStringsMax * 2;
                    String [] aStringsNew = new String[cNewMax];

                    for (int i=0; i < cStringsMax;i++) 
                    {
                        if (aStrings[i] != null) 
                        { 
                            hash = HashString(aStrings[i]) % cNewMax;
 
                            while (aStringsNew[hash] != null)
                            {
                                // slot full, skip
                                if (++hash >= cNewMax) 
                                    hash = 0;
                            } 
 
                            aStringsNew[hash] = aStrings[i];
                        } 
                    }

                    // all done, cutover to the new hash table
                    cStringsMax = cNewMax; 
                    aStrings = aStringsNew;
                } 
 
                hash = HashCharArray(a, l) % cStringsMax;
 
                String str;

                while ((str = aStrings[hash]) != null)
                { 
                    if (CompareStringAndChars(str, a, l))
                        return str; 
 
                    if (++hash >= cStringsMax)
                        hash = 0; 
                }

                str = new String(a,0,l);
                aStrings[hash] = str; 
                cStringsUsed++;
 
                return str; 
            }
        } 

        //===============================================================
        //
        // 

        private String GetStringToken () 
        { 
            return _maker.MakeString();
        } 

        internal interface ITokenReader
        {
            int Read(); 
        }
 
        internal class StreamTokenReader : ITokenReader 
        {
 
            internal StreamReader _in;
            internal int _numCharRead;

            internal StreamTokenReader(StreamReader input) 
            {
                _in = input; 
                _numCharRead = 0; 
            }
 
            public virtual int Read()
            {
                int value = _in.Read();
                if (value != -1) 
                    _numCharRead++;
                return value; 
            } 

            internal int NumCharEncountered 
            {
                get
                {
                    return _numCharRead; 
                }
            } 
        } 
    }
 
    internal sealed class TokenizerShortBlock
    {
        internal short[] m_block = new short[16];
        internal TokenizerShortBlock m_next = null; 
    }
 
    internal sealed class TokenizerStringBlock 
    {
        internal String[] m_block = new String[16]; 
        internal TokenizerStringBlock m_next = null;
    }

 
    internal sealed class TokenizerStream
    { 
        private int m_countTokens; 

        private TokenizerShortBlock m_headTokens; 
        private TokenizerShortBlock m_lastTokens;
        private TokenizerShortBlock m_currentTokens;
        private int m_indexTokens;
 
        private TokenizerStringBlock m_headStrings;
        private TokenizerStringBlock m_currentStrings; 
        private int m_indexStrings; 

#if _DEBUG 
        private bool m_bLastWasCStr;
#endif

        internal TokenizerStream() 
        {
            m_countTokens = 0; 
            m_headTokens = new TokenizerShortBlock(); 
            m_headStrings = new TokenizerStringBlock();
            Reset(); 
        }

        internal void AddToken( short token )
        { 
            if (m_currentTokens.m_block.Length <= m_indexTokens)
            { 
                m_currentTokens.m_next = new TokenizerShortBlock(); 
                m_currentTokens = m_currentTokens.m_next;
                m_indexTokens = 0; 
            }

            m_countTokens++;
            m_currentTokens.m_block[m_indexTokens++] = token; 
        }
 
        internal void AddString( String str ) 
        {
            if (m_currentStrings.m_block.Length <= m_indexStrings) 
            {
                m_currentStrings.m_next = new TokenizerStringBlock();
                m_currentStrings = m_currentStrings.m_next;
                m_indexStrings = 0; 
            }
 
            m_currentStrings.m_block[m_indexStrings++] = str; 
        }
 
        internal void Reset()
        {
            m_lastTokens = null;
            m_currentTokens = m_headTokens; 
            m_currentStrings = m_headStrings;
            m_indexTokens = 0; 
            m_indexStrings = 0; 
#if _DEBUG
            m_bLastWasCStr = false; 
#endif
        }

        internal short GetNextFullToken() 
        {
            if (m_currentTokens.m_block.Length <= m_indexTokens) 
            { 
                m_lastTokens = m_currentTokens;
                m_currentTokens = m_currentTokens.m_next; 
                m_indexTokens = 0;
            }

            return m_currentTokens.m_block[m_indexTokens++]; 
        }
 
        internal short GetNextToken() 
        {
            short retval = (short)(GetNextFullToken() & 0x00FF); 
#if _DEBUG
            BCLDebug.Assert( !m_bLastWasCStr, "CStr token not followed by GetNextString()" );
            m_bLastWasCStr = (retval == Tokenizer.cstr);
#endif 
            return retval;
        } 
 
        internal String GetNextString()
        { 
            if (m_currentStrings.m_block.Length <= m_indexStrings)
            {
                m_currentStrings = m_currentStrings.m_next;
                m_indexStrings = 0; 
            }
#if _DEBUG 
            m_bLastWasCStr = false; 
#endif
            return m_currentStrings.m_block[m_indexStrings++]; 
        }

        internal void ThrowAwayNextString()
        { 
            GetNextString();
        } 
 
        internal void TagLastToken( short tag )
        { 
            if (m_indexTokens == 0)
                m_lastTokens.m_block[m_lastTokens.m_block.Length-1] = (short)((ushort)m_lastTokens.m_block[m_lastTokens.m_block.Length-1] | (ushort)tag);
            else
                m_currentTokens.m_block[m_indexTokens-1] = (short)((ushort)m_currentTokens.m_block[m_indexTokens-1] | (ushort)tag); 
        }
 
        internal int GetTokenCount() 
        {
            return m_countTokens; 
        }

        internal void GoToPosition( int position )
        { 
            Reset();
 
            for (int count = 0; count < position; ++count) 
            {
                if (GetNextToken() == Tokenizer.cstr) 
                    ThrowAwayNextString();
            }
        }
    } 
}


                        

Link Menu

"Network Programming in C#", "Network Programming in VB.NET", and "Network Programming in .NET".
This book is available now!
Buy at Amazon US or Amazon UK.