RegexBoyerMoore.cs source code in C# .NET

                        Code:
                         / 4.0 / 4.0 / DEVDIV_TFS / Dev10 / Releases / RTMRel / ndp / fx / src / Regex / System / Text / RegularExpressions / RegexBoyerMoore.cs / 1305376 / RegexBoyerMoore.cs
                        
                        
                            //------------------------------------------------------------------------------ 
// 
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// 
//----------------------------------------------------------------------------- 

// The RegexBoyerMoore object precomputes the Boyer-Moore 
// tables for fast string scanning. These tables allow 
// you to scan for the first occurance of a string within
// a large body of text without examining every character. 
// The performance of the heuristic depends on the actual
// string and the text being searched, but usually, the longer
// the string that is being searched for, the fewer characters
// need to be examined. 

namespace System.Text.RegularExpressions 
{ 

    using System.Collections; 
    using System.Diagnostics;
    using System.Globalization;

    internal sealed class RegexBoyerMoore { 
        internal int[] _positive;
        internal int[] _negativeASCII; 
        internal int[][] _negativeUnicode; 
        internal String _pattern;
        internal int _lowASCII; 
        internal int _highASCII;
        internal bool _rightToLeft;
        internal bool _caseInsensitive;
        internal CultureInfo _culture; 

        internal const int infinite = 0x7FFFFFFF; 
 
        /*
         * Constructs a Boyer-Moore state machine for searching for the string 
         * pattern. The string must not be zero-length.
         */
        internal RegexBoyerMoore(String pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) {
            /* 
             * Sorry,  you just can't use Boyer-Moore to find an empty pattern.
             * We're doing this for your own protection. (Really, for speed.) 
             */ 
            Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string.  This is bad for perf");
 
            int beforefirst;
            int last;
            int bump;
            int examine; 
            int scan;
            int match; 
            char ch; 

            // We do the ToLower character by character for consistency.  With surrogate chars, doing 
            // a ToLower on the entire string could actually change the surrogate pair.  This is more correct
            // linguistically, but since Regex doesn't support surrogates, it's more important to be
            // consistent.
            if (caseInsensitive) { 
                StringBuilder sb = new StringBuilder(pattern.Length);
                for (int i=0; i  This algorithm appears to be a simplified variant of the 
             *          standard Boyer-Moore good suffix calculation.  It could
             *          be one of D.M. Sunday's variations, but I have not found which one. 
             * 
             * <

 
*/
            _positive = new int[pattern.Length]; 
 
            examine = last;
            ch = pattern[examine]; 
            _positive[examine] = bump;
            examine -= bump;

            for (;;) { 
                // find an internal char (examine) that matches the tail
 
                for (;;) { 
                    if (examine == beforefirst)
                        goto OuterloopBreak; 
                    if (pattern[examine] == ch)
                        break;
                    examine -= bump;
                } 

                match = last; 
                scan = examine; 

                // find the length of the match 

                for (;;) {
                    if (scan == beforefirst || pattern[match] != pattern[scan]) {
                        // at the end of the match, note the difference in _positive 
                        // this is not the length of the match, but the distance from the internal match
                        // to the tail suffix. 
                        if (_positive[match] == 0) 
                            _positive[match] = match - scan;
 
                        // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));

                        break;
                    } 

                    scan -= bump; 
                    match -= bump; 
                }
 
                examine -= bump;
            }

            OuterloopBreak: 

            match = last - bump; 
 
            // scan for the chars for which there are no shifts that yield a different candidate
 
            /* 
             *  The inside of the if statement used to say
             *  "_positive[match] = last - beforefirst;"
             *  I've changed it to the below code.  This 
             *  is slightly less agressive in how much we skip, but at worst it
             *  should mean a little more work rather than skipping a potential 
             *  match. 
             * 
             */ 
            while (match != beforefirst) {
                if (_positive[match] == 0)
                    _positive[match] = bump;
 
                match -= bump;
            } 
 
            //System.Diagnostics.Debug.WriteLine("good suffix shift table:");
            //for (int i=0; i<_positive.Length; i++) 
            //    System.Diagnostics.Debug.WriteLine("\t_positive[" + i + "] = " + _positive[i]);


            /* 
             * PART II - the bad-character shift table
             * 
             * compute the negative requirement: 
             * if char "ch" is the reject character when testing position "i",
             * we can slide up by _negative[ch]; 
             * (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
             *
             * the lookup table is divided into ASCII and Unicode portions;
             * only those parts of the Unicode 16-bit code set that actually 
             * appear in the string are in the table. (Maximum size with
             * Unicode is 65K; ASCII only case is 512 bytes.) 
             */ 

            _negativeASCII = new int[128]; 

            for (int i = 0; i < 128; i++)
                _negativeASCII[i] = last - beforefirst;
 
            _lowASCII = 127;
            _highASCII = 0; 
 
            for (examine = last; examine != beforefirst; examine -= bump) {
                ch = pattern[examine]; 

                if (ch < 128) {
                    if (_lowASCII > ch)
                        _lowASCII = ch; 

                    if (_highASCII < ch) 
                        _highASCII = ch; 

                    if (_negativeASCII[ch] == last - beforefirst) 
                        _negativeASCII[ch] = last - examine;
                }
                else {
                    int i = ch >> 8; 
                    int j = ch & 0xFF;
 
                    if (_negativeUnicode == null) { 
                        _negativeUnicode = new int[256][];
                    } 

                    if (_negativeUnicode[i] == null) {
                        int[] newarray = new int[256];
 
                        for (int k = 0; k < 256; k++)
                            newarray[k] = last - beforefirst; 
 
                        if (i == 0) {
                            System.Array.Copy(_negativeASCII, newarray, 128); 
                            _negativeASCII = newarray;
                        }

                        _negativeUnicode[i] = newarray; 
                    }
 
                    if (_negativeUnicode[i][j] == last - beforefirst) 
                        _negativeUnicode[i][j] = last - examine;
                } 
            }
        }

        private bool MatchPattern(string text, int index) { 
                if (_caseInsensitive) {
                    if( text.Length - index < _pattern.Length) { 
                        return false; 
                    }
 
                    TextInfo textinfo = _culture.TextInfo;
                    for( int i = 0; i < _pattern.Length; i++) {
                        Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
                        if( textinfo.ToLower(text[index + i]) != _pattern[i]) { 
                            return false;
                        } 
                    } 
                    return true;
                } 
                else {
                    return(0 == String.CompareOrdinal(_pattern, 0, text, index, _pattern.Length));
                }
        } 

        /* 
         * When a regex is anchored, we can do a quick IsMatch test instead of a Scan 
         */
        internal bool IsMatch(String text, int index, int beglimit, int endlimit) { 

            if (!_rightToLeft) {
                if (index < beglimit || endlimit - index < _pattern.Length)
                    return false; 

                return MatchPattern(text, index); 
            } 
            else {
                if (index > endlimit || index - beglimit < _pattern.Length) 
                    return false;

                return MatchPattern(text, index - _pattern.Length);
            } 
        }
 
 
        /*
         * Scan uses the Boyer-Moore algorithm to find the first occurrance 
         * of the specified string within text, beginning at index, and
         * constrained within beglimit and endlimit.
         *
         * The direction and case-sensitivity of the match is determined 
         * by the arguments to the RegexBoyerMoore constructor.
         */ 
        internal int Scan(String text, int index, int beglimit, int endlimit) { 
            int test;
            int test2; 
            int match;
            int startmatch;
            int endmatch;
            int advance; 
            int defadv;
            int bump; 
            char chMatch; 
            char chTest;
            int[] unicodeLookup; 

            if (!_rightToLeft) {
                defadv = _pattern.Length;
                startmatch = _pattern.Length - 1; 
                endmatch = 0;
                test = index + defadv - 1; 
                bump = 1; 
            }
            else { 
                defadv = -_pattern.Length;
                startmatch = 0;
                endmatch = -defadv - 1;
                test = index + defadv; 
                bump = -1;
            } 
 
            chMatch = _pattern[startmatch];
 
            for (;;) {
                if (test >= endlimit || test < beglimit)
                    return -1;
 
                chTest = text[test];
 
                if (_caseInsensitive) 
                    chTest = Char.ToLower(chTest, _culture);
 
                if (chTest != chMatch) {
                    if (chTest < 128)
                        advance = _negativeASCII[chTest];
                    else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 
                        advance = unicodeLookup[chTest & 0xFF];
                    else 
                        advance = defadv; 

                    test += advance; 
                }
                else { // if (chTest == chMatch)
                    test2 = test;
                    match = startmatch; 

                    for (;;) { 
                        if (match == endmatch) 
                            return(_rightToLeft ? test2 + 1 : test2);
 
                        match -= bump;
                        test2 -= bump;

                        chTest = text[test2]; 

                        if (_caseInsensitive) 
                            chTest = Char.ToLower(chTest, _culture); 

                        if (chTest != _pattern[match]) { 
                            advance = _positive[match];
                            if ((chTest & 0xFF80) == 0)
                                test2 = (match - startmatch) + _negativeASCII[chTest];
                            else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 
                                test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF];
                            else { 
                                test += advance; 
                                break;
                            } 

                            if (_rightToLeft ? test2 < advance : test2 > advance)
                                advance = test2;
 
                            test += advance;
                            break; 
                        } 
                    }
                } 
            }
        }

        /* 
         * Used when dumping for debugging.
         */ 
        public override String ToString() { 
            return _pattern;
        } 

#if DBG
        public String Dump(String indent) {
            StringBuilder sb = new StringBuilder(); 

            sb.Append(indent + "BM Pattern: " + _pattern + "\n"); 
            sb.Append(indent + "Positive: "); 
            for (int i = 0; i < _positive.Length; i++) {
                sb.Append(_positive[i].ToString(CultureInfo.InvariantCulture) + " "); 
            }
            sb.Append("\n");

            if (_negativeASCII != null) { 
                sb.Append(indent + "Negative table\n");
                for (int i = 0; i < _negativeASCII.Length; i++) { 
                    if (_negativeASCII[i] != _pattern.Length) { 
                        sb.Append(indent + "  " + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + _negativeASCII[i].ToString(CultureInfo.InvariantCulture) + "\n");
                    } 
                }
            }

            return sb.ToString(); 
        }
#endif 
    } 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
//------------------------------------------------------------------------------ 
// 
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// 
//----------------------------------------------------------------------------- 

// The RegexBoyerMoore object precomputes the Boyer-Moore 
// tables for fast string scanning. These tables allow 
// you to scan for the first occurance of a string within
// a large body of text without examining every character. 
// The performance of the heuristic depends on the actual
// string and the text being searched, but usually, the longer
// the string that is being searched for, the fewer characters
// need to be examined. 

namespace System.Text.RegularExpressions 
{ 

    using System.Collections; 
    using System.Diagnostics;
    using System.Globalization;

    internal sealed class RegexBoyerMoore { 
        internal int[] _positive;
        internal int[] _negativeASCII; 
        internal int[][] _negativeUnicode; 
        internal String _pattern;
        internal int _lowASCII; 
        internal int _highASCII;
        internal bool _rightToLeft;
        internal bool _caseInsensitive;
        internal CultureInfo _culture; 

        internal const int infinite = 0x7FFFFFFF; 
 
        /*
         * Constructs a Boyer-Moore state machine for searching for the string 
         * pattern. The string must not be zero-length.
         */
        internal RegexBoyerMoore(String pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) {
            /* 
             * Sorry,  you just can't use Boyer-Moore to find an empty pattern.
             * We're doing this for your own protection. (Really, for speed.) 
             */ 
            Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string.  This is bad for perf");
 
            int beforefirst;
            int last;
            int bump;
            int examine; 
            int scan;
            int match; 
            char ch; 

            // We do the ToLower character by character for consistency.  With surrogate chars, doing 
            // a ToLower on the entire string could actually change the surrogate pair.  This is more correct
            // linguistically, but since Regex doesn't support surrogates, it's more important to be
            // consistent.
            if (caseInsensitive) { 
                StringBuilder sb = new StringBuilder(pattern.Length);
                for (int i=0; i  This algorithm appears to be a simplified variant of the 
             *          standard Boyer-Moore good suffix calculation.  It could
             *          be one of D.M. Sunday's variations, but I have not found which one. 
             * 
             * <

 
*/
            _positive = new int[pattern.Length]; 
 
            examine = last;
            ch = pattern[examine]; 
            _positive[examine] = bump;
            examine -= bump;

            for (;;) { 
                // find an internal char (examine) that matches the tail
 
                for (;;) { 
                    if (examine == beforefirst)
                        goto OuterloopBreak; 
                    if (pattern[examine] == ch)
                        break;
                    examine -= bump;
                } 

                match = last; 
                scan = examine; 

                // find the length of the match 

                for (;;) {
                    if (scan == beforefirst || pattern[match] != pattern[scan]) {
                        // at the end of the match, note the difference in _positive 
                        // this is not the length of the match, but the distance from the internal match
                        // to the tail suffix. 
                        if (_positive[match] == 0) 
                            _positive[match] = match - scan;
 
                        // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));

                        break;
                    } 

                    scan -= bump; 
                    match -= bump; 
                }
 
                examine -= bump;
            }

            OuterloopBreak: 

            match = last - bump; 
 
            // scan for the chars for which there are no shifts that yield a different candidate
 
            /* 
             *  The inside of the if statement used to say
             *  "_positive[match] = last - beforefirst;"
             *  I've changed it to the below code.  This 
             *  is slightly less agressive in how much we skip, but at worst it
             *  should mean a little more work rather than skipping a potential 
             *  match. 
             * 
             */ 
            while (match != beforefirst) {
                if (_positive[match] == 0)
                    _positive[match] = bump;
 
                match -= bump;
            } 
 
            //System.Diagnostics.Debug.WriteLine("good suffix shift table:");
            //for (int i=0; i<_positive.Length; i++) 
            //    System.Diagnostics.Debug.WriteLine("\t_positive[" + i + "] = " + _positive[i]);


            /* 
             * PART II - the bad-character shift table
             * 
             * compute the negative requirement: 
             * if char "ch" is the reject character when testing position "i",
             * we can slide up by _negative[ch]; 
             * (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
             *
             * the lookup table is divided into ASCII and Unicode portions;
             * only those parts of the Unicode 16-bit code set that actually 
             * appear in the string are in the table. (Maximum size with
             * Unicode is 65K; ASCII only case is 512 bytes.) 
             */ 

            _negativeASCII = new int[128]; 

            for (int i = 0; i < 128; i++)
                _negativeASCII[i] = last - beforefirst;
 
            _lowASCII = 127;
            _highASCII = 0; 
 
            for (examine = last; examine != beforefirst; examine -= bump) {
                ch = pattern[examine]; 

                if (ch < 128) {
                    if (_lowASCII > ch)
                        _lowASCII = ch; 

                    if (_highASCII < ch) 
                        _highASCII = ch; 

                    if (_negativeASCII[ch] == last - beforefirst) 
                        _negativeASCII[ch] = last - examine;
                }
                else {
                    int i = ch >> 8; 
                    int j = ch & 0xFF;
 
                    if (_negativeUnicode == null) { 
                        _negativeUnicode = new int[256][];
                    } 

                    if (_negativeUnicode[i] == null) {
                        int[] newarray = new int[256];
 
                        for (int k = 0; k < 256; k++)
                            newarray[k] = last - beforefirst; 
 
                        if (i == 0) {
                            System.Array.Copy(_negativeASCII, newarray, 128); 
                            _negativeASCII = newarray;
                        }

                        _negativeUnicode[i] = newarray; 
                    }
 
                    if (_negativeUnicode[i][j] == last - beforefirst) 
                        _negativeUnicode[i][j] = last - examine;
                } 
            }
        }

        private bool MatchPattern(string text, int index) { 
                if (_caseInsensitive) {
                    if( text.Length - index < _pattern.Length) { 
                        return false; 
                    }
 
                    TextInfo textinfo = _culture.TextInfo;
                    for( int i = 0; i < _pattern.Length; i++) {
                        Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
                        if( textinfo.ToLower(text[index + i]) != _pattern[i]) { 
                            return false;
                        } 
                    } 
                    return true;
                } 
                else {
                    return(0 == String.CompareOrdinal(_pattern, 0, text, index, _pattern.Length));
                }
        } 

        /* 
         * When a regex is anchored, we can do a quick IsMatch test instead of a Scan 
         */
        internal bool IsMatch(String text, int index, int beglimit, int endlimit) { 

            if (!_rightToLeft) {
                if (index < beglimit || endlimit - index < _pattern.Length)
                    return false; 

                return MatchPattern(text, index); 
            } 
            else {
                if (index > endlimit || index - beglimit < _pattern.Length) 
                    return false;

                return MatchPattern(text, index - _pattern.Length);
            } 
        }
 
 
        /*
         * Scan uses the Boyer-Moore algorithm to find the first occurrance 
         * of the specified string within text, beginning at index, and
         * constrained within beglimit and endlimit.
         *
         * The direction and case-sensitivity of the match is determined 
         * by the arguments to the RegexBoyerMoore constructor.
         */ 
        internal int Scan(String text, int index, int beglimit, int endlimit) { 
            int test;
            int test2; 
            int match;
            int startmatch;
            int endmatch;
            int advance; 
            int defadv;
            int bump; 
            char chMatch; 
            char chTest;
            int[] unicodeLookup; 

            if (!_rightToLeft) {
                defadv = _pattern.Length;
                startmatch = _pattern.Length - 1; 
                endmatch = 0;
                test = index + defadv - 1; 
                bump = 1; 
            }
            else { 
                defadv = -_pattern.Length;
                startmatch = 0;
                endmatch = -defadv - 1;
                test = index + defadv; 
                bump = -1;
            } 
 
            chMatch = _pattern[startmatch];
 
            for (;;) {
                if (test >= endlimit || test < beglimit)
                    return -1;
 
                chTest = text[test];
 
                if (_caseInsensitive) 
                    chTest = Char.ToLower(chTest, _culture);
 
                if (chTest != chMatch) {
                    if (chTest < 128)
                        advance = _negativeASCII[chTest];
                    else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 
                        advance = unicodeLookup[chTest & 0xFF];
                    else 
                        advance = defadv; 

                    test += advance; 
                }
                else { // if (chTest == chMatch)
                    test2 = test;
                    match = startmatch; 

                    for (;;) { 
                        if (match == endmatch) 
                            return(_rightToLeft ? test2 + 1 : test2);
 
                        match -= bump;
                        test2 -= bump;

                        chTest = text[test2]; 

                        if (_caseInsensitive) 
                            chTest = Char.ToLower(chTest, _culture); 

                        if (chTest != _pattern[match]) { 
                            advance = _positive[match];
                            if ((chTest & 0xFF80) == 0)
                                test2 = (match - startmatch) + _negativeASCII[chTest];
                            else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 
                                test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF];
                            else { 
                                test += advance; 
                                break;
                            } 

                            if (_rightToLeft ? test2 < advance : test2 > advance)
                                advance = test2;
 
                            test += advance;
                            break; 
                        } 
                    }
                } 
            }
        }

        /* 
         * Used when dumping for debugging.
         */ 
        public override String ToString() { 
            return _pattern;
        } 

#if DBG
        public String Dump(String indent) {
            StringBuilder sb = new StringBuilder(); 

            sb.Append(indent + "BM Pattern: " + _pattern + "\n"); 
            sb.Append(indent + "Positive: "); 
            for (int i = 0; i < _positive.Length; i++) {
                sb.Append(_positive[i].ToString(CultureInfo.InvariantCulture) + " "); 
            }
            sb.Append("\n");

            if (_negativeASCII != null) { 
                sb.Append(indent + "Negative table\n");
                for (int i = 0; i < _negativeASCII.Length; i++) { 
                    if (_negativeASCII[i] != _pattern.Length) { 
                        sb.Append(indent + "  " + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + _negativeASCII[i].ToString(CultureInfo.InvariantCulture) + "\n");
                    } 
                }
            }

            return sb.ToString(); 
        }
#endif 
    } 
}

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
Link Menu

Network programming in C#, Network Programming in VB.NET, Network Programming in .NET
This book is available now!
Buy at Amazon US or
Buy at Amazon UK
RegexBoyerMoore.cs source code in C# .NET

Source code for the .NET framework in C#

Code:

Link Menu