Code:
/ 4.0 / 4.0 / DEVDIV_TFS / Dev10 / Releases / RTMRel / ndp / fx / src / Regex / System / Text / RegularExpressions / RegexBoyerMoore.cs / 1305376 / RegexBoyerMoore.cs
//------------------------------------------------------------------------------ //// Copyright (c) Microsoft Corporation. All rights reserved. // //----------------------------------------------------------------------------- // The RegexBoyerMoore object precomputes the Boyer-Moore // tables for fast string scanning. These tables allow // you to scan for the first occurance of a string within // a large body of text without examining every character. // The performance of the heuristic depends on the actual // string and the text being searched, but usually, the longer // the string that is being searched for, the fewer characters // need to be examined. namespace System.Text.RegularExpressions { using System.Collections; using System.Diagnostics; using System.Globalization; internal sealed class RegexBoyerMoore { internal int[] _positive; internal int[] _negativeASCII; internal int[][] _negativeUnicode; internal String _pattern; internal int _lowASCII; internal int _highASCII; internal bool _rightToLeft; internal bool _caseInsensitive; internal CultureInfo _culture; internal const int infinite = 0x7FFFFFFF; /* * Constructs a Boyer-Moore state machine for searching for the string * pattern. The string must not be zero-length. */ internal RegexBoyerMoore(String pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) { /* * Sorry, you just can't use Boyer-Moore to find an empty pattern. * We're doing this for your own protection. (Really, for speed.) */ Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf"); int beforefirst; int last; int bump; int examine; int scan; int match; char ch; // We do the ToLower character by character for consistency. With surrogate chars, doing // a ToLower on the entire string could actually change the surrogate pair. This is more correct // linguistically, but since Regex doesn't support surrogates, it's more important to be // consistent. if (caseInsensitive) { StringBuilder sb = new StringBuilder(pattern.Length); for (int i=0; iThis algorithm appears to be a simplified variant of the * standard Boyer-Moore good suffix calculation. It could * be one of D.M. Sunday's variations, but I have not found which one. * * < */ _positive = new int[pattern.Length]; examine = last; ch = pattern[examine]; _positive[examine] = bump; examine -= bump; for (;;) { // find an internal char (examine) that matches the tail for (;;) { if (examine == beforefirst) goto OuterloopBreak; if (pattern[examine] == ch) break; examine -= bump; } match = last; scan = examine; // find the length of the match for (;;) { if (scan == beforefirst || pattern[match] != pattern[scan]) { // at the end of the match, note the difference in _positive // this is not the length of the match, but the distance from the internal match // to the tail suffix. if (_positive[match] == 0) _positive[match] = match - scan; // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); break; } scan -= bump; match -= bump; } examine -= bump; } OuterloopBreak: match = last - bump; // scan for the chars for which there are no shifts that yield a different candidate /* * The inside of the if statement used to say * "_positive[match] = last - beforefirst;" * I've changed it to the below code. This * is slightly less agressive in how much we skip, but at worst it * should mean a little more work rather than skipping a potential * match. * */ while (match != beforefirst) { if (_positive[match] == 0) _positive[match] = bump; match -= bump; } //System.Diagnostics.Debug.WriteLine("good suffix shift table:"); //for (int i=0; i<_positive.Length; i++) // System.Diagnostics.Debug.WriteLine("\t_positive[" + i + "] = " + _positive[i]); /* * PART II - the bad-character shift table * * compute the negative requirement: * if char "ch" is the reject character when testing position "i", * we can slide up by _negative[ch]; * (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) * * the lookup table is divided into ASCII and Unicode portions; * only those parts of the Unicode 16-bit code set that actually * appear in the string are in the table. (Maximum size with * Unicode is 65K; ASCII only case is 512 bytes.) */ _negativeASCII = new int[128]; for (int i = 0; i < 128; i++) _negativeASCII[i] = last - beforefirst; _lowASCII = 127; _highASCII = 0; for (examine = last; examine != beforefirst; examine -= bump) { ch = pattern[examine]; if (ch < 128) { if (_lowASCII > ch) _lowASCII = ch; if (_highASCII < ch) _highASCII = ch; if (_negativeASCII[ch] == last - beforefirst) _negativeASCII[ch] = last - examine; } else { int i = ch >> 8; int j = ch & 0xFF; if (_negativeUnicode == null) { _negativeUnicode = new int[256][]; } if (_negativeUnicode[i] == null) { int[] newarray = new int[256]; for (int k = 0; k < 256; k++) newarray[k] = last - beforefirst; if (i == 0) { System.Array.Copy(_negativeASCII, newarray, 128); _negativeASCII = newarray; } _negativeUnicode[i] = newarray; } if (_negativeUnicode[i][j] == last - beforefirst) _negativeUnicode[i][j] = last - examine; } } } private bool MatchPattern(string text, int index) { if (_caseInsensitive) { if( text.Length - index < _pattern.Length) { return false; } TextInfo textinfo = _culture.TextInfo; for( int i = 0; i < _pattern.Length; i++) { Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); if( textinfo.ToLower(text[index + i]) != _pattern[i]) { return false; } } return true; } else { return(0 == String.CompareOrdinal(_pattern, 0, text, index, _pattern.Length)); } } /* * When a regex is anchored, we can do a quick IsMatch test instead of a Scan */ internal bool IsMatch(String text, int index, int beglimit, int endlimit) { if (!_rightToLeft) { if (index < beglimit || endlimit - index < _pattern.Length) return false; return MatchPattern(text, index); } else { if (index > endlimit || index - beglimit < _pattern.Length) return false; return MatchPattern(text, index - _pattern.Length); } } /* * Scan uses the Boyer-Moore algorithm to find the first occurrance * of the specified string within text, beginning at index, and * constrained within beglimit and endlimit. * * The direction and case-sensitivity of the match is determined * by the arguments to the RegexBoyerMoore constructor. */ internal int Scan(String text, int index, int beglimit, int endlimit) { int test; int test2; int match; int startmatch; int endmatch; int advance; int defadv; int bump; char chMatch; char chTest; int[] unicodeLookup; if (!_rightToLeft) { defadv = _pattern.Length; startmatch = _pattern.Length - 1; endmatch = 0; test = index + defadv - 1; bump = 1; } else { defadv = -_pattern.Length; startmatch = 0; endmatch = -defadv - 1; test = index + defadv; bump = -1; } chMatch = _pattern[startmatch]; for (;;) { if (test >= endlimit || test < beglimit) return -1; chTest = text[test]; if (_caseInsensitive) chTest = Char.ToLower(chTest, _culture); if (chTest != chMatch) { if (chTest < 128) advance = _negativeASCII[chTest]; else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) advance = unicodeLookup[chTest & 0xFF]; else advance = defadv; test += advance; } else { // if (chTest == chMatch) test2 = test; match = startmatch; for (;;) { if (match == endmatch) return(_rightToLeft ? test2 + 1 : test2); match -= bump; test2 -= bump; chTest = text[test2]; if (_caseInsensitive) chTest = Char.ToLower(chTest, _culture); if (chTest != _pattern[match]) { advance = _positive[match]; if ((chTest & 0xFF80) == 0) test2 = (match - startmatch) + _negativeASCII[chTest]; else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF]; else { test += advance; break; } if (_rightToLeft ? test2 < advance : test2 > advance) advance = test2; test += advance; break; } } } } } /* * Used when dumping for debugging. */ public override String ToString() { return _pattern; } #if DBG public String Dump(String indent) { StringBuilder sb = new StringBuilder(); sb.Append(indent + "BM Pattern: " + _pattern + "\n"); sb.Append(indent + "Positive: "); for (int i = 0; i < _positive.Length; i++) { sb.Append(_positive[i].ToString(CultureInfo.InvariantCulture) + " "); } sb.Append("\n"); if (_negativeASCII != null) { sb.Append(indent + "Negative table\n"); for (int i = 0; i < _negativeASCII.Length; i++) { if (_negativeASCII[i] != _pattern.Length) { sb.Append(indent + " " + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + _negativeASCII[i].ToString(CultureInfo.InvariantCulture) + "\n"); } } } return sb.ToString(); } #endif } } // File provided for Reference Use Only by Microsoft Corporation (c) 2007. //------------------------------------------------------------------------------ //// Copyright (c) Microsoft Corporation. All rights reserved. // //----------------------------------------------------------------------------- // The RegexBoyerMoore object precomputes the Boyer-Moore // tables for fast string scanning. These tables allow // you to scan for the first occurance of a string within // a large body of text without examining every character. // The performance of the heuristic depends on the actual // string and the text being searched, but usually, the longer // the string that is being searched for, the fewer characters // need to be examined. namespace System.Text.RegularExpressions { using System.Collections; using System.Diagnostics; using System.Globalization; internal sealed class RegexBoyerMoore { internal int[] _positive; internal int[] _negativeASCII; internal int[][] _negativeUnicode; internal String _pattern; internal int _lowASCII; internal int _highASCII; internal bool _rightToLeft; internal bool _caseInsensitive; internal CultureInfo _culture; internal const int infinite = 0x7FFFFFFF; /* * Constructs a Boyer-Moore state machine for searching for the string * pattern. The string must not be zero-length. */ internal RegexBoyerMoore(String pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) { /* * Sorry, you just can't use Boyer-Moore to find an empty pattern. * We're doing this for your own protection. (Really, for speed.) */ Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf"); int beforefirst; int last; int bump; int examine; int scan; int match; char ch; // We do the ToLower character by character for consistency. With surrogate chars, doing // a ToLower on the entire string could actually change the surrogate pair. This is more correct // linguistically, but since Regex doesn't support surrogates, it's more important to be // consistent. if (caseInsensitive) { StringBuilder sb = new StringBuilder(pattern.Length); for (int i=0; iThis algorithm appears to be a simplified variant of the * standard Boyer-Moore good suffix calculation. It could * be one of D.M. Sunday's variations, but I have not found which one. * * < */ _positive = new int[pattern.Length]; examine = last; ch = pattern[examine]; _positive[examine] = bump; examine -= bump; for (;;) { // find an internal char (examine) that matches the tail for (;;) { if (examine == beforefirst) goto OuterloopBreak; if (pattern[examine] == ch) break; examine -= bump; } match = last; scan = examine; // find the length of the match for (;;) { if (scan == beforefirst || pattern[match] != pattern[scan]) { // at the end of the match, note the difference in _positive // this is not the length of the match, but the distance from the internal match // to the tail suffix. if (_positive[match] == 0) _positive[match] = match - scan; // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); break; } scan -= bump; match -= bump; } examine -= bump; } OuterloopBreak: match = last - bump; // scan for the chars for which there are no shifts that yield a different candidate /* * The inside of the if statement used to say * "_positive[match] = last - beforefirst;" * I've changed it to the below code. This * is slightly less agressive in how much we skip, but at worst it * should mean a little more work rather than skipping a potential * match. * */ while (match != beforefirst) { if (_positive[match] == 0) _positive[match] = bump; match -= bump; } //System.Diagnostics.Debug.WriteLine("good suffix shift table:"); //for (int i=0; i<_positive.Length; i++) // System.Diagnostics.Debug.WriteLine("\t_positive[" + i + "] = " + _positive[i]); /* * PART II - the bad-character shift table * * compute the negative requirement: * if char "ch" is the reject character when testing position "i", * we can slide up by _negative[ch]; * (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) * * the lookup table is divided into ASCII and Unicode portions; * only those parts of the Unicode 16-bit code set that actually * appear in the string are in the table. (Maximum size with * Unicode is 65K; ASCII only case is 512 bytes.) */ _negativeASCII = new int[128]; for (int i = 0; i < 128; i++) _negativeASCII[i] = last - beforefirst; _lowASCII = 127; _highASCII = 0; for (examine = last; examine != beforefirst; examine -= bump) { ch = pattern[examine]; if (ch < 128) { if (_lowASCII > ch) _lowASCII = ch; if (_highASCII < ch) _highASCII = ch; if (_negativeASCII[ch] == last - beforefirst) _negativeASCII[ch] = last - examine; } else { int i = ch >> 8; int j = ch & 0xFF; if (_negativeUnicode == null) { _negativeUnicode = new int[256][]; } if (_negativeUnicode[i] == null) { int[] newarray = new int[256]; for (int k = 0; k < 256; k++) newarray[k] = last - beforefirst; if (i == 0) { System.Array.Copy(_negativeASCII, newarray, 128); _negativeASCII = newarray; } _negativeUnicode[i] = newarray; } if (_negativeUnicode[i][j] == last - beforefirst) _negativeUnicode[i][j] = last - examine; } } } private bool MatchPattern(string text, int index) { if (_caseInsensitive) { if( text.Length - index < _pattern.Length) { return false; } TextInfo textinfo = _culture.TextInfo; for( int i = 0; i < _pattern.Length; i++) { Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); if( textinfo.ToLower(text[index + i]) != _pattern[i]) { return false; } } return true; } else { return(0 == String.CompareOrdinal(_pattern, 0, text, index, _pattern.Length)); } } /* * When a regex is anchored, we can do a quick IsMatch test instead of a Scan */ internal bool IsMatch(String text, int index, int beglimit, int endlimit) { if (!_rightToLeft) { if (index < beglimit || endlimit - index < _pattern.Length) return false; return MatchPattern(text, index); } else { if (index > endlimit || index - beglimit < _pattern.Length) return false; return MatchPattern(text, index - _pattern.Length); } } /* * Scan uses the Boyer-Moore algorithm to find the first occurrance * of the specified string within text, beginning at index, and * constrained within beglimit and endlimit. * * The direction and case-sensitivity of the match is determined * by the arguments to the RegexBoyerMoore constructor. */ internal int Scan(String text, int index, int beglimit, int endlimit) { int test; int test2; int match; int startmatch; int endmatch; int advance; int defadv; int bump; char chMatch; char chTest; int[] unicodeLookup; if (!_rightToLeft) { defadv = _pattern.Length; startmatch = _pattern.Length - 1; endmatch = 0; test = index + defadv - 1; bump = 1; } else { defadv = -_pattern.Length; startmatch = 0; endmatch = -defadv - 1; test = index + defadv; bump = -1; } chMatch = _pattern[startmatch]; for (;;) { if (test >= endlimit || test < beglimit) return -1; chTest = text[test]; if (_caseInsensitive) chTest = Char.ToLower(chTest, _culture); if (chTest != chMatch) { if (chTest < 128) advance = _negativeASCII[chTest]; else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) advance = unicodeLookup[chTest & 0xFF]; else advance = defadv; test += advance; } else { // if (chTest == chMatch) test2 = test; match = startmatch; for (;;) { if (match == endmatch) return(_rightToLeft ? test2 + 1 : test2); match -= bump; test2 -= bump; chTest = text[test2]; if (_caseInsensitive) chTest = Char.ToLower(chTest, _culture); if (chTest != _pattern[match]) { advance = _positive[match]; if ((chTest & 0xFF80) == 0) test2 = (match - startmatch) + _negativeASCII[chTest]; else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF]; else { test += advance; break; } if (_rightToLeft ? test2 < advance : test2 > advance) advance = test2; test += advance; break; } } } } } /* * Used when dumping for debugging. */ public override String ToString() { return _pattern; } #if DBG public String Dump(String indent) { StringBuilder sb = new StringBuilder(); sb.Append(indent + "BM Pattern: " + _pattern + "\n"); sb.Append(indent + "Positive: "); for (int i = 0; i < _positive.Length; i++) { sb.Append(_positive[i].ToString(CultureInfo.InvariantCulture) + " "); } sb.Append("\n"); if (_negativeASCII != null) { sb.Append(indent + "Negative table\n"); for (int i = 0; i < _negativeASCII.Length; i++) { if (_negativeASCII[i] != _pattern.Length) { sb.Append(indent + " " + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + _negativeASCII[i].ToString(CultureInfo.InvariantCulture) + "\n"); } } } return sb.ToString(); } #endif } } // File provided for Reference Use Only by Microsoft Corporation (c) 2007.
Link Menu
This book is available now!
Buy at Amazon US or
Buy at Amazon UK
- XmlNavigatorFilter.cs
- XmlSchemaSimpleContentExtension.cs
- CustomPopupPlacement.cs
- InvalidComObjectException.cs
- DataGridViewImageCell.cs
- WindowsFormsHost.cs
- HashHelper.cs
- UInt32.cs
- PositiveTimeSpanValidator.cs
- TypefaceMap.cs
- SecurityDescriptor.cs
- TokenizerHelper.cs
- OracleInfoMessageEventArgs.cs
- ComponentResourceManager.cs
- ReadOnlyCollection.cs
- CompiledELinqQueryState.cs
- Normalization.cs
- CleanUpVirtualizedItemEventArgs.cs
- XPathAncestorQuery.cs
- CurrentChangingEventManager.cs
- ServiceModelEnumValidator.cs
- COM2Properties.cs
- UTF32Encoding.cs
- AttachedPropertyBrowsableForTypeAttribute.cs
- oledbmetadatacollectionnames.cs
- SignatureDescription.cs
- ExceptionHelpers.cs
- DataGridViewSelectedColumnCollection.cs
- ApplicationSettingsBase.cs
- IdentitySection.cs
- DeflateStream.cs
- Triangle.cs
- _TransmitFileOverlappedAsyncResult.cs
- WebPartVerbsEventArgs.cs
- HwndSource.cs
- InputEventArgs.cs
- HttpStaticObjectsCollectionBase.cs
- LazyTextWriterCreator.cs
- Transform.cs
- ObjectStorage.cs
- DragAssistanceManager.cs
- RelationshipSet.cs
- PropertyDescriptorCollection.cs
- IndexerReference.cs
- MemoryRecordBuffer.cs
- SelectionHighlightInfo.cs
- TextView.cs
- CodeParameterDeclarationExpression.cs
- ReadOnlyPropertyMetadata.cs
- ToolStripTextBox.cs
- TargetConverter.cs
- ToolStripGripRenderEventArgs.cs
- MetadataPropertyvalue.cs
- TypeTypeConverter.cs
- WindowsPrincipal.cs
- ApplicationCommands.cs
- ImageAttributes.cs
- Simplifier.cs
- XPathAxisIterator.cs
- StatusBarPanelClickEvent.cs
- DEREncoding.cs
- FrameDimension.cs
- FontStretches.cs
- COM2Enum.cs
- UIElementPropertyUndoUnit.cs
- RegexRunnerFactory.cs
- OleDbErrorCollection.cs
- RemoveFromCollection.cs
- DataListItem.cs
- Brush.cs
- DesignerSerializationVisibilityAttribute.cs
- MetadataItemEmitter.cs
- MediaEntryAttribute.cs
- BitmapEncoder.cs
- BamlLocalizableResourceKey.cs
- ValidationHelper.cs
- XmlnsCache.cs
- WindowsClaimSet.cs
- AddInControllerImpl.cs
- HitTestFilterBehavior.cs
- XmlCustomFormatter.cs
- FormsAuthenticationUserCollection.cs
- WhitespaceRuleReader.cs
- InheritedPropertyChangedEventArgs.cs
- FixedElement.cs
- TemplateControl.cs
- CellIdBoolean.cs
- ComponentChangingEvent.cs
- BoolExpression.cs
- Frame.cs
- ControlAdapter.cs
- SafeLibraryHandle.cs
- Int16AnimationBase.cs
- StreamWriter.cs
- ValueHandle.cs
- Cursors.cs
- RijndaelManaged.cs
- TrackingMemoryStreamFactory.cs
- StylusButtonEventArgs.cs
- ModulesEntry.cs