13using System.Collections;
14using System.Collections.Generic;
15using System.Globalization;
16using System.Text.RegularExpressions;
31 DollarMatchesEndOfStringOnly = 8,
39 #region PerlRegExpConverter
52 private static Regex quantifiers
56 if (_quantifiers ==
null)
57 _quantifiers =
new Regex(
@"\G(?:\?|\*|\+|\{[0-9]+,[0-9]*\})");
61 private static Regex _quantifiers;
66 private static Regex posixCharClasses
70 if (_posixCharClasses ==
null)
71 _posixCharClasses =
new Regex(
"^\\[:(^)?(alpha|alnum|ascii|cntrl|digit|graph|lower|print|punct|space|upper|word|xdigit):]", RegexOptions.Singleline);
72 return _posixCharClasses;
75 private static Regex _posixCharClasses =
null;
80 private string perlRegEx;
91 private string dotNetMatchExpression;
97 private string dotNetReplaceExpression;
104 private RegexOptions dotNetOptions;
109 public Encoding Encoding {
get {
return encoding; } }
110 private readonly Encoding encoding;
122 if (encoding ==
null)
123 throw new ArgumentNullException(
"encoding");
125 this.encoding = encoding;
127 ConvertPattern(pattern);
129 if (replacement !=
null)
130 dotNetReplaceExpression = ConvertReplacement(replacement);
133 private void ConvertPattern(
string pattern)
135 string string_pattern =
null;
137 string_pattern = pattern;
139 LoadPerlRegex(string_pattern);
141 dotNetMatchExpression = ConvertRegex(perlRegEx, perlOptions, encoding);
146 regex =
new Regex(dotNetMatchExpression, dotNetOptions);
148 catch (ArgumentException e)
150 throw new ArgumentException(ExtractExceptionalMessage(e.Message));
158 private string ExtractExceptionalMessage(
string message)
162 message = message.Replace(dotNetMatchExpression,
"<pattern>");
164 int i = message.IndexOf(
"\r\n");
166 message = message.Substring(0, i);
168 i = message.IndexOf(
"-");
170 message = message.Substring(i + 2);
175 internal string ConvertString(
string str,
int start,
int length)
178 return Encoding.UTF8.GetString(encoding.GetBytes(str.Substring(start, length)));
180 return str.Substring(start, length);
183 internal string ConvertBytes(
byte[] bytes,
int start,
int length)
186 return Encoding.UTF8.GetString(bytes, start, length);
188 return encoding.GetString(bytes, start, length);
191 private void LoadPerlRegex(
byte[] pattern)
193 if (pattern ==
null) pattern =
new byte[0];
194 int regex_start, regex_end;
196 StringBuilder upattern =
new StringBuilder();
197 upattern.Append(pattern);
199 FindRegexDelimiters(upattern, out regex_start, out regex_end);
200 ParseRegexOptions(upattern, regex_end + 2, out dotNetOptions, out perlOptions);
202 perlRegEx = ConvertBytes(pattern, regex_start, regex_end - regex_start + 1);
205 private void LoadPerlRegex(
string pattern)
207 if (pattern ==
null) pattern =
"";
208 int regex_start, regex_end;
210 StringBuilder upattern =
new StringBuilder();
211 upattern.Append(pattern);
213 FindRegexDelimiters(upattern, out regex_start, out regex_end);
214 ParseRegexOptions(upattern, regex_end + 2, out dotNetOptions, out perlOptions);
216 perlRegEx = ConvertString(pattern, regex_start, regex_end - regex_start + 1);
219 private void FindRegexDelimiters(StringBuilder pattern, out
int start, out
int end)
222 while (i < pattern.Length && Char.IsWhiteSpace(pattern[i])) i++;
224 if (i == pattern.Length)
225 throw new ArgumentException(
"RegExp empty");
227 char start_delimiter = pattern[i++];
228 if (Char.IsLetterOrDigit(start_delimiter) || start_delimiter ==
'\\')
229 throw new ArgumentException(
"Something bad with delimiter");
233 if (start_delimiter ==
'[') end_delimiter =
']';
234 else if (start_delimiter ==
'(') end_delimiter =
')';
235 else if (start_delimiter ==
'{') end_delimiter =
'}';
236 else if (start_delimiter ==
'<') end_delimiter =
'>';
237 else end_delimiter = start_delimiter;
240 while (i < pattern.Length)
242 if (pattern[i] ==
'\\' && i + 1 < pattern.Length)
247 else if (pattern[i] == end_delimiter)
250 if (depth == 0)
break;
252 else if (pattern[i] == start_delimiter)
259 if (i == pattern.Length)
260 throw new ArgumentException(
"No end delimiter");
265 private static void ParseRegexOptions(StringBuilder pattern,
int start,
268 dotNetOptions = RegexOptions.None;
271 for (
int i = start; i < pattern.Length; i++)
273 char option = pattern[i];
278 dotNetOptions |= RegexOptions.IgnoreCase;
282 dotNetOptions |= RegexOptions.Multiline;
286 dotNetOptions |= RegexOptions.Singleline;
290 dotNetOptions |= RegexOptions.IgnorePatternWhitespace;
329 (dotNetOptions & RegexOptions.Multiline) != 0 &&
333 throw new Exception(
"Modifier inconsistent");
337 private static int AlphaNumericToDigit(
char x)
383 private static bool ParseEscapeCode(Encoding encoding,
string str, ref
int pos, ref
char ch, ref
bool escaped)
387 if (pos + 3 >= str.Length)
return false;
391 if (str[pos + 1] ==
'x')
393 if (str[pos + 2] ==
'{')
397 while (i < str.Length && str[i] !=
'}' && number < Char.MaxValue)
399 int digit = AlphaNumericToDigit(str[i]);
400 if (digit > 16)
return false;
401 number = (number << 4) + digit;
404 if (number > Char.MaxValue || i >= str.Length)
return false;
407 escaped = IsCharRegexSpecial(ch);
412 for (
int i = pos + 2; i < pos + 4; i++)
415 int digit = AlphaNumericToDigit(str[i]);
416 if (digit > 16)
return false;
417 number = (number << 4) + digit;
420 char[] chars = encoding.GetChars(
new byte[] { (byte)number });
421 if (chars.Length == 1)
425 escaped = IsCharRegexSpecial(ch);
429 else if (str[pos + 1] >=
'0' && str[pos + 1] <=
'7')
432 for (
int i = pos + 1; i < pos + 4; i++)
435 int digit = AlphaNumericToDigit(str[i]);
436 if (digit > 8)
return false;
437 number = (number << 3) + digit;
440 ch = encoding.GetChars(
new byte[] { (byte)number })[0];
441 escaped = IsCharRegexSpecial(ch);
444 else if (str[pos + 1] ==
'p' || str[pos + 1] ==
'P')
446 bool complement = str[pos + 1] ==
'P';
449 if (str[pos + 2] ==
'{')
451 if (!complement && str[pos + 3] ==
'^')
468 int cat_length = str.Length;
469 int cat_end = cat_start + cat_length - 1;
475 if (str[pos + 2] ==
'{' && (cat_end + 1 >= str.Length || str[cat_end + 1] !=
'}'))
483 else if (str[pos + 1] ==
'X')
494 static char[] encodeChars =
new char[] {
'.',
'$',
'(',
')',
'*',
'+',
'?',
'[',
']',
'{',
'}',
'\\',
'^',
'|' };
499 private static bool IsCharRegexSpecial(
char ch)
501 return Array.IndexOf(encodeChars, ch) != -1;
512 private static string ConvertRegex(
string perlExpr,
PerlRegexOptions opt, Encoding encoding)
518 StringBuilder result =
new StringBuilder(perlExpr.Length + 1);
522 if ((opt &
PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] !=
'^'))
527 bool last_quantifier =
false;
531 bool leaving_range =
false;
533 bool escaped =
false;
538 while (i < perlExpr.Length)
540 char ch = perlExpr[i];
543 if (ch ==
'\\' && !ParseEscapeCode(encoding, perlExpr, ref i, ref ch, ref escaped))
550 if (ch ==
'_') escaped =
false;
else escaped =
true;
560 last_quantifier =
false;
568 case 0: group_state = (ch ==
'(') ? 1 : 0;
break;
569 case 1: group_state = (ch ==
'?') ? 2 : 0;
break;
570 case 2:
if (ch ==
'P') { i++;
continue; }
break;
576 Match m = quantifiers.Match(perlExpr, i);
579 if (m.Success && (m.Value !=
"?" || i == 0 || perlExpr[i - 1] !=
'('))
583 throw new ArgumentException(
"regexp_duplicate_quantifier");
586 result.Append(perlExpr, i, m.Length);
589 if (i < perlExpr.Length && perlExpr[i] ==
'?')
594 else if (i < perlExpr.Length && perlExpr[i] ==
'+')
604 if (result.Length != 0 && result[result.Length - 1] ==
'?')
613 last_quantifier =
true;
618 last_quantifier =
false;
620 if (ch ==
'$' && (opt &
PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
623 result.Append(
@"\z");
643 if (ch ==
'^' || ch ==
']' || ch ==
'-')
661 leaving_range =
false;
665 if (ch ==
'-' && !leaving_range)
670 leaving_range =
false;
673 Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
676 string chars = CountCharacterClass(match.Groups[2].Value);
678 throw new ArgumentException( String.Format(
"Unknown character class '{0}'", match.Groups[2].Value));
680 if (match.Groups[1].Value.Length > 0)
681 throw new ArgumentException(
"POSIX character classes negation not supported.");
683 result.Append(chars);
684 i += match.Length - 1;
691 result.Append(
"\\x2d");
697 if (!escaped && ch ==
']')
706 if (!CountRange(result[result.Length - 1], ch, out range, out error, encoding))
708 if ((error != 1) || (!CountUnicodeRange(result[result.Length - 1], ch, out range)))
711 throw new ArgumentException(
"range_first_character_greater");
714 result.Append(EscapeBracketExpressionSpecialChars(range));
716 leaving_range =
true;
723 return result.ToString();
731 internal static string EscapeBracketExpressionSpecialChars(
string chars)
733 StringBuilder sb =
new StringBuilder();
735 for (
int i = 0; i < chars.Length; i++)
752 return sb.ToString();
765 internal static bool CountRange(
char firstCharacter,
char secondCharacter, out
string characters, out
int result, Encoding encoding)
771 char[] chars =
new char[2];
772 chars[0] = firstCharacter;
773 chars[1] = secondCharacter;
775 byte[] two_bytes =
new byte[encoding.GetMaxByteCount(2)];
778 if (encoding.GetBytes(chars, 0, 2, two_bytes, 0) != 2)
784 if (two_bytes[0] > two_bytes[1])
791 byte[] bytes =
new byte[two_bytes[1] - two_bytes[0] + 1];
794 for (
int ch = two_bytes[0]; ch <= two_bytes[1]; i++, ch++)
800 characters = encoding.GetString(bytes, 0, i);
810 internal static string CountCharacterClass(
string chClassName)
817 ret =
@"\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}";
826 ret =
@"\p{Ll}\p{Lu}\p{Lt}\p{Lo}";
829 ret =
@"\p{L}\p{M}\p{N}\p{P}\p{S}";
847 ret =
@"\p{L}\p{M}\p{N}\p{P}\p{S}\p{Zs}";
850 ret =
@"abcdefABCDEF\d";
853 ret =
@"\u0000-\u007F";
856 ret =
@"_\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}";
871 private static bool CountUnicodeRange(
char f,
char t, out
string range)
874 if (f > t)
return false;
875 StringBuilder sb =
new StringBuilder(t - f);
876 for (
char c = f; c <= t; c++) sb.Append(c);
877 range = sb.ToString();
886 private static void ModifyRegExpAnchored(ref
string expr)
890 if (expr.Length == 0 || expr[0] !=
'^')
894 internal static bool IsDigitGroupReference(
string replacement,
int i)
896 return (replacement[i] ==
'$' || replacement[i] ==
'\\') &&
897 (i + 1 < replacement.Length && Char.IsDigit(replacement, i + 1));
900 internal static bool IsParenthesizedGroupReference(
string replacement,
int i)
902 return replacement[i] ==
'$' && i + 3 < replacement.Length && replacement[i + 1] ==
'{' &&
903 Char.IsDigit(replacement, i + 2) &&
905 replacement[i + 3] ==
'}' ||
906 i + 4 < replacement.Length && replacement[i + 4] ==
'}' && Char.IsDigit(replacement, i + 3)
915 private string ConvertReplacement(
string replacement)
917 StringBuilder result =
new StringBuilder();
918 int[] group_numbers = regex.GetGroupNumbers();
919 int max_number = (group_numbers.Length > 0) ? group_numbers[group_numbers.Length - 1] : 0;
922 while (i < replacement.Length)
924 if (IsDigitGroupReference(replacement, i) ||
925 IsParenthesizedGroupReference(replacement, i))
930 if (replacement[i] ==
'{') { i++; add = 1; }
932 int number = replacement[i++] -
'0';
933 if (i < replacement.Length && Char.IsDigit(replacement, i))
935 number = number * 10 + replacement[i];
940 if (number <= max_number)
944 result.Append(number.ToString());
950 else if (replacement[i] ==
'$')
956 else if (replacement[i] ==
'\\' && i + 1 < replacement.Length)
958 if (replacement[i + 1] ==
'\\')
967 result.Append(replacement, i, 2);
974 result.Append(replacement, i++, 1);
978 return result.ToString();
Implements Perl extended regular expressions as they are implemented in PHP.
string DotNetReplaceExpression
Returns .NET replacement string.
Regex Regex
Returns Regex class that can be used for matching.
PerlRegExpConverter(string pattern, string replacement, Encoding encoding)
Creates a new PerlRegExpConverter and converts the Perl regular expression to .NET.
RegexOptions DotNetOptions
RegexOptions which should be set while matching the expression. May be null if regex is already set.
PerlRegexOptions
Perl regular expression specific options that are not captured by .NET RegexOptions or by transformat...