lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [22/22] lucenenet git commit: Ported Analysis.Miscellaneous.PatternAnalyzer + tests
Date Thu, 01 Sep 2016 14:36:42 GMT
Ported Analysis.Miscellaneous.PatternAnalyzer + tests


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/7f877fdf
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/7f877fdf
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/7f877fdf

Branch: refs/heads/analysis-work
Commit: 7f877fdfc2ba25a7c1b0386795b4f83b46f50767
Parents: 8a05b16
Author: Shad Storhaug <shad@shadstorhaug.com>
Authored: Sun Aug 28 00:11:18 2016 +0700
Committer: Shad Storhaug <shad@shadstorhaug.com>
Committed: Sun Aug 28 11:24:43 2016 +0700

----------------------------------------------------------------------
 .../Analysis/Miscellaneous/PatternAnalyzer.cs   | 159 +++++++---
 .../Lucene.Net.Analysis.Common.csproj           |   1 +
 .../Miscellaneous/PatternAnalyzerTest.cs        | 317 +++++++------------
 .../Lucene.Net.Tests.Analysis.Common.csproj     |   1 +
 4 files changed, 236 insertions(+), 242 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7f877fdf/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs
index 6c28927..933e714 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzer.cs
@@ -1,13 +1,14 @@
-using System;
-using System.IO;
-using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Tokenattributes;
 using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
 using Lucene.Net.Util;
+using System;
+using System.IO;
+using System.Text.RegularExpressions;
 
 namespace Lucene.Net.Analysis.Miscellaneous
 {
-
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
@@ -24,6 +25,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */
+
     /// <summary>
     /// Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
     /// <seealso cref="TextReader"/>, that can flexibly separate text into terms via a regular expression <seealso cref="Pattern"/>
@@ -61,19 +63,65 @@ namespace Lucene.Net.Analysis.Miscellaneous
 
         /// <summary>
         /// <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) </summary>
-        public static readonly Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
+        public static readonly Regex NON_WORD_PATTERN = new Regex("\\W+", RegexOptions.Compiled);
 
         /// <summary>
         /// <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) </summary>
-        public static readonly Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
-
-        private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS = CharArraySet.UnmodifiableSet(new
CharArraySet(LuceneVersion.LUCENE_CURRENT, Arrays.asList("a", "about", "above", "across",
"adj", "after", "afterwards", "again", "against", "albeit", "all", "almost", "alone", "along",
"already", "also", "although", "always", "among", "amongst", "an", "and", "another", "any",
"anyhow", "anyone", "anything", "anywhere", "are", "around", "as", "at", "be", "became", "because",
"become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below",
"beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "co", "could",
"down", "during", "each", "eg", "either", "else", "elsewhere", "enough", "etc", "even", "ever",
"every", "everyone", "everything", "everywhere", "except", "few", "first", "for", "former",
"formerly", "from", "further", "had", "has", "have", "he", "hence", "her", "here", "hereafter",
"hereby", "herein", "hereupon", "hers", 
 "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "inc", "indeed",
"into", "is", "it", "its", "itself", "last", "latter", "latterly", "least", "less", "ltd",
"many", "may", "me", "meanwhile", "might", "more", "moreover", "most", "mostly", "much", "must",
"my", "myself", "namely", "neither", "never", "nevertheless", "next", "no", "nobody", "none",
"noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once one",
"only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over",
"own", "per", "perhaps", "rather", "s", "same", "seem", "seemed", "seeming", "seems", "several",
"she", "should", "since", "so", "some", "somehow", "someone", "something", "sometime", "sometimes",
"somewhere", "still", "such", "t", "than", "that", "the", "their", "them", "themselves", "then",
"thence", "there", "thereafter", "thereby", "therefor", "therein", "thereupon", "these", "they",
"this", "those", "though", "through", "th
 roughout", "thru", "thus", "to", "together", "too", "toward", "towards", "under", "until",
"up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "whatsoever",
"when", "whence", "whenever", "whensoever", "where", "whereafter", "whereas", "whereat", "whereby",
"wherefrom", "wherein", "whereinto", "whereof", "whereon", "whereto", "whereunto", "whereupon",
"wherever", "wherewith", "whether", "which", "whichever", "whichsoever", "while", "whilst",
"whither", "who", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever",
"why", "will", "with", "within", "without", "would", "xsubj", "xcal", "xauthor", "xother ",
"xnote", "yet", "you", "your", "yours", "yourself", "yourselves"), true));
+        public static readonly Regex WHITESPACE_PATTERN = new Regex("\\s+", RegexOptions.Compiled);
+
+        private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS = 
+            CharArraySet.UnmodifiableSet(new CharArraySet(LuceneVersion.LUCENE_CURRENT, 
+                Arrays.AsList(
+                    "a", "about", "above", "across", "adj", "after", "afterwards",
+                    "again", "against", "albeit", "all", "almost", "alone", "along",
+                    "already", "also", "although", "always", "among", "amongst", "an",
+                    "and", "another", "any", "anyhow", "anyone", "anything",
+                    "anywhere", "are", "around", "as", "at", "be", "became", "because",
+                    "become", "becomes", "becoming", "been", "before", "beforehand",
+                    "behind", "being", "below", "beside", "besides", "between",
+                    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
+                    "down", "during", "each", "eg", "either", "else", "elsewhere",
+                    "enough", "etc", "even", "ever", "every", "everyone", "everything",
+                    "everywhere", "except", "few", "first", "for", "former",
+                    "formerly", "from", "further", "had", "has", "have", "he", "hence",
+                    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+                    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
+                    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
+                    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
+                    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
+                    "must", "my", "myself", "namely", "neither", "never",
+                    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
+                    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+                    "once one", "only", "onto", "or", "other", "others", "otherwise",
+                    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
+                    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
+                    "several", "she", "should", "since", "so", "some", "somehow",
+                    "someone", "something", "sometime", "sometimes", "somewhere",
+                    "still", "such", "t", "than", "that", "the", "their", "them",
+                    "themselves", "then", "thence", "there", "thereafter", "thereby",
+                    "therefor", "therein", "thereupon", "these", "they", "this",
+                    "those", "though", "through", "throughout", "thru", "thus", "to",
+                    "together", "too", "toward", "towards", "under", "until", "up",
+                    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
+                    "whatever", "whatsoever", "when", "whence", "whenever",
+                    "whensoever", "where", "whereafter", "whereas", "whereat",
+                    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
+                    "whereon", "whereto", "whereunto", "whereupon", "wherever",
+                    "wherewith", "whether", "which", "whichever", "whichsoever",
+                    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
+                    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
+                    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
+                    "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
+                    "yourselves"
+                
+                    ), true));
 
         /// <summary>
         /// A lower-casing word analyzer with English stop words (can be shared
         /// freely across threads without harm); global per class loader.
         /// </summary>
-        public static readonly PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(LuceneVersion.LUCENE_CURRENT,
NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        public static readonly PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
+            LuceneVersion.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 
         /// <summary>
         /// A lower-casing word analyzer with <b>extended </b> English stop words
@@ -82,9 +130,10 @@ namespace Lucene.Net.Analysis.Miscellaneous
         /// http://thomas.loc.gov/home/stopwords.html, see
         /// http://thomas.loc.gov/home/all.about.inquery.html
         /// </summary>
-        public static readonly PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(LuceneVersion.LUCENE_CURRENT,
NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
+        public static readonly PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
+            LuceneVersion.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
 
-        private readonly Pattern pattern;
+        private readonly Regex pattern;
         private readonly bool toLowerCase;
         private readonly CharArraySet stopWords;
 
@@ -108,23 +157,23 @@ namespace Lucene.Net.Analysis.Miscellaneous
         ///            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
         ///            or <a href="http://www.unine.ch/info/clef/">other stop words
         ///            lists </a>. </param>
-        public PatternAnalyzer(LuceneVersion matchVersion, Pattern pattern, bool toLowerCase,
CharArraySet stopWords)
+        public PatternAnalyzer(LuceneVersion matchVersion, Regex pattern, bool toLowerCase,
CharArraySet stopWords)
         {
             if (pattern == null)
             {
                 throw new System.ArgumentException("pattern must not be null");
             }
 
-            if (eqPattern(NON_WORD_PATTERN, pattern))
+            if (EqPattern(NON_WORD_PATTERN, pattern))
             {
                 pattern = NON_WORD_PATTERN;
             }
-            else if (eqPattern(WHITESPACE_PATTERN, pattern))
+            else if (EqPattern(WHITESPACE_PATTERN, pattern))
             {
                 pattern = WHITESPACE_PATTERN;
             }
 
-            if (stopWords != null && stopWords.Size == 0)
+            if (stopWords != null && stopWords.Count == 0)
             {
                 stopWords = null;
             }
@@ -146,7 +195,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
         /// <param name="text">
         ///            the string to tokenize </param>
         /// <returns> a new token stream </returns>
-        public TokenStreamComponents createComponents(string fieldName, TextReader reader,
string text)
+        public TokenStreamComponents CreateComponents(string fieldName, TextReader reader,
string text)
         {
             // Ideally the Analyzer superclass should have a method with the same signature,

             // with a default impl that simply delegates to the StringReader flavour. 
@@ -165,7 +214,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
             }
 
             Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
-            TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer,
stopWords) : tokenizer;
+            TokenStream result = (stopWords != null) ? (TokenStream)new StopFilter(matchVersion,
tokenizer, stopWords) : tokenizer;
             return new TokenStreamComponents(tokenizer, result);
         }
 
@@ -181,7 +230,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
         /// <returns> a new token stream </returns>
         public override TokenStreamComponents CreateComponents(string fieldName, TextReader
reader)
         {
-            return createComponents(fieldName, reader, null);
+            return CreateComponents(fieldName, reader, null);
         }
 
         /// <summary>
@@ -208,7 +257,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
             var p2 = other as PatternAnalyzer;
             if (p2 != null)
             {
-                return toLowerCase == p2.toLowerCase && eqPattern(pattern, p2.pattern)
&& eq(stopWords, p2.stopWords);
+                return toLowerCase == p2.toLowerCase && EqPattern(pattern, p2.pattern)
&& Eq(stopWords, p2.stopWords);
             }
             return false;
         }
@@ -229,8 +278,8 @@ namespace Lucene.Net.Analysis.Miscellaneous
             }
 
             int h = 1;
-            h = 31 * h + pattern.pattern().GetHashCode();
-            h = 31 * h + pattern.flags();
+            h = 31 * h + pattern.ToString().GetHashCode();
+            h = 31 * h + (int)pattern.Options;
             h = 31 * h + (toLowerCase ? 1231 : 1237);
             h = 31 * h + (stopWords != null ? stopWords.GetHashCode() : 0);
             return h;
@@ -238,16 +287,16 @@ namespace Lucene.Net.Analysis.Miscellaneous
 
         /// <summary>
         /// equality where o1 and/or o2 can be null </summary>
-        private static bool eq(object o1, object o2)
+        private static bool Eq(object o1, object o2)
         {
             return (o1 == o2) || (o1 != null ? o1.Equals(o2) : false);
         }
 
         /// <summary>
         /// assumes p1 and p2 are not null </summary>
-        private static bool eqPattern(Pattern p1, Pattern p2)
+        private static bool EqPattern(Regex p1, Regex p2)
         {
-            return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().Equals(p2.pattern()));
+            return p1 == p2 || (p1.Options == p2.Options && p1.ToString().Equals(p2.ToString()));
         }
 
         /// <summary>
@@ -271,7 +320,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
 
                 len = 0;
                 int n;
-                while ((n = input.Read(buffer)) >= 0)
+                while ((n = input.Read(buffer, 0, buffer.Length)) > 0)
                 {
                     if (len + n > output.Length) // grow capacity
                     {
@@ -306,23 +355,23 @@ namespace Lucene.Net.Analysis.Miscellaneous
         /// </summary>
         private sealed class PatternTokenizer : Tokenizer
         {
-            private readonly Pattern pattern;
+            private readonly Regex pattern;
             private string str;
             private readonly bool toLowerCase;
-            private Matcher matcher;
+            private Match matcher;
             private int pos = 0;
             private bool initialized = false;
-            private static readonly Locale locale = Locale.Default;
+            private bool isReset = false; // Flag to keep track of the first match vs subsequent
matches
             private readonly ICharTermAttribute termAtt;
             private readonly IOffsetAttribute offsetAtt;
 
-            public PatternTokenizer(TextReader input, Pattern pattern, bool toLowerCase)
+            public PatternTokenizer(TextReader input, Regex pattern, bool toLowerCase)
                 : base(input)
             {
                 termAtt = AddAttribute<ICharTermAttribute>();
                 offsetAtt = AddAttribute<IOffsetAttribute>();
                 this.pattern = pattern;
-                this.matcher = pattern.matcher("");
+                this.matcher = pattern.Match("");
                 this.toLowerCase = toLowerCase;
             }
 
@@ -340,28 +389,33 @@ namespace Lucene.Net.Analysis.Miscellaneous
                 while (true) // loop takes care of leading and trailing boundary cases
                 {
                     int start = pos;
-                    int end_Renamed;
-                    bool isMatch = matcher.find();
+                    int end;
+                    if (!isReset)
+                    {
+                        matcher = matcher.NextMatch();
+                    }
+                    isReset = false;
+                    bool isMatch = matcher.Success;
                     if (isMatch)
                     {
-                        end_Renamed = matcher.start();
-                        pos = matcher.end();
+                        end = matcher.Index;
+                        pos = matcher.Index + matcher.Length;
                     }
                     else
                     {
-                        end_Renamed = str.Length;
+                        end = str.Length;
                         matcher = null; // we're finished
                     }
 
-                    if (start != end_Renamed) // non-empty match (header/trailer)
+                    if (start != end) // non-empty match (header/trailer)
                     {
-                        string text = str.Substring(start, end_Renamed - start);
+                        string text = str.Substring(start, end - start);
                         if (toLowerCase)
                         {
-                            text = text.ToLower(locale);
+                            text = text.ToLower();
                         }
                         termAtt.SetEmpty().Append(text);
-                        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end_Renamed));
+                        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
                         return true;
                     }
                     if (!isMatch)
@@ -389,7 +443,18 @@ namespace Lucene.Net.Analysis.Miscellaneous
             {
                 base.Reset();
                 this.str = PatternAnalyzer.ToString(input);
-                this.matcher = pattern.matcher(this.str);
+
+                // LUCENENET: Since we need to "reset" the Match
+                // object, we also need an "isReset" flag to indicate
+                // whether we are at the head of the match and to 
+                // take the appropriate measures to ensure we don't 
+                // overwrite our matcher variable with 
+                // matcher = matcher.NextMatch();
+                // before it is time. A string could potentially
+                // match on index 0, so we need another variable to
+                // manage this state.
+                this.matcher = pattern.Match(this.str);
+                this.isReset = true;
                 this.pos = 0;
                 this.initialized = true;
             }
@@ -410,7 +475,6 @@ namespace Lucene.Net.Analysis.Miscellaneous
             private readonly bool isLetter;
             private readonly bool toLowerCase;
             private readonly CharArraySet stopWords;
-            private static readonly Locale locale = Locale.Default;
             private readonly ICharTermAttribute termAtt;
             private readonly IOffsetAttribute offsetAtt;
 
@@ -444,7 +508,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
                 {
                     // find beginning of token
                     text = null;
-                    while (i < len && !isTokenChar(s[i], letter))
+                    while (i < len && !IsTokenChar(s[i], letter))
                     {
                         i++;
                     }
@@ -452,7 +516,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
                     if (i < len) // found beginning; now find end of token
                     {
                         start = i;
-                        while (i < len && isTokenChar(s[i], letter))
+                        while (i < len && IsTokenChar(s[i], letter))
                         {
                             i++;
                         }
@@ -460,7 +524,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
                         text = s.Substring(start, i - start);
                         if (toLowerCase)
                         {
-                            text = text.ToLower(locale);
+                            text = text.ToLower();
                         }
                         //          if (toLowerCase) {            
                         ////            use next line once JDK 1.5 String.toLowerCase() performance
regression is fixed
@@ -473,7 +537,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
                         //            text = s.substring(start, i);
                         //          }
                     }
-                } while (text != null && isStopWord(text));
+                } while (text != null && IsStopWord(text));
 
                 pos = i;
                 if (text == null)
@@ -493,12 +557,12 @@ namespace Lucene.Net.Analysis.Miscellaneous
                 this.offsetAtt.SetOffset(CorrectOffset(finalOffset), CorrectOffset(finalOffset));
             }
 
-            private bool isTokenChar(char c, bool isLetter)
+            private bool IsTokenChar(char c, bool isLetter)
             {
                 return isLetter ? char.IsLetter(c) : !char.IsWhiteSpace(c);
             }
 
-            private bool isStopWord(string text)
+            private bool IsStopWord(string text)
             {
                 return stopWords != null && stopWords.Contains(text);
             }
@@ -544,6 +608,5 @@ namespace Lucene.Net.Analysis.Miscellaneous
                 }
             }
         }
-
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7f877fdf/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
index 0679473..475338b 100644
--- a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
+++ b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
@@ -256,6 +256,7 @@
     <Compile Include="Analysis\Miscellaneous\LimitTokenPositionFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\LimitTokenPositionFilterFactory.cs" />
     <Compile Include="Analysis\Miscellaneous\Lucene47WordDelimiterFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\PatternAnalyzer.cs" />
     <Compile Include="Analysis\Miscellaneous\PatternKeywordMarkerFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\PerFieldAnalyzerWrapper.cs" />
     <Compile Include="Analysis\Miscellaneous\PrefixAndSuffixAwareTokenFilter.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7f877fdf/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs
b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs
index de1db38..85a9632 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/PatternAnalyzerTest.cs
@@ -1,11 +1,13 @@
-using System;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System.IO;
 using System.Text;
-using System.Threading;
+using System.Text.RegularExpressions;
 
-namespace org.apache.lucene.analysis.miscellaneous
+namespace Lucene.Net.Analysis.Miscellaneous
 {
-
-	/*
+    /*
 	 * Licensed to the Apache Software Foundation (ASF) under one or more
 	 * contributor license agreements.  See the NOTICE file distributed with
 	 * this work for additional information regarding copyright ownership.
@@ -22,193 +24,120 @@ namespace org.apache.lucene.analysis.miscellaneous
 	 * limitations under the License.
 	 */
 
-	using UncaughtExceptionHandler = Thread.UncaughtExceptionHandler;
-
-	using StopAnalyzer = org.apache.lucene.analysis.core.StopAnalyzer;
-
-	/// <summary>
-	/// Verifies the behavior of PatternAnalyzer.
-	/// </summary>
-	public class PatternAnalyzerTest : BaseTokenStreamTestCase
-	{
-
-	  /// <summary>
-	  /// Test PatternAnalyzer when it is configured with a non-word pattern.
-	  /// Behavior can be similar to SimpleAnalyzer (depending upon options)
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testNonWordPattern() throws java.io.IOException
-	  public virtual void testNonWordPattern()
-	  {
-		// Split on non-letter pattern, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
false, null);
-		check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"The", "quick",
"brown", "Fox", "the", "abcd", "dc"});
-
-		// split on non-letter pattern, lowercase, english stopwords
-		PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-		check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"quick", "brown",
"fox", "abcd", "dc"});
-	  }
-
-	  /// <summary>
-	  /// Test PatternAnalyzer when it is configured with a whitespace pattern.
-	  /// Behavior can be similar to WhitespaceAnalyzer (depending upon options)
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testWhitespacePattern() throws java.io.IOException
-	  public virtual void testWhitespacePattern()
-	  {
-		// Split on whitespace patterns, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
false, null);
-		check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"The", "quick",
"brown", "Fox,the", "abcd1234", "(56.78)", "dc."});
-
-		// Split on whitespace patterns, lowercase, english stopwords
-		PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-		check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] {"quick", "brown",
"fox,the", "abcd1234", "(56.78)", "dc."});
-	  }
-
-	  /// <summary>
-	  /// Test PatternAnalyzer when it is configured with a custom pattern. In this
-	  /// case, text is tokenized on the comma ","
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testCustomPattern() throws java.io.IOException
-	  public virtual void testCustomPattern()
-	  {
-		// Split on comma, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false,
null);
-		check(a, "Here,Are,some,Comma,separated,words,", new string[] {"Here", "Are", "some", "Comma",
"separated", "words"});
-
-		// split on comma, lowercase, english stopwords
-		PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true,
StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-		check(b, "Here,Are,some,Comma,separated,words,", new string[] {"here", "some", "comma",
"separated", "words"});
-	  }
-
-	  /// <summary>
-	  /// Test PatternAnalyzer against a large document.
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHugeDocument() throws java.io.IOException
-	  public virtual void testHugeDocument()
-	  {
-		StringBuilder document = new StringBuilder();
-		// 5000 a's
-		char[] largeWord = new char[5000];
-		Arrays.fill(largeWord, 'a');
-		document.Append(largeWord);
-
-		// a space
-		document.Append(' ');
-
-		// 2000 b's
-		char[] largeWord2 = new char[2000];
-		Arrays.fill(largeWord2, 'b');
-		document.Append(largeWord2);
-
-		// Split on whitespace patterns, do not lowercase, no stopwords
-		PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
false, null);
-		check(a, document.ToString(), new string[]
-		{
-			new string(largeWord),
-			new string(largeWord2)
-		});
-	  }
-
-	  /// <summary>
-	  /// Verify the analyzer analyzes to the expected contents. For PatternAnalyzer,
-	  /// several methods are verified:
-	  /// <ul>
-	  /// <li>Analysis with a normal Reader
-	  /// <li>Analysis with a FastStringReader
-	  /// <li>Analysis with a String
-	  /// </ul>
-	  /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void check(PatternAnalyzer analyzer, String document, String expected[])
throws java.io.IOException
-	  private void check(PatternAnalyzer analyzer, string document, string[] expected)
-	  {
-		// ordinary analysis of a Reader
-		assertAnalyzesTo(analyzer, document, expected);
-
-		// analysis with a "FastStringReader"
-		TokenStream ts = analyzer.tokenStream("dummy", new PatternAnalyzer.FastStringReader(document));
-		assertTokenStreamContents(ts, expected);
-
-		// analysis of a String, uses PatternAnalyzer.tokenStream(String, String)
-		TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
-		assertTokenStreamContents(ts2, expected);
-	  }
-
-	  /// <summary>
-	  /// blast some random strings through the analyzer </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testRandomStrings() throws Exception
-	  public virtual void testRandomStrings()
-	  {
-		Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
-
-		// dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final Thread.UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
-		UncaughtExceptionHandler savedHandler = Thread.DefaultUncaughtExceptionHandler;
-		Thread.DefaultUncaughtExceptionHandler = new UncaughtExceptionHandlerAnonymousInnerClassHelper(this, savedHandler);
-
-		try
-		{
-		  Thread.DefaultUncaughtExceptionHandler;
-		  checkRandomData(random(), a, 10000 * RANDOM_MULTIPLIER);
-		}
-		catch (System.IndexOutOfRangeException ex)
-		{
-		  assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
-		  throw ex; // otherwise rethrow
-		}
-		finally
-		{
-		  Thread.DefaultUncaughtExceptionHandler = savedHandler;
-		}
-	  }
-
-	  private class UncaughtExceptionHandlerAnonymousInnerClassHelper : UncaughtExceptionHandler
-	  {
-		  private readonly PatternAnalyzerTest outerInstance;
-
-		  private UncaughtExceptionHandler savedHandler;
-
-		  public UncaughtExceptionHandlerAnonymousInnerClassHelper(PatternAnalyzerTest outerInstance, UncaughtExceptionHandler savedHandler)
-		  {
-			  this.outerInstance = outerInstance;
-			  this.savedHandler = savedHandler;
-		  }
-
-		  public override void uncaughtException(Thread thread, Exception throwable)
-		  {
-			assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
-			// otherwise its some other bug, pass to default handler
-			savedHandler.uncaughtException(thread, throwable);
-		  }
-	  }
-
-	  internal static bool isJREBug7104012(Exception t)
-	  {
-		if (!(t is System.IndexOutOfRangeException))
-		{
-		  // BaseTokenStreamTestCase now wraps exc in a new RuntimeException:
-		  t = t.InnerException;
-		  if (!(t is System.IndexOutOfRangeException))
-		  {
-			return false;
-		  }
-		}
-		StackTraceElement[] trace = t.StackTrace;
-		foreach (StackTraceElement st in trace)
-		{
-		  if ("java.text.RuleBasedBreakIterator".Equals(st.ClassName) || "sun.util.locale.provider.RuleBasedBreakIterator".Equals(st.ClassName) && "lookupBackwardState".Equals(st.MethodName))
-		  {
-			return true;
-		  }
-		}
-		return false;
-	  }
-	}
-
+    /// <summary>
+    /// Verifies the behavior of PatternAnalyzer.
+    /// </summary>
+    public class PatternAnalyzerTest : BaseTokenStreamTestCase
+    {
+
+        /// <summary>
+        /// Test PatternAnalyzer when it is configured with a non-word pattern.
+        /// Behavior can be similar to SimpleAnalyzer (depending upon options)
+        /// </summary>
+        [Test]
+        public virtual void TestNonWordPattern()
+        {
+            // Split on non-letter pattern, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, false, null);
+            Check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "The", "quick", "brown", "Fox", "the", "abcd", "dc" });
+
+            // split on non-letter pattern, lowercase, english stopwords
+            PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+            Check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "quick", "brown", "fox", "abcd", "dc" });
+        }
+
+        /// <summary>
+        /// Test PatternAnalyzer when it is configured with a whitespace pattern.
+        /// Behavior can be similar to WhitespaceAnalyzer (depending upon options)
+        /// </summary>
+        [Test]
+        public virtual void TestWhitespacePattern()
+        {
+            // Split on whitespace patterns, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
+            Check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });
+
+            // Split on whitespace patterns, lowercase, english stopwords
+            PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+            Check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new string[] { "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
+        }
+
+        /// <summary>
+        /// Test PatternAnalyzer when it is configured with a custom pattern. In this
+        /// case, text is tokenized on the comma ","
+        /// </summary>
+        [Test]
+        public virtual void TestCustomPattern()
+        {
+            // Split on comma, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, new Regex(",", RegexOptions.Compiled), false, null);
+            Check(a, "Here,Are,some,Comma,separated,words,", new string[] { "Here", "Are", "some", "Comma", "separated", "words" });
+
+            // split on comma, lowercase, english stopwords
+            PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, new Regex(",", RegexOptions.Compiled), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+            Check(b, "Here,Are,some,Comma,separated,words,", new string[] { "here", "some", "comma", "separated", "words" });
+        }
+
+        /// <summary>
+        /// Test PatternAnalyzer against a large document.
+        /// </summary>
+        [Test]
+        public virtual void TestHugeDocument()
+        {
+            StringBuilder document = new StringBuilder();
+            // 5000 a's
+            char[] largeWord = new char[5000];
+            Arrays.Fill(largeWord, 'a');
+            document.Append(largeWord);
+
+            // a space
+            document.Append(' ');
+
+            // 2000 b's
+            char[] largeWord2 = new char[2000];
+            Arrays.Fill(largeWord2, 'b');
+            document.Append(largeWord2);
+
+            // Split on whitespace patterns, do not lowercase, no stopwords
+            PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
+            Check(a, document.ToString(), new string[]
+            {
+            new string(largeWord),
+            new string(largeWord2)
+            });
+        }
+
+        /// <summary>
+        /// Verify the analyzer analyzes to the expected contents. For PatternAnalyzer,
+        /// several methods are verified:
+        /// <ul>
+        /// <li>Analysis with a normal Reader
+        /// <li>Analysis with a FastStringReader
+        /// <li>Analysis with a String
+        /// </ul>
+        /// </summary>
+        private void Check(PatternAnalyzer analyzer, string document, string[] expected)
+        {
+            // ordinary analysis of a Reader
+            AssertAnalyzesTo(analyzer, document, expected);
+
+            // analysis with a "FastStringReader"
+            TokenStream ts = analyzer.TokenStream("dummy", new PatternAnalyzer.FastStringReader(document));
+            AssertTokenStreamContents(ts, expected);
+
+            // analysis of a String, uses PatternAnalyzer.tokenStream(String, String)
+            TokenStream ts2 = analyzer.TokenStream("dummy", new StringReader(document));
+            AssertTokenStreamContents(ts2, expected);
+        }
+
+        /// <summary>
+        /// blast some random strings through the analyzer </summary>
+        [Test]
+        public virtual void TestRandomStrings()
+        {
+            Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, new Regex(",", RegexOptions.Compiled), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+
+            CheckRandomData(Random(), a, 10000 * RANDOM_MULTIPLIER);
+        }
+    }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7f877fdf/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
index 6d6c668..029a40f 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
+++ b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
@@ -194,6 +194,7 @@
     <Compile Include="Analysis\Lv\TestLatvianAnalyzer.cs" />
     <Compile Include="Analysis\Lv\TestLatvianStemFilterFactory.cs" />
     <Compile Include="Analysis\Lv\TestLatvianStemmer.cs" />
+    <Compile Include="Analysis\Miscellaneous\PatternAnalyzerTest.cs" />
     <Compile Include="Analysis\Miscellaneous\TestASCIIFoldingFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\TestCapitalizationFilter.cs" />
     <Compile Include="Analysis\Miscellaneous\TestCapitalizationFilterFactory.cs" />


Mime
View raw message