lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ccurr...@apache.org
Subject [30/51] [partial] Mass convert mixed tabs to spaces
Date Wed, 03 Apr 2013 17:40:13 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/SpellChecker/Spell/SuggestWord.cs
----------------------------------------------------------------------
diff --git a/src/contrib/SpellChecker/Spell/SuggestWord.cs b/src/contrib/SpellChecker/Spell/SuggestWord.cs
index c8bec15..54840b2 100644
--- a/src/contrib/SpellChecker/Spell/SuggestWord.cs
+++ b/src/contrib/SpellChecker/Spell/SuggestWord.cs
@@ -19,7 +19,7 @@ using System;
 
 namespace SpellChecker.Net.Search.Spell
 {
-	
+    
     /// <summary>  SuggestWord Class, used in suggestSimilar method in SpellChecker class.
     /// 
     /// </summary>
@@ -29,13 +29,13 @@ namespace SpellChecker.Net.Search.Spell
     {
         /// <summary> the score of the word</summary>
         public float score;
-		
+        
         /// <summary> The freq of the word</summary>
         public int freq;
-		
+        
         /// <summary> the suggested word</summary>
         public System.String termString;
-		
+        
         public int CompareTo(SuggestWord a)
         {
             //first criteria: the edit distance
@@ -47,18 +47,18 @@ namespace SpellChecker.Net.Search.Spell
             {
                 return - 1;
             }
-			
+            
             //second criteria (if first criteria is equal): the popularity
             if (freq > a.freq)
             {
                 return 1;
             }
-			
+            
             if (freq < a.freq)
             {
                 return - 1;
             }
-			
+            
             return 0;
         }
     }

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
----------------------------------------------------------------------
diff --git a/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs b/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
index de4dc09..7ae17ec 100644
--- a/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
+++ b/src/contrib/SpellChecker/Spell/SuggestWordQueue.cs
@@ -22,12 +22,12 @@ namespace SpellChecker.Net.Search.Spell
 
     sealed class SuggestWordQueue : PriorityQueue
     {
-		
+        
         internal SuggestWordQueue(int size)
         {
             Initialize(size);
         }
-		
+        
         override public bool LessThan(SuggestWord a, SuggestWord b)
         {
             var val = a.CompareTo(b);

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/SpellChecker/Spell/TRStringDistance.cs
----------------------------------------------------------------------
diff --git a/src/contrib/SpellChecker/Spell/TRStringDistance.cs b/src/contrib/SpellChecker/Spell/TRStringDistance.cs
index f797f59..79b2314 100644
--- a/src/contrib/SpellChecker/Spell/TRStringDistance.cs
+++ b/src/contrib/SpellChecker/Spell/TRStringDistance.cs
@@ -18,16 +18,16 @@
 
 namespace SpellChecker.Net.Search.Spell
 {
-	
+    
     /// <summary> Edit distance  class</summary>
     public class TRStringDistance
     {
-		
+        
         internal char[] sa;
         internal int n;
         internal int[][][] cache = new int[30][][];
-		
-		
+        
+        
         /// <summary> Optimized to run a bit faster than the static getDistance().
         /// In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
         /// </summary>
@@ -36,8 +36,8 @@ namespace SpellChecker.Net.Search.Spell
             sa = target.ToCharArray();
             n = sa.Length;
         }
-		
-		
+        
+        
         //***************************
         // Compute Levenshtein distance
         //***************************
@@ -56,7 +56,7 @@ namespace SpellChecker.Net.Search.Spell
             {
                 return n;
             }
-			
+            
             if (m >= cache.Length)
             {
                 d = Form(n, m);
@@ -68,31 +68,31 @@ namespace SpellChecker.Net.Search.Spell
             else
             {
                 d = cache[m] = Form(n, m);
-				
+                
                 // Step 3
             }
             for (int i = 1; i <= n; i++)
             {
                 char s_i = sa[i - 1];
-				
+                
                 // Step 4
-				
+                
                 for (int j = 1; j <= m; j++)
                 {
                     char t_j = ta[j - 1];
-					
+                    
                     // Step 5
 
                     int cost = s_i == t_j ? 0 : 1;
                     d[i][j] = Min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
                 }
             }
-			
+            
             // Step 7
             return d[n][m];
         }
-		
-		
+        
+        
         /// <summary> </summary>
         private static int[][] Form(int n, int m)
         {
@@ -102,7 +102,7 @@ namespace SpellChecker.Net.Search.Spell
                 d[i] = new int[m + 1];
             }
             // Step 2
-			
+            
             for (int i = 0; i <= n; i++)
             {
                 d[i][0] = i;
@@ -113,8 +113,8 @@ namespace SpellChecker.Net.Search.Spell
             }
             return d;
         }
-		
-		
+        
+        
         //**************************
         // Get minimum of three values
         //**************************

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/WordNet/SynExpand/SynExpand.cs
----------------------------------------------------------------------
diff --git a/src/contrib/WordNet/SynExpand/SynExpand.cs b/src/contrib/WordNet/SynExpand/SynExpand.cs
index a830f6f..79498c0 100644
--- a/src/contrib/WordNet/SynExpand/SynExpand.cs
+++ b/src/contrib/WordNet/SynExpand/SynExpand.cs
@@ -87,48 +87,48 @@ namespace WorldNet.Net
             Analyzer a,
             String field,
             float boost)
-		{
-			already = new List<String>(); // avoid dups 
-			var top = new List<String>(); // needs to be separately listed..
-			if (field == null)
-				field = "contents";
-			
+        {
+            already = new List<String>(); // avoid dups 
+            var top = new List<String>(); // needs to be separately listed..
+            if (field == null)
+                field = "contents";
+            
             if (a == null)
-				a = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
-			
-			// [1] Parse query into separate words so that when we expand we can avoid dups
-			var ts = a.TokenStream(field, new StringReader(query));
+                a = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
+            
+            // [1] Parse query into separate words so that when we expand we can avoid dups
+            var ts = a.TokenStream(field, new StringReader(query));
             var termAtt = ts.AddAttribute<TermAttribute>();
-		    
+            
             while (ts.IncrementToken())
-			{
-				var word = termAtt.Term;
-				
+            {
+                var word = termAtt.Term;
+                
                 if (!already.Contains(word))
-				{
-					already.Add(word);
-					top.Add(word);
-				}
-			}
-
-			tmp = new BooleanQuery();
-			
-			// [2] form query
-			System.Collections.IEnumerator it = top.GetEnumerator();
-			while (it.MoveNext())
-			{
-				// [2a] add to level words in
-				var word = (String) it.Current;
-				var tq = new TermQuery(new Term(field, word));
-				tmp.Add(tq, Occur.SHOULD);
-
-			    var c = new CollectorImpl(field, boost);
+                {
+                    already.Add(word);
+                    top.Add(word);
+                }
+            }
+
+            tmp = new BooleanQuery();
+            
+            // [2] form query
+            System.Collections.IEnumerator it = top.GetEnumerator();
+            while (it.MoveNext())
+            {
+                // [2a] add to level words in
+                var word = (String) it.Current;
+                var tq = new TermQuery(new Term(field, word));
+                tmp.Add(tq, Occur.SHOULD);
+
+                var c = new CollectorImpl(field, boost);
                 syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
-			}
-			
-			return tmp;
-		}
-	
+            }
+            
+            return tmp;
+        }
+    
 
         /// <summary>
         /// From project WordNet.Net.Syns2Index

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/WordNet/SynLookup/SynLookup.cs
----------------------------------------------------------------------
diff --git a/src/contrib/WordNet/SynLookup/SynLookup.cs b/src/contrib/WordNet/SynLookup/SynLookup.cs
index 62c436d..024dcc9 100644
--- a/src/contrib/WordNet/SynLookup/SynLookup.cs
+++ b/src/contrib/WordNet/SynLookup/SynLookup.cs
@@ -27,100 +27,100 @@ using Lucene.Net.Store;
 
 namespace WorldNet.Net
 {
-	/// <summary> Test program to look up synonyms.</summary>
-	public class SynLookup
-	{
-		static List<String> already;
-		private static BooleanQuery tmp;
-
-		[STAThread]
-		public static void  Main(System.String[] args)
-		{
-			if (args.Length != 2)
-			{
-				System.Console.Out.WriteLine(typeof(SynLookup) + " <index path> <word>");
-				return;
-			}
-			
-			using (var directory = FSDirectory.Open(new DirectoryInfo(args[0])))
-			{
-				using (var searcher = new IndexSearcher(directory, true))
-				{
-
-					String word = args[1];
-					Query query = new TermQuery(new Term(Syns2Index.F_WORD, word));
-					var countingCollector = new CountingCollector();
-					searcher.Search(query, countingCollector);
-
-					if (countingCollector.numHits == 0)
-					{
-						Console.Out.WriteLine("No synonyms found for " + word);
-					}
-					else
-					{
-						Console.Out.WriteLine("Synonyms found for \"" + word + "\":");
-					}
-
-					var hits = searcher.Search(query, countingCollector.numHits).ScoreDocs;
-
-					foreach (var v in
-						hits.Select(t => searcher.Doc(t.Doc)).Select(doc => doc.GetValues(Syns2Index.F_SYN)).SelectMany(values => values))
-					{
-						Console.Out.WriteLine(v);
-					}
-
-				}
-			}
-		}
-		
-		/// <summary> 
-		/// Perform synonym expansion on a query.
-		/// </summary>
-		/// <param name="query">query</param>
-		/// <param name="syns">syns</param>
-		/// <param name="a">a</param>
-		/// <param name="field">field</param>
-		/// <param name="boost">boost</param>
-		public static Query Expand(String query, 
-			Searcher syns, 
-			Analyzer a, 
-			String field, 
-			float boost)
-		{
-			already = new List<String>(); // avoid dups		
-			var top = new List<String>(); // needs to be separately listed..
-
-			var ts = a.TokenStream(field, new StringReader(query));
-			var termAtt = ts.AddAttribute<TermAttribute>();
-
-			while (ts.IncrementToken())
-			{
-				var word = termAtt.Term;
-
-				if (!already.Contains(word))
-				{
-					already.Add(word);
-					top.Add(word);
-				}
-			}
-
-			tmp = new BooleanQuery();
-
-			// [2] form query
-			System.Collections.IEnumerator it = top.GetEnumerator();
-			while (it.MoveNext())
-			{
-				// [2a] add to level words in
-				var word = (String)it.Current;
-				var tq = new TermQuery(new Term(field, word));
-				tmp.Add(tq, Occur.SHOULD);
-
-				var c = new CollectorImpl(field, boost);
-				syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
-			}
-
-			return tmp;
-		}
+    /// <summary> Test program to look up synonyms.</summary>
+    public class SynLookup
+    {
+        static List<String> already;
+        private static BooleanQuery tmp;
+
+        [STAThread]
+        public static void  Main(System.String[] args)
+        {
+            if (args.Length != 2)
+            {
+                System.Console.Out.WriteLine(typeof(SynLookup) + " <index path> <word>");
+                return;
+            }
+            
+            using (var directory = FSDirectory.Open(new DirectoryInfo(args[0])))
+            {
+                using (var searcher = new IndexSearcher(directory, true))
+                {
+
+                    String word = args[1];
+                    Query query = new TermQuery(new Term(Syns2Index.F_WORD, word));
+                    var countingCollector = new CountingCollector();
+                    searcher.Search(query, countingCollector);
+
+                    if (countingCollector.numHits == 0)
+                    {
+                        Console.Out.WriteLine("No synonyms found for " + word);
+                    }
+                    else
+                    {
+                        Console.Out.WriteLine("Synonyms found for \"" + word + "\":");
+                    }
+
+                    var hits = searcher.Search(query, countingCollector.numHits).ScoreDocs;
+
+                    foreach (var v in
+                        hits.Select(t => searcher.Doc(t.Doc)).Select(doc => doc.GetValues(Syns2Index.F_SYN)).SelectMany(values => values))
+                    {
+                        Console.Out.WriteLine(v);
+                    }
+
+                }
+            }
+        }
+        
+        /// <summary> 
+        /// Perform synonym expansion on a query.
+        /// </summary>
+        /// <param name="query">query</param>
+        /// <param name="syns">syns</param>
+        /// <param name="a">a</param>
+        /// <param name="field">field</param>
+        /// <param name="boost">boost</param>
+        public static Query Expand(String query, 
+            Searcher syns, 
+            Analyzer a, 
+            String field, 
+            float boost)
+        {
+            already = new List<String>(); // avoid dups        
+            var top = new List<String>(); // needs to be separately listed..
+
+            var ts = a.TokenStream(field, new StringReader(query));
+            var termAtt = ts.AddAttribute<TermAttribute>();
+
+            while (ts.IncrementToken())
+            {
+                var word = termAtt.Term;
+
+                if (!already.Contains(word))
+                {
+                    already.Add(word);
+                    top.Add(word);
+                }
+            }
+
+            tmp = new BooleanQuery();
+
+            // [2] form query
+            System.Collections.IEnumerator it = top.GetEnumerator();
+            while (it.MoveNext())
+            {
+                // [2a] add to level words in
+                var word = (String)it.Current;
+                var tq = new TermQuery(new Term(field, word));
+                tmp.Add(tq, Occur.SHOULD);
+
+                var c = new CollectorImpl(field, boost);
+                syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
+            }
+
+            return tmp;
+        }
 
         internal sealed class CountingCollector : Collector
         {

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/contrib/WordNet/Syns2Index/Syns2Index.cs
----------------------------------------------------------------------
diff --git a/src/contrib/WordNet/Syns2Index/Syns2Index.cs b/src/contrib/WordNet/Syns2Index/Syns2Index.cs
index ac5bea6..da96a8a 100644
--- a/src/contrib/WordNet/Syns2Index/Syns2Index.cs
+++ b/src/contrib/WordNet/Syns2Index/Syns2Index.cs
@@ -29,264 +29,264 @@ using IndexWriter = Lucene.Net.Index.IndexWriter;
 
 namespace WorldNet.Net
 {
-	
-	/// <summary> Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
-	/// into a Lucene index suitable for looking up synonyms and performing query expansion (<see cref="SynExpand.Expand"/>).
-	/// 
-	/// This has been tested with WordNet 2.0.
-	/// 
-	/// The index has fields named "word" (<see cref="F_WORD"/>)
-	/// and "syn" (<see cref="F_SYN"/>).
-	/// <p>
-	/// The source word (such as 'big') can be looked up in the
-	/// "word" field, and if present there will be fields named "syn"
-	/// for every synonym. What's tricky here is that there could be <b>multiple</b>
-	/// fields with the same name, in the general case for words that have multiple synonyms.
-	/// That's not a problem with Lucene, you just use <see cref="Document.GetValues"/>
-	/// </p>
-	/// <p>
-	/// While the WordNet file distinguishes groups of synonyms with
-	/// related meanings we don't do that here.
-	/// </p>
-	/// This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
-	/// </summary>
-	/// 
-	/// <seealso cref="http://www.cogsci.princeton.edu/~wn/"></seealso>
-	/// <seealso cref="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html"></seealso>
-	/// <seealso cref="http://www.hostmon.com/rfc/advanced.jsp"> </seealso>
-	public class Syns2Index
-	{
-		/// <summary> </summary>
-		private static readonly System.IO.StreamWriter o;
-		
-		/// <summary> </summary>
-		private static readonly System.IO.StreamWriter err;
-		
-		/// <summary> </summary>
-		public const System.String F_SYN = "syn";
-		
-		/// <summary> </summary>
-		public const System.String F_WORD = "word";
-		
-		/// <summary> </summary>
-		private static readonly Analyzer ana = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
-		
-		/// <summary> 
-		/// Takes arg of prolog file name and index directory.
-		/// </summary>
-		[STAThread]
-		public static void  Main(System.String[] args)
-		{
-			// get command line arguments
-			String prologFilename = null; // name of file "wn_s.pl"
-			String indexDir = null;
-			if (args.Length == 2)
-			{
-				prologFilename = args[0];
-				indexDir = args[1];
-			}
-			else
-			{
-				Usage();
-				Environment.Exit(1);
-			}
-			
-			// ensure that the prolog file is readable
-			if (!(new FileInfo(prologFilename)).Exists)
-			{
-				err.WriteLine("Error: cannot read Prolog file: " + prologFilename);
-				Environment.Exit(1);
-			}
-			// exit if the target index directory already exists
-			if (Directory.Exists((new FileInfo(indexDir)).FullName))
-			{
-				err.WriteLine("Error: index directory already exists: " + indexDir);
-				err.WriteLine("Please specify a name of a non-existent directory");
-				Environment.Exit(1);
-			}
-			
-			o.WriteLine("Opening Prolog file " + prologFilename);
-			var fis = new FileStream(prologFilename, FileMode.Open, FileAccess.Read);
-			var br = new StreamReader(new StreamReader(fis, System.Text.Encoding.Default).BaseStream, new StreamReader(fis, System.Text.Encoding.Default).CurrentEncoding);
-			String line;
-			
-			// maps a word to all the "groups" it's in
-			System.Collections.IDictionary word2Nums = new System.Collections.SortedList();
-			// maps a group to all the words in it
-			System.Collections.IDictionary num2Words = new System.Collections.SortedList();
-			// number of rejected words
-			var ndecent = 0;
-			
-			// status output
-			var mod = 1;
-			var row = 1;
-			// parse prolog file
-			o.WriteLine("[1/2] Parsing " + prologFilename);
-			while ((line = br.ReadLine()) != null)
-			{
-				// occasional progress
-				if ((++row) % mod == 0) // periodically print out line we read in
-				{
-					mod *= 2;
-					o.WriteLine("\t" + row + " " + line + " " + word2Nums.Count + " " + num2Words.Count + " ndecent=" + ndecent);
-				}
-				
-				// syntax check
-				if (!line.StartsWith("s("))
-				{
-					err.WriteLine("OUCH: " + line);
-					Environment.Exit(1);
-				}
-				
-				// parse line
-				line = line.Substring(2);
-				var comma = line.IndexOf(',');
-				var num = line.Substring(0, comma);
-				var q1 = line.IndexOf('\'');
-				line = line.Substring(q1 + 1);
-				var q2 = line.IndexOf('\'');
-				var word = line.Substring(0, q2).ToLower().Replace("''", "'");
-				
-				// make sure is a normal word
-				if (!IsDecent(word))
-				{
-					ndecent++;
-					continue; // don't store words w/ spaces
-				}
-				
-				// 1/2: word2Nums map
-				// append to entry or add new one
-				var lis = (System.Collections.IList) word2Nums[word];
-				if (lis == null)
-				{
-					lis = new List<String> {num};
-					word2Nums[word] = lis;
-				}
-				else
-					lis.Add(num);
-				
-				// 2/2: num2Words map
-				lis = (System.Collections.IList) num2Words[num];
-				if (lis == null)
-				{
-					lis = new List<String> { word };
-					num2Words[num] = lis;
-				}
-				else
-					lis.Add(word);
-			}
-			
-			// close the streams
-			fis.Close();
-			br.Close();
-			
-			// create the index
-			o.WriteLine("[2/2] Building index to store synonyms, " + " map sizes are " + word2Nums.Count + " and " + num2Words.Count);
-			Index(indexDir, word2Nums, num2Words);
-		}
-		
-		/// <summary> 
-		/// Checks to see if a word contains only alphabetic characters by
-		/// checking it one character at a time.
-		/// </summary>
-		/// <param name="s">string to check </param>
-		/// <returns> <c>true</c> if the string is decent</returns>
-		private static bool IsDecent(String s)
-		{
-			var len = s.Length;
-			for (var i = 0; i < len; i++)
-			{
-				if (!Char.IsLetter(s[i]))
-				{
-					return false;
-				}
-			}
-			return true;
-		}
-		
-		/// <summary> 
-		/// Forms a Lucene index based on the 2 maps.
-		/// </summary>
-		/// <param name="indexDir">the direcotry where the index should be created</param>
-		/// <param name="word2Nums">word2Nums</param>
-		/// <param name="num2Words">num2Words</param>
-		private static void  Index(String indexDir, System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words)
-		{
-			var row = 0;
-			var mod = 1;
-			
-			using (var dir = FSDirectory.Open(new DirectoryInfo(indexDir)))
-			{
-				var writer = new IndexWriter(dir, ana, true, IndexWriter.MaxFieldLength.LIMITED);
-				writer.UseCompoundFile = true; // why?
+    
+    /// <summary> Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
+    /// into a Lucene index suitable for looking up synonyms and performing query expansion (<see cref="SynExpand.Expand"/>).
+    /// 
+    /// This has been tested with WordNet 2.0.
+    /// 
+    /// The index has fields named "word" (<see cref="F_WORD"/>)
+    /// and "syn" (<see cref="F_SYN"/>).
+    /// <p>
+    /// The source word (such as 'big') can be looked up in the
+    /// "word" field, and if present there will be fields named "syn"
+    /// for every synonym. What's tricky here is that there could be <b>multiple</b>
+    /// fields with the same name, in the general case for words that have multiple synonyms.
+    /// That's not a problem with Lucene, you just use <see cref="Document.GetValues"/>
+    /// </p>
+    /// <p>
+    /// While the WordNet file distinguishes groups of synonyms with
+    /// related meanings we don't do that here.
+    /// </p>
+    /// This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
+    /// </summary>
+    /// 
+    /// <seealso cref="http://www.cogsci.princeton.edu/~wn/"></seealso>
+    /// <seealso cref="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html"></seealso>
+    /// <seealso cref="http://www.hostmon.com/rfc/advanced.jsp"> </seealso>
+    public class Syns2Index
+    {
+        /// <summary> </summary>
+        private static readonly System.IO.StreamWriter o;
+        
+        /// <summary> </summary>
+        private static readonly System.IO.StreamWriter err;
+        
+        /// <summary> </summary>
+        public const System.String F_SYN = "syn";
+        
+        /// <summary> </summary>
+        public const System.String F_WORD = "word";
+        
+        /// <summary> </summary>
+        private static readonly Analyzer ana = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
+        
+        /// <summary> 
+        /// Takes arg of prolog file name and index directory.
+        /// </summary>
+        [STAThread]
+        public static void  Main(System.String[] args)
+        {
+            // get command line arguments
+            String prologFilename = null; // name of file "wn_s.pl"
+            String indexDir = null;
+            if (args.Length == 2)
+            {
+                prologFilename = args[0];
+                indexDir = args[1];
+            }
+            else
+            {
+                Usage();
+                Environment.Exit(1);
+            }
+            
+            // ensure that the prolog file is readable
+            if (!(new FileInfo(prologFilename)).Exists)
+            {
+                err.WriteLine("Error: cannot read Prolog file: " + prologFilename);
+                Environment.Exit(1);
+            }
+            // exit if the target index directory already exists
+            if (Directory.Exists((new FileInfo(indexDir)).FullName))
+            {
+                err.WriteLine("Error: index directory already exists: " + indexDir);
+                err.WriteLine("Please specify a name of a non-existent directory");
+                Environment.Exit(1);
+            }
+            
+            o.WriteLine("Opening Prolog file " + prologFilename);
+            var fis = new FileStream(prologFilename, FileMode.Open, FileAccess.Read);
+            var br = new StreamReader(new StreamReader(fis, System.Text.Encoding.Default).BaseStream, new StreamReader(fis, System.Text.Encoding.Default).CurrentEncoding);
+            String line;
+            
+            // maps a word to all the "groups" it's in
+            System.Collections.IDictionary word2Nums = new System.Collections.SortedList();
+            // maps a group to all the words in it
+            System.Collections.IDictionary num2Words = new System.Collections.SortedList();
+            // number of rejected words
+            var ndecent = 0;
+            
+            // status output
+            var mod = 1;
+            var row = 1;
+            // parse prolog file
+            o.WriteLine("[1/2] Parsing " + prologFilename);
+            while ((line = br.ReadLine()) != null)
+            {
+                // occasional progress
+                if ((++row) % mod == 0) // periodically print out line we read in
+                {
+                    mod *= 2;
+                    o.WriteLine("\t" + row + " " + line + " " + word2Nums.Count + " " + num2Words.Count + " ndecent=" + ndecent);
+                }
+                
+                // syntax check
+                if (!line.StartsWith("s("))
+                {
+                    err.WriteLine("OUCH: " + line);
+                    Environment.Exit(1);
+                }
+                
+                // parse line
+                line = line.Substring(2);
+                var comma = line.IndexOf(',');
+                var num = line.Substring(0, comma);
+                var q1 = line.IndexOf('\'');
+                line = line.Substring(q1 + 1);
+                var q2 = line.IndexOf('\'');
+                var word = line.Substring(0, q2).ToLower().Replace("''", "'");
+                
+                // make sure is a normal word
+                if (!IsDecent(word))
+                {
+                    ndecent++;
+                    continue; // don't store words w/ spaces
+                }
+                
+                // 1/2: word2Nums map
+                // append to entry or add new one
+                var lis = (System.Collections.IList) word2Nums[word];
+                if (lis == null)
+                {
+                    lis = new List<String> {num};
+                    word2Nums[word] = lis;
+                }
+                else
+                    lis.Add(num);
+                
+                // 2/2: num2Words map
+                lis = (System.Collections.IList) num2Words[num];
+                if (lis == null)
+                {
+                    lis = new List<String> { word };
+                    num2Words[num] = lis;
+                }
+                else
+                    lis.Add(word);
+            }
+            
+            // close the streams
+            fis.Close();
+            br.Close();
+            
+            // create the index
+            o.WriteLine("[2/2] Building index to store synonyms, " + " map sizes are " + word2Nums.Count + " and " + num2Words.Count);
+            Index(indexDir, word2Nums, num2Words);
+        }
+        
+        /// <summary> 
+        /// Checks to see if a word contains only alphabetic characters by
+        /// checking it one character at a time.
+        /// </summary>
+        /// <param name="s">string to check </param>
+        /// <returns> <c>true</c> if the string is decent</returns>
+        private static bool IsDecent(String s)
+        {
+            var len = s.Length;
+            for (var i = 0; i < len; i++)
+            {
+                if (!Char.IsLetter(s[i]))
+                {
+                    return false;
+                }
+            }
+            return true;
+        }
+        
+        /// <summary> 
+        /// Forms a Lucene index based on the 2 maps.
+        /// </summary>
+        /// <param name="indexDir">the direcotry where the index should be created</param>
+        /// <param name="word2Nums">word2Nums</param>
+        /// <param name="num2Words">num2Words</param>
+        private static void  Index(String indexDir, System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words)
+        {
+            var row = 0;
+            var mod = 1;
+            
+            using (var dir = FSDirectory.Open(new DirectoryInfo(indexDir)))
+            {
+                var writer = new IndexWriter(dir, ana, true, IndexWriter.MaxFieldLength.LIMITED);
+                writer.UseCompoundFile = true; // why?
 
-				var i1 = word2Nums.Keys.GetEnumerator();
-				while (i1.MoveNext())
-				{
-					var g = (String)i1.Current;
-					var doc = new Document();
+                var i1 = word2Nums.Keys.GetEnumerator();
+                while (i1.MoveNext())
+                {
+                    var g = (String)i1.Current;
+                    var doc = new Document();
 
-					var n = Index(word2Nums, num2Words, g, doc);
-					if (n > 0)
-					{
-						doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
-						if ((++row % mod) == 0)
-						{
-							o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc);
-							mod *= 2;
-						}
-						writer.AddDocument(doc);
-					}
-				}
-				o.WriteLine("Optimizing..");
-				writer.Optimize();
-				writer.Close();
-			}
-			
-		}
+                    var n = Index(word2Nums, num2Words, g, doc);
+                    if (n > 0)
+                    {
+                        doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED));
+                        if ((++row % mod) == 0)
+                        {
+                            o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc);
+                            mod *= 2;
+                        }
+                        writer.AddDocument(doc);
+                    }
+                }
+                o.WriteLine("Optimizing..");
+                writer.Optimize();
+                writer.Close();
+            }
+            
+        }
 
-		/// <summary> 
-		/// Given the 2 maps fills a document for 1 word.
-		/// </summary>
-		private static int Index(System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words, System.String g, Document doc)
-		{
-			var keys = (System.Collections.IList) word2Nums[g]; // get list of key#'s
-			var i2 = keys.GetEnumerator();
-			
-			var already = new System.Collections.SortedList(); // keep them sorted
-			
-			// pass 1: fill up 'already' with all words
-			while (i2.MoveNext()) // for each key#
-			{
-				foreach (var item in
-					((System.Collections.IList) num2Words[i2.Current]).Cast<object>().Where(item => already.Contains(item) == false))
-				{
-					already.Add(item, item);
-				}
-			}
+        /// <summary> 
+        /// Given the 2 maps fills a document for 1 word.
+        /// </summary>
+        private static int Index(System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words, System.String g, Document doc)
+        {
+            var keys = (System.Collections.IList) word2Nums[g]; // get list of key#'s
+            var i2 = keys.GetEnumerator();
+            
+            var already = new System.Collections.SortedList(); // keep them sorted
+            
+            // pass 1: fill up 'already' with all words
+            while (i2.MoveNext()) // for each key#
+            {
+                foreach (var item in
+                    ((System.Collections.IList) num2Words[i2.Current]).Cast<object>().Where(item => already.Contains(item) == false))
+                {
+                    already.Add(item, item);
+                }
+            }
 
-			var num = 0;
-			already.Remove(g); // of course a word is it's own syn
-			var it = already.GetEnumerator();
-			while (it.MoveNext())
-			{
-				var cur = (String) it.Key;
-				// don't store things like 'pit bull' -> 'american pit bull'
-				if (!IsDecent(cur))
-				{
-					continue;
-				}
-				num++;
-				doc.Add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO));
-			}
-			return num;
-		}
-		
-		/// <summary> </summary>
-		private static void  Usage()
-		{
-			o.WriteLine("\n\n" + typeof(Syns2Index) + " <prolog file> <index dir>\n\n");
-		}
+            var num = 0;
+            already.Remove(g); // of course a word is it's own syn
+            var it = already.GetEnumerator();
+            while (it.MoveNext())
+            {
+                var cur = (String) it.Key;
+                // don't store things like 'pit bull' -> 'american pit bull'
+                if (!IsDecent(cur))
+                {
+                    continue;
+                }
+                num++;
+                doc.Add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO));
+            }
+            return num;
+        }
+        
+        /// <summary> </summary>
+        private static void  Usage()
+        {
+            o.WriteLine("\n\n" + typeof(Syns2Index) + " <prolog file> <index dir>\n\n");
+        }
 
-	}
+    }
 }
\ No newline at end of file


Mime
View raw message