Return-Path: X-Original-To: apmail-lucene-lucene-net-commits-archive@www.apache.org Delivered-To: apmail-lucene-lucene-net-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 69255794E for ; Fri, 25 Nov 2011 22:39:58 +0000 (UTC) Received: (qmail 57116 invoked by uid 500); 25 Nov 2011 22:39:58 -0000 Delivered-To: apmail-lucene-lucene-net-commits-archive@lucene.apache.org Received: (qmail 57086 invoked by uid 500); 25 Nov 2011 22:39:57 -0000 Mailing-List: contact lucene-net-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucene.apache.org Delivered-To: mailing list lucene-net-commits@lucene.apache.org Received: (qmail 57079 invoked by uid 99); 25 Nov 2011 22:39:57 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 25 Nov 2011 22:39:57 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 25 Nov 2011 22:39:55 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id DA37B2388A2C; Fri, 25 Nov 2011 22:39:34 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Date: Fri, 25 Nov 2011 22:39:34 -0000 To: lucene-net-commits@lucene.apache.org From: pnasser@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20111125223934.DA37B2388A2C@eris.apache.org> Subject: [Lucene.Net] svn commit: r1206365 - in /incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet: SynExpand/Contrib.WordNet.SynExpand.csproj SynExpand/SynExpand.cs SynLookup/SynLookup.cs Syns2Index/Syns2Index.cs Author: pnasser Date: Fri Nov 25 22:39:33 2011 New Revision: 1206365 URL: http://svn.apache.org/viewvc?rev=1206365&view=rev Log: Contrib.WordNet - compiles and present files are updated, still missing a few files Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/Contrib.WordNet.SynExpand.csproj incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/SynExpand.cs incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynLookup/SynLookup.cs incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/Syns2Index/Syns2Index.cs Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/Contrib.WordNet.SynExpand.csproj URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/Contrib.WordNet.SynExpand.csproj?rev=1206365&r1=1206364&r2=1206365&view=diff ============================================================================== --- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/Contrib.WordNet.SynExpand.csproj (original) +++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/Contrib.WordNet.SynExpand.csproj Fri Nov 25 22:39:33 2011 @@ -19,7 +19,6 @@ under the License. --> - Local @@ -116,12 +115,6 @@ - - {5D4AD9BE-1FFB-41AB-9943-25737971BF57} - Lucene.Net - - - False Microsoft .NET Framework 4 %28x86 and x64%29 @@ -143,6 +136,12 @@ true + + + {5D4AD9BE-1FFB-41AB-9943-25737971BF57} + Lucene.Net + + Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/SynExpand.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/SynExpand.cs?rev=1206365&r1=1206364&r2=1206365&view=diff ============================================================================== --- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/SynExpand.cs (original) +++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynExpand/SynExpand.cs Fri Nov 25 22:39:33 2011 @@ -16,152 +16,179 @@ */ using System; - -using Lucene.Net.Store; -using Lucene.Net.Search; -using Lucene.Net.Index; -using Lucene.Net.Documents; +using System.Collections.Generic; +using System.IO; +using System.Linq; using Lucene.Net.Analysis; using Lucene.Net.Analysis.Standard; +using Lucene.Net.Analysis.Tokenattributes; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Store; namespace WorldNet.Net { - - - /// Expand a query by looking up synonyms for every term. - /// You need to invoke first to build the synonym index. - /// - /// - /// - /// - public sealed class SynExpand - { - - /// Test driver for synonym expansion. - /// Uses boost factor of 0.9 for illustrative purposes. - /// - /// If you pass in the query "big dog" then it prints out: - /// - ///
-		/// Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9 vainglorious^0.9 vauntingly^0.9
-		/// dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
-		/// 
- ///
- [STAThread] - public static void Main(System.String[] args) - { - if (args.Length != 2) - { - System.Console.Out.WriteLine(typeof(SynExpand) + " "); + + + /// Expand a query by looking up synonyms for every term. + /// You need to invoke first to build the synonym index. + /// + /// + /// + public sealed class SynExpand + { + static List already; + private static BooleanQuery tmp; + + /// Test driver for synonym expansion. + /// Uses boost factor of 0.9 for illustrative purposes. + /// + /// If you pass in the query "big dog" then it prints out: + /// + ///
+        /// Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9 vainglorious^0.9 vauntingly^0.9
+        /// dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
+        /// 
+ ///
+ [STAThread] + public static void Main(String[] args) + { + if (args.Length != 2) + { + Console.Out.WriteLine(typeof(SynExpand) + " "); return; - } - - FSDirectory directory = FSDirectory.GetDirectory(args[0], false); - IndexSearcher searcher = new IndexSearcher(directory); - - System.String query = args[1]; - System.String field = "contents"; - - Query q = Expand(query, searcher, new StandardAnalyzer(), field, 0.9f); - System.Console.Out.WriteLine("Query: " + q.ToString(field)); - - - - searcher.Close(); - directory.Close(); - } - - - /// Perform synonym expansion on a query. - /// - /// - /// users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser. - /// - /// - /// a opened to the Lucene index you previously created with . The searcher is not closed or otherwise altered. - /// - /// - /// optional analyzer used to parse the users query else is used - /// - /// - /// optional field name to search in or null if you want the default of "contents" - /// - /// - /// optional boost applied to synonyms else no boost is applied - /// - /// - /// the expanded Query - /// - public static Query Expand(System.String query, Searcher syns, Analyzer a, System.String field, float boost) + } + + var directory = FSDirectory.Open(new DirectoryInfo(args[0])); + var searcher = new IndexSearcher(directory, true); + + String query = args[1]; + const string field = "contents"; + + Query q = Expand(query, searcher, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT), field, 0.9f); + System.Console.Out.WriteLine("Query: " + q.ToString(field)); + + searcher.Close(); + directory.Close(); + } + + + /// + /// Perform synonym expansion on a query. + /// + /// users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser + /// a opened to the Lucene index you previously created with . The searcher is not closed or otherwise altered. + /// optional analyzer used to parse the users query else is used + /// optional field name to search in or null if you want the default of "contents" + /// optional boost applied to synonyms else no boost is applied + /// the expanded Query + public static Query Expand(String query, + Searcher syns, + Analyzer a, + String field, + float boost) { - System.Collections.Hashtable already = new System.Collections.Hashtable(); // avoid dups - System.Collections.IList top = new System.Collections.ArrayList(); // needs to be separately listed.. + already = new List(); // avoid dups + var top = new List(); // needs to be separately listed.. if (field == null) field = "contents"; - if (a == null) - a = new StandardAnalyzer(); + + if (a == null) + a = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT); // [1] Parse query into separate words so that when we expand we can avoid dups - TokenStream ts = a.TokenStream(field, new System.IO.StringReader(query)); - Lucene.Net.Analysis.Token t; - while ((t = ts.Next()) != null) + var ts = a.TokenStream(field, new StringReader(query)); + var termAtt = ts.AddAttribute(); + + while (ts.IncrementToken()) { - System.String word = t.TermText(); - if (already.Contains(word) == false) + var word = termAtt.Term(); + + if (!already.Contains(word)) { - already.Add(word, word); + already.Add(word); top.Add(word); } } - BooleanQuery tmp = new BooleanQuery(); + + tmp = new BooleanQuery(); // [2] form query System.Collections.IEnumerator it = top.GetEnumerator(); while (it.MoveNext()) { // [2a] add to level words in - System.String word = (System.String) it.Current; - TermQuery tq = new TermQuery(new Term(field, word)); + var word = (String) it.Current; + var tq = new TermQuery(new Term(field, word)); tmp.Add(tq, BooleanClause.Occur.SHOULD); - - // [2b] add in unique synonums - Hits hits = syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word))); - for (int i = 0; i < hits.Length(); i++) - { - Document doc = hits.Doc(i); - System.String[] values = doc.GetValues(Syns2Index.F_SYN); - for (int j = 0; j < values.Length; j++) - { - System.String syn = values[j]; - if (already.Contains(syn) == false) - // avoid dups of top level words and synonyms - { - already.Add(syn, syn); - tq = new TermQuery(new Term(field, syn)); - if (boost > 0) - // else keep normal 1.0 - tq.SetBoost(boost); - tmp.Add(tq, BooleanClause.Occur.SHOULD); - } - } - } + + var c = new CollectorImpl(field, boost); + syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c); } - return tmp; } - } + + + /// + /// From project WordNet.Net.Syns2Index + /// + public class Syns2Index + { + /// + public const String F_SYN = "syn"; + + /// + public const String F_WORD = "word"; + } + + /// + /// CollectorImpl + /// + internal sealed class CollectorImpl : Collector + { + private IndexReader reader; + private readonly string field; + private readonly float boost; + + public CollectorImpl(string field, float boost) + { + this.field = field; + this.boost = boost; + } + + public override void SetScorer(Scorer scorer) + { + // Ignore + } + + public override void Collect(int doc) + { + var d = reader.Document(doc); + var values = d.GetValues(Syns2Index.F_SYN); + foreach (var syn in values.Where(syn => !already.Contains(syn))) + { + already.Add(syn); + + var tq = new TermQuery(new Term(field, syn)); + if (boost > 0) // else keep normal 1.0 + tq.SetBoost(boost); + + tmp.Add(tq, BooleanClause.Occur.SHOULD); + } + } + + public override void SetNextReader(IndexReader reader, int docBase) + { + this.reader = reader; + } + + public override bool AcceptsDocsOutOfOrder() + { + return true; + } + } - /// - /// From project WordNet.Net.Syns2Index - /// - public class Syns2Index - { - /// - public const System.String F_SYN = "syn"; - - /// - public const System.String F_WORD = "word"; - } + } } \ No newline at end of file Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynLookup/SynLookup.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynLookup/SynLookup.cs?rev=1206365&r1=1206364&r2=1206365&view=diff ============================================================================== --- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynLookup/SynLookup.cs (original) +++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/SynLookup/SynLookup.cs Fri Nov 25 22:39:33 2011 @@ -16,140 +16,193 @@ */ using System; - -using Lucene.Net.Store; -using Lucene.Net.Search; -using Lucene.Net.Index; -using Lucene.Net.Documents; +using System.Collections.Generic; +using System.IO; +using System.Linq; using Lucene.Net.Analysis; +using Lucene.Net.Analysis.Tokenattributes; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Store; namespace WorldNet.Net { - - /// Test program to look up synonyms. public class SynLookup { - + static List already; + private static BooleanQuery tmp; + [STAThread] public static void Main(System.String[] args) { if (args.Length != 2) { System.Console.Out.WriteLine(typeof(SynLookup) + " "); - return; + return; } - FSDirectory directory = FSDirectory.GetDirectory(args[0], false); - IndexSearcher searcher = new IndexSearcher(directory); - - System.String word = args[1]; - Hits hits = searcher.Search(new TermQuery(new Term(Syns2Index.F_WORD, word))); - - if (hits.Length() == 0) + using (var directory = FSDirectory.Open(new DirectoryInfo(args[0]))) { - System.Console.Out.WriteLine("No synonyms found for " + word); - } - else - { - System.Console.Out.WriteLine("Synonyms found for \"" + word + "\":"); - } - - for (int i = 0; i < hits.Length(); i++) - { - Document doc = hits.Doc(i); - - System.String[] values = doc.GetValues(Syns2Index.F_SYN); - - for (int j = 0; j < values.Length; j++) + using (var searcher = new IndexSearcher(directory, true)) { - System.Console.Out.WriteLine(values[j]); + + String word = args[1]; + Query query = new TermQuery(new Term(Syns2Index.F_WORD, word)); + var countingCollector = new CountingCollector(); + searcher.Search(query, countingCollector); + + if (countingCollector.numHits == 0) + { + Console.Out.WriteLine("No synonyms found for " + word); + } + else + { + Console.Out.WriteLine("Synonyms found for \"" + word + "\":"); + } + + var hits = searcher.Search(query, countingCollector.numHits).ScoreDocs; + + foreach (var v in + hits.Select(t => searcher.Doc(t.doc)).Select(doc => doc.GetValues(Syns2Index.F_SYN)).SelectMany(values => values)) + { + Console.Out.WriteLine(v); + } + } } - - searcher.Close(); - directory.Close(); } - - /// Perform synonym expansion on a query. - /// + /// + /// Perform synonym expansion on a query. /// - /// query - /// - /// syns - /// - /// a - /// - /// field - /// - /// boost - /// - public static Query Expand(System.String query, Searcher syns, Analyzer a, System.String field, float boost) + /// query + /// syns + /// a + /// field + /// boost + public static Query Expand(String query, + Searcher syns, + Analyzer a, + String field, + float boost) { - System.Collections.Hashtable already = new System.Collections.Hashtable(); // avoid dups - System.Collections.IList top = new System.Collections.ArrayList(); // needs to be separately listed.. - - // [1] Parse query into separate words so that when we expand we can avoid dups - TokenStream ts = a.TokenStream(field, new System.IO.StringReader(query)); - Lucene.Net.Analysis.Token t; - while ((t = ts.Next()) != null) + already = new List(); // avoid dups + var top = new List(); // needs to be separately listed.. + + var ts = a.TokenStream(field, new StringReader(query)); + var termAtt = ts.AddAttribute(); + + while (ts.IncrementToken()) { - System.String word = t.TermText(); - if (already.Contains(word) == false) + var word = termAtt.Term(); + + if (!already.Contains(word)) { - already.Add(word, word); + already.Add(word); top.Add(word); } } - BooleanQuery tmp = new BooleanQuery(); - + + tmp = new BooleanQuery(); + // [2] form query System.Collections.IEnumerator it = top.GetEnumerator(); while (it.MoveNext()) { // [2a] add to level words in - System.String word = (System.String) it.Current; - TermQuery tq = new TermQuery(new Term(field, word)); + var word = (String)it.Current; + var tq = new TermQuery(new Term(field, word)); tmp.Add(tq, BooleanClause.Occur.SHOULD); - - // [2b] add in unique synonums - Hits hits = syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word))); - for (int i = 0; i < hits.Length(); i++) - { - Document doc = hits.Doc(i); - System.String[] values = doc.GetValues(Syns2Index.F_SYN); - for (int j = 0; j < values.Length; j++) - { - System.String syn = values[j]; - if (already.Contains(syn) == false) - { - already.Add(syn, syn); - tq = new TermQuery(new Term(field, syn)); - if (boost > 0) - // else keep normal 1.0 - tq.SetBoost(boost); - tmp.Add(tq, BooleanClause.Occur.SHOULD); - } - } - } + + var c = new CollectorImpl(field, boost); + syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c); } - - + return tmp; } - } + internal sealed class CountingCollector : Collector + { + public int numHits; + + public override void SetScorer(Scorer scorer) + { } + + public override void Collect(int doc) + { + numHits++; + } + + public override void SetNextReader(IndexReader reader, int docBase) + { } + + public override bool AcceptsDocsOutOfOrder() + { + return true; + } + } + + /// + /// CollectorImpl + /// + internal sealed class CollectorImpl : Collector + { + private IndexReader reader; + private readonly string field; + private readonly float boost; + + public CollectorImpl(string field, float boost) + { + this.field = field; + this.boost = boost; + } + + public override void SetScorer(Scorer scorer) + { + // Ignore + } + + public override void Collect(int doc) + { + var d = reader.Document(doc); + var values = d.GetValues(Syns2Index.F_SYN); + foreach (var syn in values.Where(syn => !already.Contains(syn))) + { + already.Add(syn); + + var tq = new TermQuery(new Term(field, syn)); + if (boost > 0) // else keep normal 1.0 + tq.SetBoost(boost); + + tmp.Add(tq, BooleanClause.Occur.SHOULD); + } + } + + public override void SetNextReader(IndexReader reader, int docBase) + { + this.reader = reader; + } + + public override bool AcceptsDocsOutOfOrder() + { + return true; + } + + } + + /// + /// From project WordNet.Net.Syns2Index + /// + public class Syns2Index + { + /// + public const String F_SYN = "syn"; + + /// + public const String F_WORD = "word"; + } - /// - /// From project WordNet.Net.Syns2Index - /// - public class Syns2Index - { - /// - public const System.String F_SYN = "syn"; - - /// - public const System.String F_WORD = "word"; } + } \ No newline at end of file Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/Syns2Index/Syns2Index.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/Syns2Index/Syns2Index.cs?rev=1206365&r1=1206364&r2=1206365&view=diff ============================================================================== --- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/Syns2Index/Syns2Index.cs (original) +++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/WordNet/Syns2Index/Syns2Index.cs Fri Nov 25 22:39:33 2011 @@ -16,8 +16,12 @@ */ using System; - +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Lucene.Net.Store; using Analyzer = Lucene.Net.Analysis.Analyzer; +using Directory = System.IO.Directory; using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer; using Document = Lucene.Net.Documents.Document; using Field = Lucene.Net.Documents.Field; @@ -44,18 +48,12 @@ namespace WorldNet.Net /// While the WordNet file distinguishes groups of synonyms with /// related meanings we don't do that here. ///

- /// /// This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB. - /// ///
- /// Dave Spencer, dave@searchmorph.com - /// - /// WordNet home page"> - /// - /// prologdb man page"> - /// - /// sample site that uses it"> - /// + /// + /// + /// + /// public class Syns2Index { /// @@ -71,15 +69,17 @@ namespace WorldNet.Net public const System.String F_WORD = "word"; /// - private static readonly Analyzer ana = new StandardAnalyzer(); + private static readonly Analyzer ana = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT); - /// Takes arg of prolog file name and index directory. + /// + /// Takes arg of prolog file name and index directory. + /// [STAThread] public static void Main(System.String[] args) { // get command line arguments - System.String prologFilename = null; // name of file "wn_s.pl" - System.String indexDir = null; + String prologFilename = null; // name of file "wn_s.pl" + String indexDir = null; if (args.Length == 2) { prologFilename = args[0]; @@ -88,45 +88,44 @@ namespace WorldNet.Net else { Usage(); - System.Environment.Exit(1); + Environment.Exit(1); } // ensure that the prolog file is readable - if (!(new System.IO.FileInfo(prologFilename)).Exists) + if (!(new FileInfo(prologFilename)).Exists) { err.WriteLine("Error: cannot read Prolog file: " + prologFilename); - System.Environment.Exit(1); + Environment.Exit(1); } // exit if the target index directory already exists - if (System.IO.Directory.Exists((new System.IO.FileInfo(indexDir)).FullName)) + if (Directory.Exists((new FileInfo(indexDir)).FullName)) { err.WriteLine("Error: index directory already exists: " + indexDir); err.WriteLine("Please specify a name of a non-existent directory"); - System.Environment.Exit(1); + Environment.Exit(1); } o.WriteLine("Opening Prolog file " + prologFilename); - System.IO.FileStream fis = new System.IO.FileStream(prologFilename, System.IO.FileMode.Open, System.IO.FileAccess.Read); - System.IO.StreamReader br = new System.IO.StreamReader(new System.IO.StreamReader(fis, System.Text.Encoding.Default).BaseStream, new System.IO.StreamReader(fis, System.Text.Encoding.Default).CurrentEncoding); - System.String line; + var fis = new FileStream(prologFilename, FileMode.Open, FileAccess.Read); + var br = new StreamReader(new StreamReader(fis, System.Text.Encoding.Default).BaseStream, new StreamReader(fis, System.Text.Encoding.Default).CurrentEncoding); + String line; // maps a word to all the "groups" it's in System.Collections.IDictionary word2Nums = new System.Collections.SortedList(); // maps a group to all the words in it System.Collections.IDictionary num2Words = new System.Collections.SortedList(); // number of rejected words - int ndecent = 0; + var ndecent = 0; // status output - int mod = 1; - int row = 1; + var mod = 1; + var row = 1; // parse prolog file o.WriteLine("[1/2] Parsing " + prologFilename); while ((line = br.ReadLine()) != null) { // occasional progress - if ((++row) % mod == 0) - // periodically print out line we read in + if ((++row) % mod == 0) // periodically print out line we read in { mod *= 2; o.WriteLine("\t" + row + " " + line + " " + word2Nums.Count + " " + num2Words.Count + " ndecent=" + ndecent); @@ -136,17 +135,17 @@ namespace WorldNet.Net if (!line.StartsWith("s(")) { err.WriteLine("OUCH: " + line); - System.Environment.Exit(1); + Environment.Exit(1); } // parse line line = line.Substring(2); - int comma = line.IndexOf((System.Char) ','); - System.String num = line.Substring(0, (comma) - (0)); - int q1 = line.IndexOf((System.Char) '\''); + var comma = line.IndexOf(','); + var num = line.Substring(0, comma); + var q1 = line.IndexOf('\''); line = line.Substring(q1 + 1); - int q2 = line.IndexOf((System.Char) '\''); - System.String word = line.Substring(0, (q2) - (0)).ToLower(); + var q2 = line.IndexOf('\''); + var word = line.Substring(0, q2).ToLower().Replace("''", "'"); // make sure is a normal word if (!IsDecent(word)) @@ -157,11 +156,10 @@ namespace WorldNet.Net // 1/2: word2Nums map // append to entry or add new one - System.Collections.IList lis = (System.Collections.IList) word2Nums[word]; + var lis = (System.Collections.IList) word2Nums[word]; if (lis == null) { - lis = new System.Collections.ArrayList(); - lis.Add(num); + lis = new List {num}; word2Nums[word] = lis; } else @@ -171,8 +169,7 @@ namespace WorldNet.Net lis = (System.Collections.IList) num2Words[num]; if (lis == null) { - lis = new System.Collections.ArrayList(); - lis.Add(word); + lis = new List { word }; num2Words[num] = lis; } else @@ -188,20 +185,18 @@ namespace WorldNet.Net Index(indexDir, word2Nums, num2Words); } - /// Checks to see if a word contains only alphabetic characters by + /// + /// Checks to see if a word contains only alphabetic characters by /// checking it one character at a time. - /// /// - /// string to check - /// - /// true if the string is decent - /// - private static bool IsDecent(System.String s) + /// string to check + /// true if the string is decent + private static bool IsDecent(String s) { - int len = s.Length; - for (int i = 0; i < len; i++) + var len = s.Length; + for (var i = 0; i < len; i++) { - if (!System.Char.IsLetter(s[i])) + if (!Char.IsLetter(s[i])) { return false; } @@ -209,75 +204,73 @@ namespace WorldNet.Net return true; } - /// Forms a Lucene index based on the 2 maps. - /// + /// + /// Forms a Lucene index based on the 2 maps. /// - /// the direcotry where the index should be created - /// - /// word2Nums - /// - /// num2Words - /// - private static void Index(System.String indexDir, System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words) + /// the direcotry where the index should be created + /// word2Nums + /// num2Words + private static void Index(String indexDir, System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words) { - int row = 0; - int mod = 1; + var row = 0; + var mod = 1; - // override the specific index if it already exists - IndexWriter writer = new IndexWriter(indexDir, ana, true); - writer.SetUseCompoundFile(true); // why? - // blindly up these parameters for speed - writer.SetMergeFactor(writer.GetMergeFactor() * 2); - writer.SetMaxBufferedDocs(writer.GetMaxBufferedDocs() * 2); - System.Collections.IEnumerator i1 = word2Nums.Keys.GetEnumerator(); - while (i1.MoveNext()) - // for each word + using (var dir = FSDirectory.Open(new DirectoryInfo(indexDir))) { - System.String g = (System.String) i1.Current; - Document doc = new Document(); - - int n = Index(word2Nums, num2Words, g, doc); - if (n > 0) + var writer = new IndexWriter(dir, ana, true, IndexWriter.MaxFieldLength.LIMITED); + writer.SetUseCompoundFile(true); // why? + + var i1 = word2Nums.Keys.GetEnumerator(); + while (i1.MoveNext()) { - doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.UN_TOKENIZED)); - if ((++row % mod) == 0) + var g = (String)i1.Current; + var doc = new Document(); + + var n = Index(word2Nums, num2Words, g, doc); + if (n > 0) { - o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc); - mod *= 2; + doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED)); + if ((++row % mod) == 0) + { + o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc); + mod *= 2; + } + writer.AddDocument(doc); } - writer.AddDocument(doc); - } // else degenerate + } + o.WriteLine("Optimizing.."); + writer.Optimize(); + writer.Close(); } - o.WriteLine("Optimizing.."); - writer.Optimize(); - writer.Close(); + } - /// Given the 2 maps fills a document for 1 word. + /// + /// Given the 2 maps fills a document for 1 word. + /// private static int Index(System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words, System.String g, Document doc) { - System.Collections.IList keys = (System.Collections.IList) word2Nums[g]; // get list of key#'s - System.Collections.IEnumerator i2 = keys.GetEnumerator(); + var keys = (System.Collections.IList) word2Nums[g]; // get list of key#'s + var i2 = keys.GetEnumerator(); - System.Collections.SortedList already = new System.Collections.SortedList(); // keep them sorted + var already = new System.Collections.SortedList(); // keep them sorted // pass 1: fill up 'already' with all words while (i2.MoveNext()) // for each key# { - foreach (object item in (System.Collections.IList) num2Words[i2.Current]) // get list of words + foreach (var item in + ((System.Collections.IList) num2Words[i2.Current]).Cast().Where(item => already.Contains(item) == false)) { - if (already.Contains(item) == false) - { - already.Add(item, item); - } + already.Add(item, item); } } - int num = 0; + + var num = 0; already.Remove(g); // of course a word is it's own syn - System.Collections.IDictionaryEnumerator it = already.GetEnumerator(); + var it = already.GetEnumerator(); while (it.MoveNext()) { - System.String cur = (System.String) it.Key; + var cur = (String) it.Key; // don't store things like 'pit bull' -> 'american pit bull' if (!IsDecent(cur)) { @@ -295,16 +288,5 @@ namespace WorldNet.Net o.WriteLine("\n\n" + typeof(Syns2Index) + " \n\n"); } - static Syns2Index() - { - System.IO.StreamWriter temp_writer; - temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding); - temp_writer.AutoFlush = true; - o = temp_writer; - System.IO.StreamWriter temp_writer2; - temp_writer2 = new System.IO.StreamWriter(System.Console.OpenStandardError(), System.Console.Error.Encoding); - temp_writer2.AutoFlush = true; - err = temp_writer2; - } } } \ No newline at end of file