lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [1/3] lucenenet git commit: port of lucene-solr/lucene/classification w/o tests
Date Mon, 22 Dec 2014 13:10:01 GMT
Repository: lucenenet
Updated Branches:
  refs/heads/master 2d7533d4e -> c0c101953


port of lucene-solr/lucene/classification w/o tests


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/aba955ce
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/aba955ce
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/aba955ce

Branch: refs/heads/master
Commit: aba955ce29c81a2e943246acb1df7940fe3d7483
Parents: 2d7533d
Author: Laimonas Simutis <laimis@gmail.com>
Authored: Sat Dec 6 22:23:30 2014 -0500
Committer: Laimonas Simutis <laimis@gmail.com>
Committed: Sat Dec 6 22:23:30 2014 -0500

----------------------------------------------------------------------
 .../ClassificationResult.cs                     |  64 ++++++
 src/Lucene.Net.Classification/Classifier.cs     |  65 ++++++
 .../KNearesteighborClassifier.cs                | 150 ++++++++++++++
 .../Lucene.Net.Classification.csproj            |  66 ++++++
 .../Lucene.Net.Classification.sln               |  54 +++++
 .../Properties/AssemblyInfo.cs                  |  36 ++++
 .../SimpleNaiveBayesClassifier.cs               | 205 +++++++++++++++++++
 7 files changed, 640 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/ClassificationResult.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/ClassificationResult.cs b/src/Lucene.Net.Classification/ClassificationResult.cs
new file mode 100644
index 0000000..356ec7e
--- /dev/null
+++ b/src/Lucene.Net.Classification/ClassificationResult.cs
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    /// <summary>
+    /// Read-only pair of an assigned class of type <typeparamref name="T"/> and its score,
+    /// as returned by <see cref="Classifier{T}.AssignClass(string)"/>.
+    /// @lucene.experimental
+    /// </summary>
+    /// <typeparam name="T">the type of the assigned class</typeparam>
+    public class ClassificationResult<T>
+    {
+        private readonly T _class;
+        private readonly double _classScore;
+
+        /// <summary>
+        /// Creates a result holding the given class and score.
+        /// </summary>
+        /// <param name="assignedClass">the class of type <typeparamref name="T"/> assigned by a classifier</param>
+        /// <param name="score">the score for <paramref name="assignedClass"/> as a <c>double</c></param>
+        public ClassificationResult(T assignedClass, double score)
+        {
+            _class = assignedClass;
+            _classScore = score;
+        }
+
+        /// <summary>
+        /// Gets the assigned class that was passed to the constructor.
+        /// </summary>
+        public T AssignedClass
+        {
+            get { return _class; }
+        }
+
+        /// <summary>
+        /// Gets the score that was passed to the constructor.
+        /// </summary>
+        public double Score
+        {
+            get { return _classScore; }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Classifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Classifier.cs b/src/Lucene.Net.Classification/Classifier.cs
new file mode 100644
index 0000000..6ffca79
--- /dev/null
+++ b/src/Lucene.Net.Classification/Classifier.cs
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Index;
+    using Lucene.Net.Search;
+    using System;
+
+    /// <summary>
+    /// A classifier (see <c>http://en.wikipedia.org/wiki/Classifier_(mathematics)</c>) which assigns
+    /// classes of type <typeparamref name="T"/>.
+    /// @lucene.experimental
+    /// </summary>
+    /// <typeparam name="T">the type of the classes assigned by the classifier</typeparam>
+    public interface Classifier<T>
+    {
+        /// <summary>
+        /// Assigns a class (with score) to the given text.
+        /// </summary>
+        /// <param name="text">the text to be classified</param>
+        /// <returns>a <see cref="ClassificationResult{T}"/> holding the assigned class and its score</returns>
+        ClassificationResult<T> AssignClass(string text);
+
+        /// <summary>
+        /// Trains the classifier using the underlying Lucene index.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        void Train(
+            AtomicReader atomicReader,
+            string textFieldName,
+            string classFieldName,
+            Analyzer analyzer);
+
+        /// <summary>
+        /// Trains the classifier using the underlying Lucene index, restricted by a query.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use for training</param>
+        void Train(
+            AtomicReader atomicReader,
+            string textFieldName,
+            string classFieldName,
+            Analyzer analyzer,
+            Query query);
+
+        /// <summary>
+        /// Trains the classifier using the underlying Lucene index, comparing documents on
+        /// several text fields and restricted by a query.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use for training</param>
+        void Train(
+            AtomicReader atomicReader,
+            string[] textFieldNames,
+            string classFieldName,
+            Analyzer analyzer,
+            Query query);
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/KNearesteighborClassifier.cs b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
new file mode 100644
index 0000000..c83301e
--- /dev/null
+++ b/src/Lucene.Net.Classification/KNearesteighborClassifier.cs
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Index;
+    using Lucene.Net.Queries.Mlt;
+    using Lucene.Net.Search;
+    using Lucene.Net.Util;
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+
+    /// <summary>
+    /// A k-Nearest Neighbor classifier (see <c>http://en.wikipedia.org/wiki/K-nearest_neighbors</c>)
+    /// based on <see cref="MoreLikeThis"/>.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public class KNearestNeighborClassifier : Classifier<BytesRef>
+    {
+        private MoreLikeThis _mlt;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private IndexSearcher _indexSearcher;
+        private readonly int _k;
+        private Query _query;
+
+        // Values <= 0 mean "leave the MoreLikeThis setting untouched" (see Train).
+        private int _minDocsFreq;
+        private int _minTermFreq;
+
+        /// <summary>Creates a <see cref="Classifier{T}"/> using the kNN algorithm.</summary>
+        /// <param name="k">the number of neighbors to analyze</param>
+        public KNearestNeighborClassifier(int k)
+        {
+            this._k = k;
+        }
+
+        /// <summary>Creates a <see cref="Classifier{T}"/> using the kNN algorithm.</summary>
+        /// <param name="k">the number of neighbors to analyze</param>
+        /// <param name="minDocsFreq">the minimum document frequency, applied to
+        /// <see cref="MoreLikeThis.MinDocFreq"/> during training when greater than zero</param>
+        /// <param name="minTermFreq">the minimum term frequency, applied to
+        /// <see cref="MoreLikeThis.MinTermFreq"/> during training when greater than zero</param>
+        public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq)
+        {
+            this._k = k;
+            this._minDocsFreq = minDocsFreq;
+            this._minTermFreq = minTermFreq;
+        }
+
+        /// <summary>
+        /// Assigns a class to the given text by searching for the <c>k</c> documents most similar
+        /// to it and picking the class that occurs most often among them.
+        /// </summary>
+        /// <param name="text">the text to be classified</param>
+        /// <returns>the most frequent class among the neighbors and its score</returns>
+        /// <exception cref="IOException">if none of the <c>Train</c> overloads has been called yet</exception>
+        public ClassificationResult<BytesRef> AssignClass(String text)
+        {
+            if (_mlt == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+
+            // One optional "more like this" clause per text field ...
+            BooleanQuery mltQuery = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                mltQuery.Add(new BooleanClause(_mlt.Like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
+            }
+
+            // ... restricted to documents that have some value in the class field ...
+            Query classFieldQuery = new WildcardQuery(new Term(_classFieldName, "*"));
+            mltQuery.Add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
+
+            // ... and to the training filter query, when one was supplied.
+            if (_query != null)
+            {
+                mltQuery.Add(_query, BooleanClause.Occur.MUST);
+            }
+
+            TopDocs topDocs = _indexSearcher.Search(mltQuery, _k);
+            return SelectClassFromNeighbors(topDocs);
+        }
+
+        /// <summary>
+        /// Counts how often each class occurs in the given hits and returns the most frequent one.
+        /// The score is the winning count divided by <c>k</c>; when there are no hits the result is
+        /// an empty <see cref="BytesRef"/> with score 0.
+        /// </summary>
+        /// <param name="topDocs">the nearest neighbors found for the text being classified</param>
+        /// <returns>the selected class and its score</returns>
+        private ClassificationResult<BytesRef> SelectClassFromNeighbors(TopDocs topDocs)
+        {
+            // TODO : improve the nearest neighbor selection
+            Dictionary<BytesRef, int> classCounts = new Dictionary<BytesRef, int>();
+
+            foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
+            {
+                BytesRef cl = new BytesRef(_indexSearcher.Doc(scoreDoc.Doc).GetField(_classFieldName).StringValue);
+
+                // The Dictionary indexer throws KeyNotFoundException for a missing key, so the
+                // count must be read with TryGetValue; it leaves 'seen' at 0 for a class that
+                // has not been counted yet.
+                int seen;
+                classCounts.TryGetValue(cl, out seen);
+                classCounts[cl] = seen + 1;
+            }
+
+            double max = 0;
+            BytesRef assignedClass = new BytesRef();
+            foreach (KeyValuePair<BytesRef, int> entry in classCounts)
+            {
+                int count = entry.Value;
+                if (count > max)
+                {
+                    max = count;
+                    assignedClass = (BytesRef)entry.Key.Clone();
+                }
+            }
+
+            // Divided by the requested k, not by the number of hits actually returned.
+            double score = max / (double) _k;
+            return new ClassificationResult<BytesRef>(assignedClass, score);
+        }
+
+        /// <summary>Trains the classifier on a single text field without a filter query.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, textFieldName, classFieldName, analyzer, null);
+        }
+
+        /// <summary>Trains the classifier on a single text field.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use, or <c>null</c> for no filter</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
+        }
+
+        /// <summary>
+        /// Trains the classifier: stores the field names and filter query and sets up the
+        /// <see cref="MoreLikeThis"/> instance and the searcher used by <see cref="AssignClass(String)"/>.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use, or <c>null</c> for no filter</param>
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            this._textFieldNames = textFieldNames;
+            this._classFieldName = classFieldName;
+            _mlt = new MoreLikeThis(atomicReader);
+            _mlt.Analyzer = analyzer;
+            _mlt.FieldNames = _textFieldNames;
+            _indexSearcher = new IndexSearcher(atomicReader);
+
+            // Only override the MoreLikeThis settings when a positive value was given.
+            if (_minDocsFreq > 0)
+            {
+                _mlt.MinDocFreq = _minDocsFreq;
+            }
+            if (_minTermFreq > 0)
+            {
+                _mlt.MinTermFreq = _minTermFreq;
+            }
+            this._query = query;
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
new file mode 100644
index 0000000..e0bf2e9
--- /dev/null
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.csproj
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props"
Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')"
/>
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Classification</RootNamespace>
+    <AssemblyName>Lucene.Net.Classification</AssemblyName>
+    <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="System" />
+    <Reference Include="System.Core" />
+    <Reference Include="System.Xml.Linq" />
+    <Reference Include="System.Data.DataSetExtensions" />
+    <Reference Include="Microsoft.CSharp" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="ClassificationResult.cs" />
+    <Compile Include="Classifier.cs" />
+    <Compile Include="KNearesteighborClassifier.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="SimpleNaiveBayesClassifier.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">
+      <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project>
+      <Name>Lucene.Net</Name>
+    </ProjectReference>
+    <ProjectReference Include="..\Lucene.Net.Queries\Lucene.Net.Queries.csproj">
+      <Project>{69D7956C-C2CC-4708-B399-A188FEC384C4}</Project>
+      <Name>Lucene.Net.Queries</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and
uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Lucene.Net.Classification.sln b/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
new file mode 100644
index 0000000..9965049
--- /dev/null
+++ b/src/Lucene.Net.Classification/Lucene.Net.Classification.sln
@@ -0,0 +1,54 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Classification", "Lucene.Net.Classification.csproj",
"{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\Lucene.Net.Core\Lucene.Net.csproj",
"{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Queries", "..\Lucene.Net.Queries\Lucene.Net.Queries.csproj",
"{69D7956C-C2CC-4708-B399-A188FEC384C4}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Debug|Mixed Platforms = Debug|Mixed Platforms
+		Debug|x86 = Debug|x86
+		Release|Any CPU = Release|Any CPU
+		Release|Mixed Platforms = Release|Mixed Platforms
+		Release|x86 = Release|x86
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Any CPU.Build.0 = Release|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Mixed Platforms.ActiveCfg = Release|Any
CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+		{E067B8BB-D8E7-4040-BEB8-EFF8BB4149BD}.Release|x86.ActiveCfg = Release|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Mixed Platforms.ActiveCfg = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Mixed Platforms.Build.0 = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|x86.ActiveCfg = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|x86.Build.0 = Debug|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Mixed Platforms.ActiveCfg = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Mixed Platforms.Build.0 = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|x86.ActiveCfg = Release|x86
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|x86.Build.0 = Release|x86
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Any CPU.Build.0 = Release|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Mixed Platforms.ActiveCfg = Release|Any
CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+		{69D7956C-C2CC-4708-B399-A188FEC384C4}.Release|x86.ActiveCfg = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs b/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..ede45e1
--- /dev/null
+++ b/src/Lucene.Net.Classification/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General information about the assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with the assembly.
+// NOTE(review): description, configuration, company and trademark are left empty here.
+[assembly: AssemblyTitle("Lucene.Net.Classification")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Lucene.Net.Classification")]
+[assembly: AssemblyCopyright("Copyright ©  2014")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is the ID of the typelib if this project is exposed to COM.
+[assembly: Guid("ff6180c7-579d-4557-bf6a-ddd139fad2e4")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version
+//      Build Number
+//      Revision
+//
+// You can specify all the values, or you can default the Build and Revision numbers
+// by using '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/aba955ce/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
new file mode 100644
index 0000000..0980d58
--- /dev/null
+++ b/src/Lucene.Net.Classification/SimpleNaiveBayesClassifier.cs
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Classification
+{
+    using Lucene.Net.Analysis;
+    using Lucene.Net.Analysis.Tokenattributes;
+    using Lucene.Net.Index;
+    using Lucene.Net.Search;
+    using Lucene.Net.Util;
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+
+    /// <summary>
+    /// A simplistic Lucene based NaiveBayes classifier, see <c>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</c>
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public class SimpleNaiveBayesClassifier : Classifier<BytesRef>
+    {
+        private AtomicReader _atomicReader;
+        private String[] _textFieldNames;
+        private String _classFieldName;
+        private int _docsWithClassSize;
+        private Analyzer _analyzer;
+        private IndexSearcher _indexSearcher;
+        private Query _query;
+
+        /// <summary>
+        /// Creates a new NaiveBayes classifier. One of the <c>Train</c> overloads must be called
+        /// before <see cref="AssignClass(String)"/> can be used.
+        /// </summary>
+        public SimpleNaiveBayesClassifier()
+        {
+        }
+
+        /// <summary>Trains the classifier on a single text field without a filter query.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
+        {
+            Train(atomicReader, textFieldName, classFieldName, analyzer, null);
+        }
+
+        /// <summary>Trains the classifier on a single text field.</summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldName">the name of the field used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use, or <c>null</c> for no filter</param>
+        public void Train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
+        {
+            Train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
+        }
+
+        /// <summary>
+        /// Trains the classifier: stores the reader, field names, analyzer and filter query, and
+        /// caches the number of documents that have a class assigned.
+        /// </summary>
+        /// <param name="atomicReader">the reader to use to access the Lucene index</param>
+        /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
+        /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
+        /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
+        /// <param name="query">the query to filter which documents to use, or <c>null</c> for no filter</param>
+        public void Train(AtomicReader atomicReader, String[] textFieldNames, String classFieldName, Analyzer analyzer, Query query)
+        {
+            this._atomicReader = atomicReader;
+            this._indexSearcher = new IndexSearcher(this._atomicReader);
+            this._textFieldNames = textFieldNames;
+            this._classFieldName = classFieldName;
+            this._analyzer = analyzer;
+            this._query = query;
+            // Must run last: it reads the fields assigned above.
+            this._docsWithClassSize = CountDocsWithClass();
+        }
+
+        /// <summary>
+        /// Returns the number of documents having a value in the class field. Uses the doc count
+        /// reported for the field's terms and, when that is -1, falls back to counting the hits of
+        /// a wildcard query on the class field (combined with the filter query, if any).
+        /// </summary>
+        private int CountDocsWithClass()
+        {
+            int docCount = MultiFields.GetTerms(this._atomicReader, this._classFieldName).DocCount;
+            if (docCount == -1)
+            { // in case codec doesn't support getDocCount
+                TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
+                BooleanQuery q = new BooleanQuery();
+                q.Add(new BooleanClause(new WildcardQuery(new Term(_classFieldName, WildcardQuery.WILDCARD_STRING.ToString())), BooleanClause.Occur.MUST));
+                if (_query != null)
+                {
+                    q.Add(_query, BooleanClause.Occur.MUST);
+                }
+                _indexSearcher.Search(q, totalHitCountCollector);
+                docCount = totalHitCountCollector.TotalHits;
+            }
+            return docCount;
+        }
+
+        /// <summary>
+        /// Runs the analyzer over the given text once per text field and returns all produced
+        /// tokens, in order.
+        /// </summary>
+        /// <param name="doc">the text to tokenize</param>
+        /// <returns>the tokens of <paramref name="doc"/> for every text field, concatenated</returns>
+        private String[] TokenizeDoc(String doc)
+        {
+            List<String> tokens = new List<String>();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                TokenStream tokenStream = _analyzer.TokenStream(textFieldName, new StringReader(doc));
+                try
+                {
+                    CharTermAttribute charTermAttribute = tokenStream.AddAttribute<CharTermAttribute>();
+                    tokenStream.Reset();
+                    while (tokenStream.IncrementToken())
+                    {
+                        tokens.Add(charTermAttribute.ToString());
+                    }
+                    tokenStream.End();
+                }
+                finally
+                {
+                    // Always release the stream, even when analysis fails part-way.
+                    IOUtils.CloseWhileHandlingException(tokenStream);
+                }
+            }
+            return tokens.ToArray();
+        }
+
+        /// <summary>
+        /// Assigns to the given text the class with the highest value of log prior plus log
+        /// likelihood among all terms of the class field.
+        /// </summary>
+        /// <param name="inputDocument">the text to be classified</param>
+        /// <returns>the best class found and a score computed as <c>10 / |max|</c></returns>
+        /// <exception cref="IOException">if none of the <c>Train</c> overloads has been called yet</exception>
+        public ClassificationResult<BytesRef> AssignClass(String inputDocument)
+        {
+            if (_atomicReader == null)
+            {
+                throw new IOException("You must first call Classifier#train");
+            }
+            double max = - Double.MaxValue;
+            BytesRef foundClass = new BytesRef();
+
+            Terms terms = MultiFields.GetTerms(_atomicReader, _classFieldName);
+            TermsEnum termsEnum = terms.Iterator(null);
+            BytesRef next;
+            String[] tokenizedDoc = TokenizeDoc(inputDocument);
+            while ((next = termsEnum.Next()) != null)
+            {
+                double clVal = CalculateLogPrior(next) + CalculateLogLikelihood(tokenizedDoc, next);
+                if (clVal > max)
+                {
+                    max = clVal;
+                    // Copy the bytes: 'next' is handed out again by the enumerator.
+                    foundClass = BytesRef.DeepCopyOf(next);
+                }
+            }
+            double score = 10 / Math.Abs(max);
+            return new ClassificationResult<BytesRef>(foundClass, score);
+        }
+
+        /// <summary>
+        /// Computes log(P(d|c)) = log(P(w1|c)) + ... + log(P(wn|c)) for the given tokens and class,
+        /// using add-one smoothing.
+        /// </summary>
+        /// <param name="tokenizedDoc">the tokens of the document being classified</param>
+        /// <param name="c">the class to evaluate</param>
+        /// <returns>the sum of the log word probabilities; 0 for a document without tokens</returns>
+        private double CalculateLogLikelihood(String[] tokenizedDoc, BytesRef c)
+        {
+            if (tokenizedDoc.Length == 0)
+            {
+                // Empty sum; also avoids the index lookups below when there is nothing to score.
+                return 0d;
+            }
+
+            // den : for the whole dictionary, count the no of times a word appears in documents of class c (+|V|).
+            // It does not depend on the word, so it is computed once instead of once per token.
+            double den = GetTextTermFreqForClass(c) + _docsWithClassSize;
+
+            double result = 0d;
+            foreach (String word in tokenizedDoc)
+            {
+                // search with text:word AND class:c
+                int hits = GetWordFreqForClass(word, c);
+
+                // num : count the no of times the word appears in documents of class c (+1)
+                double num = hits + 1; // +1 is added because of add 1 smoothing
+
+                // P(w|c) = num/den
+                double wordProbability = num / den;
+                result += Math.Log(wordProbability);
+            }
+
+            // log(P(d|c)) = log(P(w1|c))+...+log(P(wn|c))
+            return result;
+        }
+
+        /// <summary>
+        /// Estimates the number of terms in the text fields of documents of the given class as
+        /// (average number of unique terms per document, summed over the text fields) multiplied by
+        /// (number of documents with the class).
+        /// </summary>
+        /// <param name="c">the class to evaluate</param>
+        private double GetTextTermFreqForClass(BytesRef c)
+        {
+            double avgNumberOfUniqueTerms = 0;
+            foreach (String textFieldName in _textFieldNames)
+            {
+                Terms terms = MultiFields.GetTerms(_atomicReader, textFieldName);
+                long numPostings = terms.SumDocFreq; // number of term/doc pairs
+                avgNumberOfUniqueTerms += numPostings / (double) terms.DocCount; // avg # of unique terms per doc
+            }
+            int docsWithC = _atomicReader.DocFreq(new Term(_classFieldName, c));
+            return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
+        }
+
+        /// <summary>
+        /// Counts the documents of the given class that contain the given word in at least one
+        /// of the text fields (and match the filter query, if any).
+        /// </summary>
+        /// <param name="word">the word to look up</param>
+        /// <param name="c">the class the documents must belong to</param>
+        private int GetWordFreqForClass(String word, BytesRef c)
+        {
+            BooleanQuery booleanQuery = new BooleanQuery();
+            BooleanQuery subQuery = new BooleanQuery();
+            foreach (String textFieldName in _textFieldNames)
+            {
+                subQuery.Add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.SHOULD));
+            }
+            booleanQuery.Add(new BooleanClause(subQuery, BooleanClause.Occur.MUST));
+            booleanQuery.Add(new BooleanClause(new TermQuery(new Term(_classFieldName, c)), BooleanClause.Occur.MUST));
+            if (_query != null)
+            {
+                booleanQuery.Add(_query, BooleanClause.Occur.MUST);
+            }
+            TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
+            _indexSearcher.Search(booleanQuery, totalHitCountCollector);
+            return totalHitCountCollector.TotalHits;
+        }
+
+        /// <summary>
+        /// Computes log(P(c)) as log(number of documents with the class) minus
+        /// log(number of documents with any class).
+        /// </summary>
+        /// <param name="currentClass">the class to evaluate</param>
+        private double CalculateLogPrior(BytesRef currentClass)
+        {
+            return Math.Log((double) DocCount(currentClass)) - Math.Log(_docsWithClassSize);
+        }
+
+        /// <summary>Returns the document frequency of the given class term in the class field.</summary>
+        /// <param name="countedClass">the class to count</param>
+        private int DocCount(BytesRef countedClass)
+        {
+            return _atomicReader.DocFreq(new Term(_classFieldName, countedClass));
+        }
+    }
+}
\ No newline at end of file


Mime
View raw message