lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d...@apache.org
Subject svn commit: r1069573 [2/3] - in /incubator/lucene.net: tags/Lucene.Net_2_9_2/contrib/Analyzers/ tags/Lucene.Net_2_9_2/contrib/Analyzers/BR/ tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/ tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net...
Date Thu, 10 Feb 2011 21:17:45 GMT
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj Thu Feb 10 21:17:43 2011
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProductVersion>9.0.21022</ProductVersion>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{4286E961-9143-4821-B46D-3D39D3736386}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Analyzers</RootNamespace>
+    <AssemblyName>Lucene.Net.Analyzers</AssemblyName>
+    <TargetFrameworkVersion>v2.0</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="Lucene.Net, Version=2.9.2.2, Culture=neutral, processorArchitecture=MSIL">
+      <SpecificVersion>False</SpecificVersion>
+      <HintPath>..\Lucene.Net.dll</HintPath>
+    </Reference>
+    <Reference Include="System" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="AR\ArabicAnalyzer.cs" />
+    <Compile Include="AR\ArabicLetterTokenizer.cs" />
+    <Compile Include="AR\ArabicNormalizationFilter.cs" />
+    <Compile Include="AR\ArabicNormalizer.cs" />
+    <Compile Include="AR\ArabicStemFilter.cs" />
+    <Compile Include="AR\ArabicStemmer.cs" />
+    <Compile Include="BR\BrazilianAnalyzer.cs" />
+    <Compile Include="BR\BrazilianStemFilter.cs" />
+    <Compile Include="BR\BrazilianStemmer.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <EmbeddedResource Include="AR\ArabicStopWords.txt" />
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user Thu Feb 10 21:17:43 2011
@@ -0,0 +1 @@
+<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003" />
\ No newline at end of file

Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following 
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Analyzers")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Lucene.Net.Analyzers")]
+[assembly: AssemblyCopyright("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible 
+// to COM components.  If you need to access a type in this assembly from 
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("36a962fb-a8be-4238-88c4-32568216e247")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version 
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers 
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("2.9.2.1")]
+[assembly: AssemblyFileVersion("2.9.2.1")]

Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     * NUnit tests for ArabicAnalyzer: construction, conflation of sample word
+     * forms, analyzer reuse, non-Arabic input and custom stop words.
+     */
+    [TestFixture]
+    public class TestArabicAnalyzer : BaseTokenStreamTestCase
+    {
+
+        /** Smoke test: the default constructor must not throw. NOTE(review): the Java original
+         * failed with NPE when the stopwords file was missing; confirm the failure mode in .NET. */
+        [Test]
+        public void TestResourcesAvailable()
+        {
+            new ArabicAnalyzer();
+        }
+
+        /**
+         * Related surface forms (see per-line notes) analyze to the same term; default stop words are dropped.
+         */
+        [Test]
+        public void TestBasicFeatures()
+        {
+            ArabicAnalyzer a = new ArabicAnalyzer();
+            AssertAnalyzesTo(a, "كبير", new String[] { "كبير" });
+            AssertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker
+
+            AssertAnalyzesTo(a, "مشروب", new String[] { "مشروب" });
+            AssertAnalyzesTo(a, "مشروبات", new String[] { "مشروب" }); // plural -at
+
+            AssertAnalyzesTo(a, "أمريكيين", new String[] { "امريك" }); // plural -in
+            AssertAnalyzesTo(a, "امريكي", new String[] { "امريك" }); // singular with bare alif
+
+            AssertAnalyzesTo(a, "كتاب", new String[] { "كتاب" });
+            AssertAnalyzesTo(a, "الكتاب", new String[] { "كتاب" }); // definite article
+
+            AssertAnalyzesTo(a, "ما ملكت أيمانكم", new String[] { "ملكت", "ايمانكم" });
+            AssertAnalyzesTo(a, "الذين ملكت أيمانكم", new String[] { "ملكت", "ايمانكم" }); // stopwords
+        }
+
+        /**
+         * Runs two inputs through one analyzer instance via AssertAnalyzesToReuse (presumably the reusable-stream path).
+         */
+        [Test]
+        public void TestReusableTokenStream()
+        {
+            ArabicAnalyzer a = new ArabicAnalyzer();
+            AssertAnalyzesToReuse(a, "كبير", new String[] { "كبير" });
+            AssertAnalyzesToReuse(a, "كبيرة", new String[] { "كبير" }); // feminine marker
+        }
+
+        /**
+         * Non-Arabic text is lower-cased and split on punctuation, similar to SimpleAnalyzer.
+         */
+        [Test]
+        public void TestEnglishInput()
+        {
+            AssertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] {
+        "english", "text" });
+        }
+
+        /**
+         * Custom stop words replace the defaults and match case-insensitively ("The" is removed by "the").
+         */
+        [Test]
+        public void TestCustomStopwords()
+        {
+            ArabicAnalyzer a = new ArabicAnalyzer(new String[] { "the", "and", "a" });
+            AssertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick", "brown", "fox" });
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     * NUnit tests for ArabicNormalizationFilter. Each test passes one word through
+     * ArabicLetterTokenizer and the filter and expects a single normalized token.
+     */
+    [TestFixture]
+    public class TestArabicNormalizationFilter : BaseTokenStreamTestCase
+    {
+
+        [Test]
+        public void TestAlifMadda()
+        {
+            Check("آجن", "اجن");
+        }
+
+        [Test]
+        public void TestAlifHamzaAbove()
+        {
+            Check("أحمد", "احمد");
+        }
+
+        [Test]
+        public void TestAlifHamzaBelow()
+        {
+            Check("إعاذ", "اعاذ");
+        }
+
+        [Test]
+        public void TestAlifMaksura()
+        {
+            Check("بنى", "بني");
+        }
+
+        [Test]
+        public void TestTehMarbuta()
+        {
+            Check("فاطمة", "فاطمه");
+        }
+
+        [Test]
+        public void TestTatweel()
+        {
+            Check("روبرـــــت", "روبرت");
+        }
+
+        [Test]
+        public void TestFatha()
+        {
+            Check("مَبنا", "مبنا");
+        }
+
+        [Test]
+        public void TestKasra()
+        {
+            Check("علِي", "علي");
+        }
+
+        [Test]
+        public void TestDamma()
+        {
+            Check("بُوات", "بوات");
+        }
+
+        [Test]
+        public void TestFathatan()
+        {
+            Check("ولداً", "ولدا");
+        }
+
+        [Test]
+        public void TestKasratan()
+        {
+            Check("ولدٍ", "ولد");
+        }
+
+        [Test]
+        public void TestDammatan()
+        {
+            Check("ولدٌ", "ولد");
+        }
+
+        [Test]
+        public void TestSukun()
+        {
+            Check("نلْسون", "نلسون");
+        }
+
+        [Test]
+        public void TestShaddah()
+        {
+            Check("هتميّ", "هتمي");
+        }
+
+        private void Check(string input, string expected) // tokenize input, normalize, expect exactly one token
+        {
+            ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+            ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
+            AssertTokenStreamContents(filter, new String[] { expected });
+        }
+
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+using NUnit.Framework;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     * NUnit tests for ArabicStemFilter. Each test passes one word through
+     * ArabicLetterTokenizer and the stem filter and expects a single stemmed token.
+     */
+    [NUnit.Framework.TestFixture]
+    public class TestArabicStemFilter : BaseTokenStreamTestCase
+    {
+        [Test]
+        public void TestAlPrefix()
+        {
+            Check("الحسن", "حسن");
+        }
+
+        [Test]
+        public void TestWalPrefix()
+        {
+            Check("والحسن", "حسن");
+        }
+
+        [Test]
+        public void TestBalPrefix()
+        {
+            Check("بالحسن", "حسن");
+        }
+
+        [Test]
+        public void TestKalPrefix()
+        {
+            Check("كالحسن", "حسن");
+        }
+
+        [Test]
+        public void TestFalPrefix()
+        {
+            Check("فالحسن", "حسن");
+        }
+
+        [Test]
+        public void TestLlPrefix()
+        {
+            Check("للاخر", "اخر");
+        }
+
+        [Test]
+        public void TestWaPrefix()
+        {
+            Check("وحسن", "حسن");
+        }
+
+        [Test]
+        public void TestAhSuffix()
+        {
+            Check("زوجها", "زوج");
+        }
+
+        [Test]
+        public void TestAnSuffix()
+        {
+            Check("ساهدان", "ساهد");
+        }
+
+        [Test]
+        public void TestAtSuffix()
+        {
+            Check("ساهدات", "ساهد");
+        }
+
+        [Test]
+        public void TestWnSuffix()
+        {
+            Check("ساهدون", "ساهد");
+        }
+
+        [Test]
+        public void TestYnSuffix()
+        {
+            Check("ساهدين", "ساهد");
+        }
+
+        [Test]
+        public void TestYhSuffix()
+        {
+            Check("ساهديه", "ساهد");
+        }
+
+        [Test]
+        public void TestYpSuffix()
+        {
+            Check("ساهدية", "ساهد");
+        }
+
+        [Test]
+        public void TestHSuffix()
+        {
+            Check("ساهده", "ساهد");
+        }
+
+        [Test]
+        public void TestPSuffix()
+        {
+            Check("ساهدة", "ساهد");
+        }
+
+        [Test]
+        public void TestYSuffix()
+        {
+            Check("ساهدي", "ساهد");
+        }
+
+        [Test]
+        public void TestComboPrefSuf()
+        {
+            Check("وساهدون", "ساهد");
+        }
+
+        [Test]
+        public void TestComboSuf()
+        {
+            Check("ساهدهات", "ساهد");
+        }
+
+        [Test]
+        public void TestShouldntStem()
+        {
+            Check("الو", "الو");
+        }
+
+        [Test]
+        public void TestNonArabic()
+        {
+            Check("English", "English");
+        }
+
+        private void Check(string input, string expected) // tokenize input, stem, expect exactly one token
+        {
+            ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+            ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
+            AssertTokenStreamContents(filter, new String[] { expected });
+        }
+
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Properties/AssemblyInfo.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Properties/AssemblyInfo.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Properties/AssemblyInfo.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following 
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Analyzers.Test")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Lucene.Net.Analyzers.Test")]
+[assembly: AssemblyCopyright("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible 
+// to COM components.  If you need to access a type in this assembly from 
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("36a962fb-a8be-4238-88c4-32568216e247")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version 
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers 
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("2.9.2.1")]
+[assembly: AssemblyFileVersion("2.9.2.1")]

Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Test.csproj?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Test.csproj (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Test.csproj Thu Feb 10 21:17:43 2011
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProductVersion>9.0.21022</ProductVersion>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{67D27628-F1D5-4499-9818-B669731925C8}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Analyzers</RootNamespace>
+    <AssemblyName>Lucene.Net.Analyzers.Test</AssemblyName>
+    <TargetFrameworkVersion>v2.0</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="Lucene.Net, Version=2.9.2.2, Culture=neutral, processorArchitecture=MSIL">
+      <SpecificVersion>False</SpecificVersion>
+      <HintPath>..\Lucene.Net.dll</HintPath>
+    </Reference>
+    <Reference Include="Lucene.Net.Test, Version=2.9.2.1, Culture=neutral, processorArchitecture=MSIL">
+      <SpecificVersion>False</SpecificVersion>
+      <HintPath>..\Lucene.Net.Test.dll</HintPath>
+    </Reference>
+    <Reference Include="nunit.framework, Version=2.5.2.9222, Culture=neutral, PublicKeyToken=96d09a1eb7f44a77, processorArchitecture=MSIL" />
+    <Reference Include="System" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="AR\TestArabicAnalyzer.cs" />
+    <Compile Include="AR\TestArabicNormalizationFilter.cs" />
+    <Compile Include="AR\TestArabicStemFilter.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Lucene.Net.Analyzers\Lucene.Net.Analyzers.csproj">
+      <Project>{4286E961-9143-4821-B46D-3D39D3736386}</Project>
+      <Name>Lucene.Net.Analyzers</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers.sln?rev=1069573&r1=1069572&r2=1069573&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln (original)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln Thu Feb 10 21:17:43 2011
@@ -1,7 +1,9 @@
 
-Microsoft Visual Studio Solution File, Format Version 9.00
-# Visual C# Express 2005
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analyzers", "Lucene.Net.Analyzers.csproj", "{A4AF790F-900A-48D2-85A7-B948E5214C16}"
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C# Express 2008
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analyzers", "Lucene.Net.Analyzers\Lucene.Net.Analyzers.csproj", "{4286E961-9143-4821-B46D-3D39D3736386}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{67D27628-F1D5-4499-9818-B669731925C8}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -9,10 +11,14 @@ Global
 		Release|Any CPU = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{A4AF790F-900A-48D2-85A7-B948E5214C16}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{A4AF790F-900A-48D2-85A7-B948E5214C16}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{A4AF790F-900A-48D2-85A7-B948E5214C16}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{A4AF790F-900A-48D2-85A7-B948E5214C16}.Release|Any CPU.Build.0 = Release|Any CPU
+		{4286E961-9143-4821-B46D-3D39D3736386}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{4286E961-9143-4821-B46D-3D39D3736386}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{4286E961-9143-4821-B46D-3D39D3736386}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{4286E961-9143-4821-B46D-3D39D3736386}.Release|Any CPU.Build.0 = Release|Any CPU
+		{67D27628-F1D5-4499-9818-B669731925C8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{67D27628-F1D5-4499-9818-B669731925C8}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,202 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.AR
+{
+    /**
+     * {@link Analyzer} for Arabic.
+     * <p>
+     * This analyzer implements light-stemming as specified by:
+     * <i>
+     * Light Stemming for Arabic Information Retrieval
+     * </i>
+     * http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
+     * <p>
+     * The analysis package contains three primary components:
+     * <ul>
+     *  <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
+     *  <li>{@link ArabicStemFilter}: Arabic light stemming.
+     *  <li>Arabic stop words file: a set of default Arabic stop words.
+     * </ul>
+     *
+     */
+    public class ArabicAnalyzer : Analyzer
+    {
+
+        /**
+         * Name of the embedded resource containing the default Arabic stopwords.
+         * It is looked up under the "Lucene.Net.Analyzers.AR." resource prefix.
+         * 
+         * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+         * The stopword list is BSD-Licensed.
+         */
+        public static string DEFAULT_STOPWORD_FILE = "ArabicStopWords.txt";
+
+        /**
+         * Contains the stopwords used with the StopFilter.
+         */
+        private Hashtable stoptable = new Hashtable();
+        /**
+         * The comment character in the stopwords file.  All lines prefixed with this will be ignored.
+         */
+        public static string STOPWORDS_COMMENT = "#";
+
+        private Version matchVersion;
+
+        /**
+         * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+         *
+         * @deprecated Use {@link #ArabicAnalyzer(Version)} instead
+         */
+        public ArabicAnalyzer() : this(Version.LUCENE_24)
+        {
+        }
+
+        /**
+         * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+         *
+         * @param matchVersion version passed to StopFilter.GetEnablePositionIncrementsVersionDefault
+         *                     when the StopFilter is created
+         */
+        public ArabicAnalyzer(Version matchVersion)
+        {
+            this.matchVersion = matchVersion;
+            LoadDefaultStopWords();
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         *
+         * @deprecated Use {@link #ArabicAnalyzer(Version, String[])} instead
+         */
+        public ArabicAnalyzer(string[] stopwords): this(Version.LUCENE_24, stopwords)
+        {
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         */
+        public ArabicAnalyzer(Version matchVersion, string[] stopwords)
+        {
+            stoptable = StopFilter.MakeStopSet(stopwords);
+            this.matchVersion = matchVersion;
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         *
+         * @deprecated Use {@link #ArabicAnalyzer(Version, Hashtable)} instead
+         */
+        public ArabicAnalyzer(Hashtable stopwords) : this(Version.LUCENE_24, stopwords)
+        {
+        }
+
+        /**
+         * Builds an analyzer with the given stop words. The table is copied, so later
+         * changes to the caller's Hashtable do not affect this analyzer.
+         */
+        public ArabicAnalyzer(Version matchVersion, Hashtable stopwords)
+        {
+            stoptable = new Hashtable(stopwords);
+            this.matchVersion = matchVersion;
+        }
+
+        //DIGY
+        ///**
+        // * Builds an analyzer with the given stop words.  Lines can be commented out using {@link #STOPWORDS_COMMENT}
+        // *
+        // * @deprecated Use {@link #ArabicAnalyzer(Version, File)} instead
+        // */
+        //public ArabicAnalyzer(File stopwords)
+        //{
+        //    this(Version.LUCENE_24, stopwords);
+        //}
+
+        ///**
+        // * Builds an analyzer with the given stop words.  Lines can be commented out using {@link #STOPWORDS_COMMENT}
+        // */
+        //public ArabicAnalyzer(Version matchVersion, File stopwords)
+        //{
+        //    stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
+        //    this.matchVersion = matchVersion;
+        //}
+
+        /**
+         * Fills the stop table from the embedded {@link #DEFAULT_STOPWORD_FILE} resource.
+         * <p>
+         * Lines starting with {@link #STOPWORDS_COMMENT} and blank lines are skipped, words are
+         * trimmed, and a word that occurs more than once is simply stored once.
+         */
+        private void LoadDefaultStopWords()
+        {
+            // Resolve against the assembly that declares ArabicAnalyzer (not this.GetType()),
+            // so the resource is still found when a subclass lives in another assembly.
+            Stream resource = typeof(ArabicAnalyzer).Assembly.GetManifestResourceStream("Lucene.Net.Analyzers.AR." + DEFAULT_STOPWORD_FILE);
+            string comment = STOPWORDS_COMMENT;
+
+            // Disposing the reader also disposes the underlying resource stream.
+            using (StreamReader reader = new StreamReader(resource))
+            {
+                string line;
+                while ((line = reader.ReadLine()) != null)
+                {
+                    // Ordinal comparison: the comment marker is a literal prefix, not linguistic text.
+                    if (!string.IsNullOrEmpty(comment) && line.StartsWith(comment, System.StringComparison.Ordinal))
+                        continue;
+
+                    string word = line.Trim();
+                    if (word.Length == 0)
+                        continue;
+
+                    // The indexer overwrites instead of throwing like Hashtable.Add does on a repeated key.
+                    stoptable[word] = word;
+                }
+            }
+        }
+
+        /**
+         * Wraps the given stream in the analysis chain shared by {@link #TokenStream} and
+         * {@link #ReusableTokenStream}: {@link LowerCaseFilter}, {@link StopFilter},
+         * {@link ArabicNormalizationFilter} and {@link ArabicStemFilter}, in that order.
+         */
+        private TokenStream BuildFilterChain(TokenStream source)
+        {
+            TokenStream result = new LowerCaseFilter(source);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stoptable);
+            result = new ArabicNormalizationFilter(result);
+            result = new ArabicStemFilter(result);
+            return result;
+        }
+
+        /**
+         * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+         *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
+         *            and {@link ArabicStemFilter}.
+         */
+        public override TokenStream TokenStream(string fieldName, TextReader reader)
+        {
+            return BuildFilterChain(new ArabicLetterTokenizer(reader));
+        }
+
+        /**
+         * Holder for the tokenizer and the end of the filter chain kept between
+         * calls to {@link #ReusableTokenStream}.
+         */
+        private class SavedStreams
+        {
+            internal Tokenizer Source;
+            internal TokenStream Result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+         *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
+         *            and {@link ArabicStemFilter}.
+         */
+        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+            if (streams != null)
+            {
+                // A chain was saved earlier: only point its tokenizer at the new reader.
+                streams.Source.Reset(reader);
+                return streams.Result;
+            }
+
+            streams = new SavedStreams();
+            streams.Source = new ArabicLetterTokenizer(reader);
+            streams.Result = BuildFilterChain(streams.Source);
+            SetPreviousTokenStream(streams);
+            return streams.Result;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.AR
+{
+
+    /**
+     * Tokenizer that breaks text into runs of letters and diacritics.
+     * <p>
+     * The problem with the standard Letter tokenizer is that it fails on diacritics.
+     * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
+     * </p>
+     *
+     */
+    public class ArabicLetterTokenizer : LetterTokenizer
+    {
+
+        /** Creates a tokenizer over the given input; the reader is handed to the base tokenizer. */
+        public ArabicLetterTokenizer(TextReader @in)
+            : base(@in)
+        {
+        }
+
+        /** Creates a tokenizer over the given input, handing the attribute source to the base tokenizer. */
+        public ArabicLetterTokenizer(AttributeSource source, TextReader @in)
+            : base(source, @in)
+        {
+        }
+
+        /** Creates a tokenizer over the given input, handing the attribute factory to the base tokenizer. */
+        public ArabicLetterTokenizer(AttributeFactory factory, TextReader @in)
+            : base(factory, @in)
+        {
+        }
+
+        /** 
+         * Accepts every character the base tokenizer accepts, plus characters whose
+         * Unicode category is NonSpacingMark.
+         * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
+         */
+        protected override bool IsTokenChar(char c)
+        {
+            if (base.IsTokenChar(c))
+                return true;
+
+            return System.Globalization.UnicodeCategory.NonSpacingMark == char.GetUnicodeCategory(c);
+        }
+
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+    /**
+     * A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography.
+     * 
+     */
+
+    public class ArabicNormalizationFilter : TokenFilter
+    {
+
+        protected ArabicNormalizer normalizer = null;
+        private TermAttribute termAtt;
+
+        /**
+         * Wraps the given stream, creating the normalizer and registering the term attribute
+         * that is rewritten for each token.
+         */
+        public ArabicNormalizationFilter(TokenStream input) : base(input)
+        {
+            this.normalizer = new ArabicNormalizer();
+            this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+        }
+
+        /**
+         * Advances the wrapped stream and normalizes the current term buffer in place.
+         *
+         * @return false when the wrapped stream has no more tokens, true otherwise
+         */
+        public override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+                return false;
+
+            char[] buffer = termAtt.TermBuffer();
+            int length = termAtt.TermLength();
+
+            // The normalizer edits the buffer in place and reports the shortened length.
+            termAtt.SetTermLength(normalizer.Normalize(buffer, length));
+            return true;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+    /**
+     *  Normalizer for Arabic.
+     *  <p>
+     *  Normalization is done in-place for efficiency, operating on a termbuffer.
+     *  <p>
+     *  Normalization is defined as:
+     *  <ul>
+     *  <li> Normalization of hamza with alef seat to a bare alef.
+     *  <li> Normalization of teh marbuta to heh
+     *  <li> Normalization of dotless yeh (alef maksura) to yeh.
+     *  <li> Removal of Arabic diacritics (the harakat)
+     *  <li> Removal of tatweel (stretching character).
+     * </ul>
+     *
+     */
+    public class ArabicNormalizer
+    {
+        public static char ALEF = '\u0627';
+        public static char ALEF_MADDA = '\u0622';
+        public static char ALEF_HAMZA_ABOVE = '\u0623';
+        public static char ALEF_HAMZA_BELOW = '\u0625';
+
+        public static char YEH = '\u064A';
+        public static char DOTLESS_YEH = '\u0649';
+
+        public static char TEH_MARBUTA = '\u0629';
+        public static char HEH = '\u0647';
+
+        public static char TATWEEL = '\u0640';
+
+        public static char FATHATAN = '\u064B';
+        public static char DAMMATAN = '\u064C';
+        public static char KASRATAN = '\u064D';
+        public static char FATHA = '\u064E';
+        public static char DAMMA = '\u064F';
+        public static char KASRA = '\u0650';
+        public static char SHADDA = '\u0651';
+        public static char SUKUN = '\u0652';
+
+        /**
+         * Normalize an input buffer of Arabic text
+         * 
+         * @param s input buffer, modified in place
+         * @param len number of characters of the buffer to process
+         * @return length of input buffer after normalization
+         */
+        public int Normalize(char[] s, int len)
+        {
+            int index = 0;
+            while (index < len)
+            {
+                char current = s[index];
+
+                // The three replacements are applied one after another to the current value.
+                if (current == ALEF_MADDA || current == ALEF_HAMZA_ABOVE || current == ALEF_HAMZA_BELOW)
+                    current = ALEF;
+                if (current == DOTLESS_YEH)
+                    current = YEH;
+                if (current == TEH_MARBUTA)
+                    current = HEH;
+                s[index] = current;
+
+                if (IsRemovable(current))
+                {
+                    // Stay on the same index: the next character has been shifted into it.
+                    len = Delete(s, index, len);
+                }
+                else
+                {
+                    index++;
+                }
+            }
+
+            return len;
+        }
+
+        /**
+         * Returns true for the characters that Normalize removes: tatweel and the
+         * diacritics listed in the constants above.
+         */
+        private static bool IsRemovable(char c)
+        {
+            return c == TATWEEL
+                || c == KASRATAN || c == DAMMATAN || c == FATHATAN
+                || c == FATHA || c == DAMMA || c == KASRA
+                || c == SHADDA || c == SUKUN;
+        }
+
+        /**
+         * Delete a character in-place
+         * 
+         * @param s Input Buffer
+         * @param pos Position of character to delete
+         * @param len length of input buffer
+         * @return length of input buffer after deletion
+         */
+        protected int Delete(char[] s, int pos, int len)
+        {
+            int shortened = len - 1;
+
+            // Shift everything after pos one slot to the left; nothing is moved when pos is at or past len.
+            if (pos < len)
+                Array.Copy(s, pos + 1, s, pos, shortened - pos);
+
+            return shortened;
+        }
+
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words.
+     * 
+     */
+
+    public class ArabicStemFilter : TokenFilter
+    {
+
+        protected ArabicStemmer stemmer = null;
+        private TermAttribute termAtt;
+
+        /**
+         * Wraps the given stream, creating the stemmer and registering the term attribute
+         * that is rewritten for each token.
+         */
+        public ArabicStemFilter(TokenStream input) : base(input)
+        {
+            this.stemmer = new ArabicStemmer();
+            this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+        }
+
+        /**
+         * Advances the wrapped stream and stems the current term buffer in place.
+         *
+         * @return false when the wrapped stream has no more tokens, true otherwise
+         */
+        public override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+                return false;
+
+            char[] buffer = termAtt.TermBuffer();
+            int length = termAtt.TermLength();
+
+            // The stemmer edits the buffer in place and reports the shortened length.
+            termAtt.SetTermLength(stemmer.Stem(buffer, length));
+            return true;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,208 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     *  Stemmer for Arabic.
+     *  <p>
+     *  Stemming  is done in-place for efficiency, operating on a termbuffer.
+     *  <p>
+     *  Stemming is defined as:
+     *  <ul>
+     *  <li> Removal of attached definite article, conjunction, and prepositions.
+     *  <li> Stemming of common suffixes.
+     * </ul>
+     *
+     */
+    public class ArabicStemmer
+    {
+        public static char ALEF = '\u0627';
+        public static char BEH = '\u0628';
+        public static char TEH_MARBUTA = '\u0629';
+        public static char TEH = '\u062A';
+        public static char FEH = '\u0641';
+        public static char KAF = '\u0643';
+        public static char LAM = '\u0644';
+        public static char NOON = '\u0646';
+        public static char HEH = '\u0647';
+        public static char WAW = '\u0648';
+        public static char YEH = '\u064A';
+
+        /** Prefixes tried by StemPrefix, in order; only the first match is removed. */
+        public static char[][] prefixes = {
+            new char[] { ALEF, LAM },
+            new char[] { WAW, ALEF, LAM },
+            new char[] { BEH, ALEF, LAM },
+            new char[] { KAF, ALEF, LAM },
+            new char[] { FEH, ALEF, LAM },
+            new char[] { LAM, LAM },
+            new char[] { WAW },
+        };
+
+        /** Suffixes tried by StemSuffix, in order; every one that matches is removed. */
+        public static char[][] suffixes = {
+            new char[] { HEH, ALEF },
+            new char[] { ALEF, NOON },
+            new char[] { ALEF, TEH },
+            new char[] { WAW, NOON },
+            new char[] { YEH, NOON },
+            new char[] { YEH, HEH },
+            new char[] { YEH, TEH_MARBUTA },
+            new char[] { HEH },
+            new char[] { TEH_MARBUTA },
+            new char[] { YEH },
+        };
+
+
+        /**
+         * Stem an input buffer of Arabic text: a prefix is removed first, then suffixes.
+         * 
+         * @param s input buffer, modified in place
+         * @param len length of input buffer
+         * @return length of input buffer after stemming
+         */
+        public int Stem(char[] s, int len)
+        {
+            int afterPrefix = StemPrefix(s, len);
+            return StemSuffix(s, afterPrefix);
+        }
+
+        /**
+         * Stem a prefix off an Arabic word.
+         * @param s input buffer
+         * @param len length of input buffer
+         * @return new length of input buffer after stemming.
+         */
+        public int StemPrefix(char[] s, int len)
+        {
+            foreach (char[] prefix in prefixes)
+            {
+                // Stop at the first prefix that applies.
+                if (StartsWith(s, len, prefix))
+                    return DeleteN(s, 0, len, prefix.Length);
+            }
+
+            return len;
+        }
+
+        /**
+         * Stem suffix(es) off an Arabic word.
+         * @param s input buffer
+         * @param len length of input buffer
+         * @return new length of input buffer after stemming
+         */
+        public int StemSuffix(char[] s, int len)
+        {
+            foreach (char[] suffix in suffixes)
+            {
+                // No early exit: later suffixes are tested against the already shortened word.
+                if (EndsWith(s, len, suffix))
+                    len = DeleteN(s, len - suffix.Length, len, suffix.Length);
+            }
+
+            return len;
+        }
+
+        /**
+         * Returns true if the prefix matches and can be stemmed
+         * @param s input buffer
+         * @param len length of input buffer
+         * @param prefix prefix to check
+         * @return true if the prefix matches and can be stemmed
+         */
+        bool StartsWith(char[] s, int len, char[] prefix)
+        {
+            // A one-character prefix needs a word of at least 4 characters (3 left after removal);
+            // longer prefixes need at least 2 characters left after removal.
+            int minimumLength = prefix.Length == 1 ? 4 : prefix.Length + 2;
+            if (len < minimumLength)
+                return false;
+
+            for (int k = 0; k < prefix.Length; k++)
+            {
+                if (s[k] != prefix[k])
+                    return false;
+            }
+
+            return true;
+        }
+
+        /**
+         * Returns true if the suffix matches and can be stemmed
+         * @param s input buffer
+         * @param len length of input buffer
+         * @param suffix suffix to check
+         * @return true if the suffix matches and can be stemmed
+         */
+        bool EndsWith(char[] s, int len, char[] suffix)
+        {
+            // All suffixes require at least 2 characters left after stemming.
+            if (len < suffix.Length + 2)
+                return false;
+
+            int start = len - suffix.Length;
+            for (int k = 0; k < suffix.Length; k++)
+            {
+                if (s[start + k] != suffix[k])
+                    return false;
+            }
+
+            return true;
+        }
+
+
+        /**
+         * Delete n characters in-place
+         * 
+         * @param s Input Buffer
+         * @param pos Position of the first character to delete
+         * @param len Length of input buffer
+         * @param nChars number of characters to delete
+         * @return length of input buffer after deletion
+         */
+        protected int DeleteN(char[] s, int pos, int len, int nChars)
+        {
+            int remaining = len;
+
+            // Remove one character at a time at the same position.
+            while (nChars-- > 0)
+                remaining = Delete(s, pos, remaining);
+
+            return remaining;
+        }
+
+        /**
+         * Delete a character in-place
+         * 
+         * @param s Input Buffer
+         * @param pos Position of character to delete
+         * @param len length of input buffer
+         * @return length of input buffer after deletion
+         */
+        protected int Delete(char[] s, int pos, int len)
+        {
+            int shortened = len - 1;
+
+            // Shift everything after pos one slot to the left; nothing is moved when pos is at or past len.
+            if (pos < len)
+                Array.Copy(s, pos + 1, s, pos, shortened - pos);
+
+            return shortened;
+        }
+
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt?rev=1069573&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using System.IO;
+
+/**
+ * Analyzer for Brazilian language. Supports an external list of stopwords (words that
+ * will not be indexed at all) and an external list of exclusions (words that will
+ * not be stemmed, but indexed).
+ *
+ */
+namespace Lucene.Net.Analysis.BR
+{
+    public sealed class BrazilianAnalyzer : Analyzer
+    {
+
+        /**
+         * List of typical Brazilian stopwords.
+         *
+         * NOTE(review): "nas" and "pelas" each appear twice and "propios" looks like a
+         * misspelling; the entries are kept exactly as they are.
+         */
+        public static string[] BRAZILIAN_STOP_WORDS = {
+      "a","ainda","alem","ambas","ambos","antes",
+      "ao","aonde","aos","apos","aquele","aqueles",
+      "as","assim","com","como","contra","contudo",
+      "cuja","cujas","cujo","cujos","da","das","de",
+      "dela","dele","deles","demais","depois","desde",
+      "desta","deste","dispoe","dispoem","diversa",
+      "diversas","diversos","do","dos","durante","e",
+      "ela","elas","ele","eles","em","entao","entre",
+      "essa","essas","esse","esses","esta","estas",
+      "este","estes","ha","isso","isto","logo","mais",
+      "mas","mediante","menos","mesma","mesmas","mesmo",
+      "mesmos","na","nas","nao","nas","nem","nesse","neste",
+      "nos","o","os","ou","outra","outras","outro","outros",
+      "pelas","pelas","pelo","pelos","perante","pois","por",
+      "porque","portanto","proprio","propios","quais","qual",
+      "qualquer","quando","quanto","que","quem","quer","se",
+      "seja","sem","sendo","seu","seus","sob","sobre","sua",
+      "suas","tal","tambem","teu","teus","toda","todas","todo",
+      "todos","tua","tuas","tudo","um","uma","umas","uns"};
+
+
+        /**
+         * Stop words handed to the StopFilter.
+         */
+        private Hashtable stoptable = new Hashtable();
+
+        /**
+         * Words handed to the BrazilianStemFilter as its exclusion table
+         * (indexed, but not stemmed).
+         */
+        private Hashtable excltable = new Hashtable();
+
+        /**
+         * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
+         */
+        public BrazilianAnalyzer()
+        {
+            this.stoptable = StopFilter.MakeStopSet(BRAZILIAN_STOP_WORDS);
+        }
+
+        /**
+         * Builds an analyzer whose stop words are taken from the given array.
+         */
+        public BrazilianAnalyzer(string[] stopwords)
+        {
+            this.stoptable = StopFilter.MakeStopSet(stopwords);
+        }
+
+        /**
+         * Builds an analyzer that uses the given table as its stop words.
+         * The table is stored as passed in, not copied.
+         */
+        public BrazilianAnalyzer(Hashtable stopwords)
+        {
+            this.stoptable = stopwords;
+        }
+
+        /**
+         * Builds an analyzer whose stop words are loaded from the given file
+         * through WordlistLoader.GetWordSet.
+         */
+        public BrazilianAnalyzer(FileInfo stopwords)
+        {
+            this.stoptable = WordlistLoader.GetWordSet(stopwords);
+        }
+
+        /**
+         * Replaces the stem exclusion table with one built from an array of strings.
+         */
+        public void SetStemExclusionTable(string[] exclusionlist)
+        {
+            this.excltable = StopFilter.MakeStopSet(exclusionlist);
+        }
+        /**
+         * Replaces the stem exclusion table with the given Hashtable (stored as passed in).
+         */
+        public void SetStemExclusionTable(Hashtable exclusionlist)
+        {
+            this.excltable = exclusionlist;
+        }
+        /**
+         * Replaces the stem exclusion table with the words contained in the given file.
+         */
+        public void SetStemExclusionTable(FileInfo exclusionlist)
+        {
+            this.excltable = WordlistLoader.GetWordSet(exclusionlist);
+        }
+
+        /**
+         * Creates a TokenStream which tokenizes all the text in the provided Reader.
+         *
+         * @return  A TokenStream built from a StandardTokenizer filtered with
+         *          LowerCaseFilter, StandardFilter, StopFilter and BrazilianStemFilter.
+         */
+        public override TokenStream TokenStream(string fieldName, TextReader reader)
+        {
+            TokenStream tokenizer = new StandardTokenizer(reader);
+            TokenStream lowerCased = new LowerCaseFilter(tokenizer);
+            TokenStream standard = new StandardFilter(lowerCased);
+            TokenStream withoutStopWords = new StopFilter(standard, stoptable);
+
+            return new BrazilianStemFilter(withoutStopWords, excltable);
+        }
+    }
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis;
+using System.Collections;
+
+
+/**
+ * Based on GermanStemFilter
+ *
+ */
+namespace Lucene.Net.Analysis.BR
+{
+
+    public sealed class BrazilianStemFilter : TokenFilter
+    {
+        /**
+         * Stemmer applied to every term that is not excluded.
+         */
+        private BrazilianStemmer stemmer;
+
+        /**
+         * Terms that must not be stemmed; null means nothing is excluded.
+         */
+        private Hashtable exclusions;
+
+        /**
+         * Creates a filter over the given stream without an exclusion table.
+         */
+        public BrazilianStemFilter(TokenStream input)
+            : this(input, null)
+        {
+        }
+
+        /**
+         * Creates a filter over the given stream. Terms contained in
+         * exclusiontable are returned unstemmed; the table may be null.
+         */
+        public BrazilianStemFilter(TokenStream input, Hashtable exclusiontable)
+            : base(input)
+        {
+            stemmer = new BrazilianStemmer();
+            exclusions = exclusiontable;
+        }
+
+        /**
+         * Pulls the next token from the wrapped stream and, unless its term is
+         * in the exclusion table, replaces the term with its stem.
+         *
+         * @return Returns the next token in the stream, or null at EOS.
+         */
+        public override Token Next(Token reusableToken)
+        {
+            System.Diagnostics.Trace.Assert(reusableToken != null);
+
+            Token current = input.Next(reusableToken);
+            if (current == null)
+            {
+                return null;
+            }
+
+            string original = current.TermText();
+
+            // Excluded terms are handed back untouched.
+            bool isExcluded = exclusions != null && exclusions.Contains(original);
+            if (isExcluded)
+            {
+                return current;
+            }
+
+            string stemmed = stemmer.Stem(original);
+
+            // Only rewrite the term buffer when the stemmer produced a different term.
+            if (stemmed != null && !stemmed.Equals(original))
+            {
+                current.SetTermBuffer(stemmed.ToCharArray(), 0, stemmed.Length);
+            }
+
+            return current;
+        }
+    }
+}
\ No newline at end of file



Mime
View raw message