lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject lucene-solr:master: LUCENE-7287: normalize Ukrainian morfologik dictionary to have unique token+lemma pairs
Date Wed, 06 Jul 2016 10:17:48 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/master 032e31aea -> bc502bd9c


LUCENE-7287: normalize Ukrainian morfologik dictionary to have unique token+lemma pairs


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/bc502bd9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/bc502bd9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/bc502bd9

Branch: refs/heads/master
Commit: bc502bd9c91669cec72f40fd6fc13b6a68e90c52
Parents: 032e31a
Author: Mike McCandless <mikemccand@apache.org>
Authored: Wed Jul 6 06:17:32 2016 -0400
Committer: Mike McCandless <mikemccand@apache.org>
Committed: Wed Jul 6 06:17:32 2016 -0400

----------------------------------------------------------------------
 .../apache/lucene/analysis/uk/mapping_uk.txt    |  19 +++++++++++++++++++
 .../apache/lucene/analysis/uk/ukrainian.dict    | Bin 1707759 -> 1989243 bytes
 .../analysis/uk/TestUkrainianAnalyzer.java      |  19 +++++++++++++------
 3 files changed, 32 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bc502bd9/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt
new file mode 100644
index 0000000..1142604
--- /dev/null
+++ b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt
@@ -0,0 +1,19 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This map normalizes some characters used in Ukrainian text
+"\u2019" => "'"
+"\u02BC" => "'"
+
+# Remove accent
+"\u0301" => ""

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bc502bd9/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict
index 679e392..2468970 100644
Binary files a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict and b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bc502bd9/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
index 87d3be5..a38fc63 100644
--- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
+++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java
@@ -37,22 +37,29 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase {
 
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new UkrainianMorfologikAnalyzer();
-    assertAnalyzesTo(a, "Ця п'єса у свою чергу рухається по колу.",
-                     new String[] { "п'єса", "черга", "рухатися", "кола", "коло", "коло", "кіл", "кіл" });
+    assertAnalyzesTo(a, "Ця п'єса, у свою чергу, рухається по емоційно-напруженому колу за ритм-енд-блюзом.",
+                     new String[] { "п'єса", "черга", "рухатися", "емоційно", "напружений", "кола", "коло", "кіл", "ритм", "енд", "блюз" });
     a.close();
   }
 
   public void testSpecialCharsTokenStream() throws Exception {
     Analyzer a = new UkrainianMorfologikAnalyzer();
-    assertAnalyzesTo(a, "Ця пʼєса, у сво́ю чергу, рухається по колу.",
-                     new String[] { "п'єса", "черга", "рухатися", "кола", "коло", "коло", "кіл", "кіл" });
+    assertAnalyzesTo(a, "Ця пʼєса, у сво́ю чергу рухається.",
+                     new String[] { "п'єса", "черга", "рухатися" });
     a.close();
   }
 
   public void testCapsTokenStream() throws Exception {
     Analyzer a = new UkrainianMorfologikAnalyzer();
-    assertAnalyzesTo(a, "Цей Чайковський.",
-                     new String[] { "чайковський" });
+    assertAnalyzesTo(a, "Цей Чайковський і Ґете.",
+                     new String[] { "чайковський", "ґете" });
+    a.close();
+  }
+
+  public void testSampleSentence() throws Exception {
+    Analyzer a = new UkrainianMorfologikAnalyzer();
+    assertAnalyzesTo(a, "Це — проект генерування словника з тегами частин мови для української мови.",
+                     new String[] { "проект", "генерування", "словник", "тег", "частина", "мова", "українська", "український", "мова" });
     a.close();
   }
 


Mime
View raw message