Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 51A01200B36 for ; Wed, 6 Jul 2016 12:17:50 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 503A0160A64; Wed, 6 Jul 2016 10:17:50 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 963F1160A36 for ; Wed, 6 Jul 2016 12:17:49 +0200 (CEST) Received: (qmail 40160 invoked by uid 500); 6 Jul 2016 10:17:48 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 40151 invoked by uid 99); 6 Jul 2016 10:17:48 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 06 Jul 2016 10:17:48 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id A1227E0B66; Wed, 6 Jul 2016 10:17:48 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: mikemccand@apache.org To: commits@lucene.apache.org Message-Id: <66b636e4a8474315bd9f7a6d35db014b@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: lucene-solr:master: LUCENE-7287: normalize Ukrainian morfologik dictionary to have unique token+lemma pairs Date: Wed, 6 Jul 2016 10:17:48 +0000 (UTC) archived-at: Wed, 06 Jul 2016 10:17:50 -0000 Repository: lucene-solr Updated Branches: refs/heads/master 032e31aea -> bc502bd9c LUCENE-7287: normalize Ukrainian morfologik dictionary to have unique token+lemma pairs Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/bc502bd9 Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/bc502bd9 Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/bc502bd9 Branch: refs/heads/master Commit: bc502bd9c91669cec72f40fd6fc13b6a68e90c52 Parents: 032e31a Author: Mike McCandless Authored: Wed Jul 6 06:17:32 2016 -0400 Committer: Mike McCandless Committed: Wed Jul 6 06:17:32 2016 -0400 ---------------------------------------------------------------------- .../apache/lucene/analysis/uk/mapping_uk.txt | 19 +++++++++++++++++++ .../apache/lucene/analysis/uk/ukrainian.dict | Bin 1707759 -> 1989243 bytes .../analysis/uk/TestUkrainianAnalyzer.java | 19 +++++++++++++------ 3 files changed, 32 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bc502bd9/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt ---------------------------------------------------------------------- diff --git a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt new file mode 100644 index 0000000..1142604 --- /dev/null +++ b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/mapping_uk.txt @@ -0,0 +1,19 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# This map normalizes some characters used in Ukrainian text +"\u2019" => "'" +"\u02BC" => "'" + +# Remove accent +"\u0301" => "" http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bc502bd9/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict ---------------------------------------------------------------------- diff --git a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict index 679e392..2468970 100644 Binary files a/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict and b/lucene/analysis/morfologik/src/resources/org/apache/lucene/analysis/uk/ukrainian.dict differ http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bc502bd9/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java index 87d3be5..a38fc63 100644 --- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java +++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java @@ -37,22 +37,29 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase { public void testReusableTokenStream() throws Exception { Analyzer a = new UkrainianMorfologikAnalyzer(); - assertAnalyzesTo(a, "Ця п'єса у свою чергу рухається по колу.", - new String[] { "п'єса", "черга", "рухатися", "кола", "коло", "коло", "кіл", "кіл" }); + assertAnalyzesTo(a, "Ця п'єса, у свою чергу, рухається по емоційно-напруженому колу за ритм-енд-блюзом.", + new String[] { "п'єса", "черга", "рухатися", "емоційно", "напружений", "кола", "коло", "кіл", "ритм", "енд", "блюз" }); a.close(); } public void testSpecialCharsTokenStream() throws Exception { Analyzer a = new UkrainianMorfologikAnalyzer(); - assertAnalyzesTo(a, "Ця пʼєса, у сво́ю чергу, рухається по колу.", - new String[] { "п'єса", "черга", "рухатися", "кола", "коло", "коло", "кіл", "кіл" }); + assertAnalyzesTo(a, "Ця пʼєса, у сво́ю чергу рухається.", + new String[] { "п'єса", "черга", "рухатися" }); a.close(); } public void testCapsTokenStream() throws Exception { Analyzer a = new UkrainianMorfologikAnalyzer(); - assertAnalyzesTo(a, "Цей Чайковський.", - new String[] { "чайковський" }); + assertAnalyzesTo(a, "Цей Чайковський і Ґете.", + new String[] { "чайковський", "ґете" }); + a.close(); + } + + public void testSampleSentence() throws Exception { + Analyzer a = new UkrainianMorfologikAnalyzer(); + assertAnalyzesTo(a, "Це — проект генерування словника з тегами частин мови для української мови.", + new String[] { "проект", "генерування", "словник", "тег", "частина", "мова", "українська", "український", "мова" }); a.close(); }