lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1573569 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/hunspell/ analysis/common/src/test/org/apache/lucene/analysis/hunspell/
Date Mon, 03 Mar 2014 14:20:25 GMT
Author: rmuir
Date: Mon Mar  3 14:20:24 2014
New Revision: 1573569

URL: http://svn.apache.org/r1573569
Log:
LUCENE-5485: hunspell circumfix support

Added:
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java
  (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1573569&r1=1573568&r2=1573569&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Mar  3 14:20:24 2014
@@ -87,6 +87,8 @@ New Features
 * LUCENE-5479: FacetsConfig subclass can now customize the default
   per-dim facets configuration.  (Rob Audenaerde via Mike McCandless)
 
+* LUCENE-5485: Add circumfix support to HunspellStemFilter. (Robert Muir)
+
 API Changes
 
 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1573569&r1=1573568&r2=1573569&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Mon Mar  3 14:20:24 2014
@@ -66,6 +66,7 @@ public class Dictionary {
   private static final String SUFFIX_KEY = "SFX";
   private static final String FLAG_KEY = "FLAG";
   private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
+  private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
 
   private static final String NUM_FLAG_TYPE = "num";
   private static final String UTF8_FLAG_TYPE = "UTF-8";
@@ -107,6 +108,8 @@ public class Dictionary {
   boolean ignoreCase;
   boolean complexPrefixes;
   
+  int circumfix = -1; // circumfix flag, or -1 if one is not defined
+  
   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
    * and dictionary files.
@@ -240,6 +243,12 @@ public class Dictionary {
         flagParsingStrategy = getFlagParsingStrategy(line);
       } else if (line.equals(COMPLEXPREFIXES_KEY)) {
         complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
+      } else if (line.startsWith(CIRCUMFIX_KEY)) {
+        String parts[] = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
+        }
+        circumfix = flagParsingStrategy.parseFlag(parts[1]);
       }
     }
     

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1573569&r1=1573568&r2=1573569&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Mon Mar  3 14:20:24 2014
@@ -81,7 +81,7 @@ final class Stemmer {
         stems.add(new CharsRef(word, 0, length));
       }
     }
-    stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false));
+    stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
     return stems;
   }
   
@@ -122,9 +122,11 @@ final class Stemmer {
    * @param previousWasPrefix true if the previous removal was a prefix:
    *        if we are removing a suffix, and it has no continuation requirements, its ok.
    *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. 
+   * @param circumfix true if the previous prefix removal was signed as a circumfix
+   *        this means inner most suffix must also contain circumfix flag.
    * @return List of stems, or empty list if no stems are found
    */
-  private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix) {
+  private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) {
     
     // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<CharsRef>();
@@ -171,7 +173,7 @@ final class Stemmer {
                 .append(word, deAffixedStart, deAffixedLength)
                 .toString();
             
-            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true);
+            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
             
             stems.addAll(stemList);
           }
@@ -219,7 +221,7 @@ final class Stemmer {
             dictionary.stripLookup.get(stripOrd, scratch);
             String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
             
-            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false);
+            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
             
             stems.addAll(stemList);
           }
@@ -242,7 +244,7 @@ final class Stemmer {
    * @param prefix true if we are removing a prefix (false if its a suffix)
    * @return List of stems for the word, or an empty list if none are found
    */
-  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix) {
+  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     
@@ -279,10 +281,28 @@ final class Stemmer {
               continue;
             }
           }
+          
+          // if circumfix was previously set by a prefix, we must check this suffix,
+          // to ensure it has it, and vice versa
+          if (dictionary.circumfix != -1) {
+            dictionary.flagLookup.get(append, scratch);
+            char appendFlags[] = Dictionary.decodeFlags(scratch);
+            boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
+            if (circumfix != suffixCircumfix) {
+              continue;
+            }
+          }
           stems.add(new CharsRef(strippedWord, 0, length));
         }
       }
     }
+    
+    // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
+    if (dictionary.circumfix != -1 && !circumfix && prefix) {
+      dictionary.flagLookup.get(append, scratch);
+      char appendFlags[] = Dictionary.decodeFlags(scratch);
+      circumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
+    }
 
     if (crossProduct) {
       if (recursionDepth == 0) {
@@ -290,20 +310,20 @@ final class Stemmer {
           // we took away the first prefix.
           // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix 
           // COMPLEXPREFIXES = false: combine with another suffix
-          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true));
+          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
         } else if (!dictionary.complexPrefixes) {
           // we took away a suffix.
           // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
           // COMPLEXPREFIXES = false: combine with another suffix
-          stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
+          stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
         }
       } else if (recursionDepth == 1) {
         if (prefix && dictionary.complexPrefixes) {
           // we took away the second prefix: go look for another suffix
-          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true));
+          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
         } else if (prefix == false && dictionary.complexPrefixes == false) {
           // we took away a prefix, then a suffix: go look for another suffix
-          stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
+          stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
         }
       }
     }

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java?rev=1573569&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java Mon Mar  3 14:20:24 2014
@@ -0,0 +1,38 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.junit.BeforeClass;
+
+public class TestCircumfix extends StemmerTestBase {
+  
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    init("circumfix.aff", "circumfix.dic");
+  }
+  
+  public void testCircumfix() {
+    assertStemsTo("nagy", "nagy");
+    assertStemsTo("nagyobb", "nagy");
+    assertStemsTo("legnagyobb", "nagy");
+    assertStemsTo("legeslegnagyobb", "nagy");
+    assertStemsTo("nagyobbobb");
+    assertStemsTo("legnagy");
+    assertStemsTo("legeslegnagy");
+  }
+}

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff?rev=1573569&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff Mon Mar  3 14:20:24 2014
@@ -0,0 +1,14 @@
+SET UTF-8
+
+CIRCUMFIX X
+
+PFX A Y 1
+PFX A 0 leg/X .
+
+PFX B Y 1
+PFX B 0 legesleg/X .
+
+SFX C Y 3
+SFX C 0 obb . +COMPARATIVE
+SFX C 0 obb/AX . +SUPERLATIVE
+SFX C 0 obb/BX . +SUPERSUPERLATIVE

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic?rev=1573569&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic Mon Mar  3 14:20:24 2014
@@ -0,0 +1,2 @@
+1
+nagy/C    [MN]



Mime
View raw message