Return-Path: X-Original-To: apmail-lucene-commits-archive@www.apache.org Delivered-To: apmail-lucene-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 4654C10EC5 for ; Mon, 3 Mar 2014 14:20:57 +0000 (UTC) Received: (qmail 177 invoked by uid 500); 3 Mar 2014 14:20:52 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 99326 invoked by uid 99); 3 Mar 2014 14:20:51 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 03 Mar 2014 14:20:51 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 03 Mar 2014 14:20:47 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 7F06F23889CB; Mon, 3 Mar 2014 14:20:25 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1573569 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/hunspell/ analysis/common/src/test/org/apache/lucene/analysis/hunspell/ Date: Mon, 03 Mar 2014 14:20:25 -0000 To: commits@lucene.apache.org From: rmuir@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140303142025.7F06F23889CB@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: rmuir Date: Mon Mar 3 14:20:24 2014 New Revision: 1573569 URL: http://svn.apache.org/r1573569 Log: LUCENE-5485: hunspell circumfix support Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java (with props) lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic Modified: lucene/dev/trunk/lucene/CHANGES.txt lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Modified: lucene/dev/trunk/lucene/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1573569&r1=1573568&r2=1573569&view=diff ============================================================================== --- lucene/dev/trunk/lucene/CHANGES.txt (original) +++ lucene/dev/trunk/lucene/CHANGES.txt Mon Mar 3 14:20:24 2014 @@ -87,6 +87,8 @@ New Features * LUCENE-5479: FacetsConfig subclass can now customize the default per-dim facets configuration. (Rob Audenaerde via Mike McCandless) +* LUCENE-5485: Add circumfix support to HunspellStemFilter. (Robert Muir) + API Changes * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1573569&r1=1573568&r2=1573569&view=diff ============================================================================== --- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original) +++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Mon Mar 3 14:20:24 2014 @@ -66,6 +66,7 @@ public class Dictionary { private static final String SUFFIX_KEY = "SFX"; private static final String FLAG_KEY = "FLAG"; private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES"; + private static final String CIRCUMFIX_KEY = "CIRCUMFIX"; private static final String NUM_FLAG_TYPE = "num"; private static final String UTF8_FLAG_TYPE = "UTF-8"; @@ -107,6 +108,8 @@ public class Dictionary { boolean ignoreCase; boolean complexPrefixes; + int circumfix = -1; // circumfix flag, or -1 if one is not defined + /** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files. @@ -240,6 +243,12 @@ public class Dictionary { flagParsingStrategy = getFlagParsingStrategy(line); } else if (line.equals(COMPLEXPREFIXES_KEY)) { complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix + } else if (line.startsWith(CIRCUMFIX_KEY)) { + String parts[] = line.split("\\s+"); + if (parts.length != 2) { + throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber()); + } + circumfix = flagParsingStrategy.parseFlag(parts[1]); } } Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1573569&r1=1573568&r2=1573569&view=diff ============================================================================== --- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original) +++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Mon Mar 3 14:20:24 2014 @@ -81,7 +81,7 @@ final class Stemmer { stems.add(new CharsRef(word, 0, length)); } } - stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false)); + stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false)); return stems; } @@ -122,9 +122,11 @@ final class Stemmer { * @param previousWasPrefix true if the previous removal was a prefix: * if we are removing a suffix, and it has no continuation requirements, its ok. * but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. + * @param circumfix true if the previous prefix removal was signed as a circumfix + * this means inner most suffix must also contain circumfix flag. * @return List of stems, or empty list if no stems are found */ - private List stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix) { + private List stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) { // TODO: allow this stuff to be reused by tokenfilter List stems = new ArrayList(); @@ -171,7 +173,7 @@ final class Stemmer { .append(word, deAffixedStart, deAffixedLength) .toString(); - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true); + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix); stems.addAll(stemList); } @@ -219,7 +221,7 @@ final class Stemmer { dictionary.stripLookup.get(stripOrd, scratch); String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString(); - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false); + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix); stems.addAll(stemList); } @@ -242,7 +244,7 @@ final class Stemmer { * @param prefix true if we are removing a prefix (false if its a suffix) * @return List of stems for the word, or an empty list if none are found */ - List applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix) { + List applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) { segment.setLength(0); segment.append(strippedWord, 0, length); @@ -279,10 +281,28 @@ final class Stemmer { continue; } } + + // if circumfix was previously set by a prefix, we must check this suffix, + // to ensure it has it, and vice versa + if (dictionary.circumfix != -1) { + dictionary.flagLookup.get(append, scratch); + char appendFlags[] = Dictionary.decodeFlags(scratch); + boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix); + if (circumfix != suffixCircumfix) { + continue; + } + } stems.add(new CharsRef(strippedWord, 0, length)); } } } + + // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag + if (dictionary.circumfix != -1 && !circumfix && prefix) { + dictionary.flagLookup.get(append, scratch); + char appendFlags[] = Dictionary.decodeFlags(scratch); + circumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix); + } if (crossProduct) { if (recursionDepth == 0) { @@ -290,20 +310,20 @@ final class Stemmer { // we took away the first prefix. // COMPLEXPREFIXES = true: combine with a second prefix and another suffix // COMPLEXPREFIXES = false: combine with another suffix - stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true)); + stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix)); } else if (!dictionary.complexPrefixes) { // we took away a suffix. // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed // COMPLEXPREFIXES = false: combine with another suffix - stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false)); + stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix)); } } else if (recursionDepth == 1) { if (prefix && dictionary.complexPrefixes) { // we took away the second prefix: go look for another suffix - stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true)); + stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix)); } else if (prefix == false && dictionary.complexPrefixes == false) { // we took away a prefix, then a suffix: go look for another suffix - stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false)); + stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix)); } } } Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java?rev=1573569&view=auto ============================================================================== --- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java (added) +++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCircumfix.java Mon Mar 3 14:20:24 2014 @@ -0,0 +1,38 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.junit.BeforeClass; + +public class TestCircumfix extends StemmerTestBase { + + @BeforeClass + public static void beforeClass() throws Exception { + init("circumfix.aff", "circumfix.dic"); + } + + public void testCircumfix() { + assertStemsTo("nagy", "nagy"); + assertStemsTo("nagyobb", "nagy"); + assertStemsTo("legnagyobb", "nagy"); + assertStemsTo("legeslegnagyobb", "nagy"); + assertStemsTo("nagyobbobb"); + assertStemsTo("legnagy"); + assertStemsTo("legeslegnagy"); + } +} Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff?rev=1573569&view=auto ============================================================================== --- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff (added) +++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.aff Mon Mar 3 14:20:24 2014 @@ -0,0 +1,14 @@ +SET UTF-8 + +CIRCUMFIX X + +PFX A Y 1 +PFX A 0 leg/X . + +PFX B Y 1 +PFX B 0 legesleg/X . + +SFX C Y 3 +SFX C 0 obb . +COMPARATIVE +SFX C 0 obb/AX . +SUPERLATIVE +SFX C 0 obb/BX . +SUPERSUPERLATIVE Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic?rev=1573569&view=auto ============================================================================== --- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic (added) +++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/circumfix.dic Mon Mar 3 14:20:24 2014 @@ -0,0 +1,2 @@ +1 +nagy/C [MN]