lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dwe...@apache.org
Subject svn commit: r1354840 - in /lucene/dev/trunk: dev-tools/eclipse/ dev-tools/maven/ lucene/ lucene/analysis/morfologik/ lucene/analysis/morfologik/lib/ lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/ lucene/analysis/morfologik/s...
Date Thu, 28 Jun 2012 07:35:40 GMT
Author: dweiss
Date: Thu Jun 28 07:35:36 2012
New Revision: 1354840

URL: http://svn.apache.org/viewvc?rev=1354840&view=rev
Log:
LUCENE-4138: Update morfologik (polish stemming) to release 1.5.3. Changed the way morphosyntactic
tags are exposed (a list of tags for a single lemma instead of a compound tag).

Added:
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt
      - copied, changed from r1354828, lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java
      - copied, changed from r1354828, lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java
Removed:
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar.sha1
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar.sha1
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar.sha1
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttributeImpl.java
Modified:
    lucene/dev/trunk/dev-tools/eclipse/dot.classpath
    lucene/dev/trunk/dev-tools/maven/pom.xml.template
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/morfologik/build.xml
    lucene/dev/trunk/lucene/analysis/morfologik/ivy.xml
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
    lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
    lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
    lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
    lucene/dev/trunk/lucene/module-build.xml
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java

Modified: lucene/dev/trunk/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/eclipse/dot.classpath?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/trunk/dev-tools/eclipse/dot.classpath Thu Jun 28 07:35:36 2012
@@ -97,9 +97,9 @@
 	<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
 	<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
 	<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
-	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
-	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
-	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
+	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
+	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
+	<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar"/>
 	<classpathentry kind="lib" path="lucene/benchmark/lib/commons-compress-1.2.jar"/>
 	<classpathentry kind="lib" path="lucene/benchmark/lib/xercesImpl-2.9.1.jar"/>
 	<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>

Modified: lucene/dev/trunk/dev-tools/maven/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/maven/pom.xml.template?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/dev-tools/maven/pom.xml.template (original)
+++ lucene/dev/trunk/dev-tools/maven/pom.xml.template Thu Jun 28 07:35:36 2012
@@ -303,7 +303,7 @@
       <dependency>
         <groupId>org.carrot2</groupId>
         <artifactId>morfologik-polish</artifactId>
-        <version>1.5.2</version>
+        <version>1.5.3</version>
       </dependency>
       <dependency>
         <groupId>org.codehaus.woodstox</groupId>

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Jun 28 07:35:36 2012
@@ -7,6 +7,16 @@ http://s.apache.org/luceneversions
 ======================= Lucene 5.0.0 =======================
 
 
+======================= Lucene 4.0.0-BETA =======================
+
+API Changes
+
+* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
+  The tag attribute class has been renamed to MorphosyntacticTagsAttribute and
+  has a different API (carries a list of tags instead of a compound tag). Upgrade
+  of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)
+
+
 ======================= Lucene 4.0.0-ALPHA =======================
 
 More information about this release, including any errata related to the 

Modified: lucene/dev/trunk/lucene/analysis/morfologik/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/build.xml?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/build.xml (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/build.xml Thu Jun 28 07:35:36 2012
@@ -27,9 +27,9 @@
 
   <path id="classpath">
     <pathelement path="${analyzers-common.jar}"/>
-    <pathelement path="lib/morfologik-fsa-1.5.2.jar"/>
-    <pathelement path="lib/morfologik-polish-1.5.2.jar"/>
-    <pathelement path="lib/morfologik-stemming-1.5.2.jar"/>
+    <pathelement path="lib/morfologik-fsa-1.5.3.jar"/>
+    <pathelement path="lib/morfologik-polish-1.5.3.jar"/>
+    <pathelement path="lib/morfologik-stemming-1.5.3.jar"/>
     <path refid="base.classpath"/>
   </path>
 

Modified: lucene/dev/trunk/lucene/analysis/morfologik/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/ivy.xml?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/ivy.xml (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/ivy.xml Thu Jun 28 07:35:36 2012
@@ -19,9 +19,9 @@
 <ivy-module version="2.0">
     <info organisation="org.apache.lucene" module="analyzers-morfologik"/>
     <dependencies>
-      <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.2" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.2" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.2" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>
       <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
     </dependencies>
 </ivy-module>

Added: lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1?rev=1354840&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1 (added)
+++ lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar.sha1 Thu Jun 28 07:35:36 2012
@@ -0,0 +1 @@
+d1f729cd3019e6d86485226202f84458141a5688

Modified: lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-fsa-LICENSE-BSD.txt Thu Jun 28 07:35:36 2012
@@ -1,6 +1,6 @@
 
 Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification, 

Added: lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1?rev=1354840&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1 (added)
+++ lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar.sha1 Thu Jun 28 07:35:36 2012
@@ -0,0 +1 @@
+8217b6f7ad018ceda0e824b2e60340000da4397a

Copied: lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt (from r1354828, lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt?p2=lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt&p1=lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt&r1=1354828&r2=1354840&rev=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-COMPOUND.txt (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-LICENSE-BSD.txt Thu Jun 28 07:35:36 2012
@@ -1,13 +1,33 @@
-morfologik-polish, TERMS OF LICENCE
+BSD-licensed dictionary of Polish (Morfologik)
 
-This JAR contains and makes use of data from Polish ispell/myspell 
-dictionaries hosted at http://www.sjp.pl/slownik/en/ and is 
-licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
+Copyright (c) 2012, Marcin Miłkowski
+All rights reserved.
 
-Part-of-speech tags were added in Morfologik project and are not found 
-in the data from sjp.pl. 
+Redistribution and  use in  source and binary  forms, with  or without
+modification, are permitted provided that the following conditions are
+met:
 
------
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
+OR  IMPLIED WARRANTIES,  INCLUDING, BUT  NOT LIMITED  TO,  THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.  IN NO EVENT  SHALL COPYRIGHT  HOLDERS OR  CONTRIBUTORS BE
+LIABLE FOR  ANY DIRECT,  INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES  (INCLUDING, BUT NOT LIMITED  TO, PROCUREMENT OF
+SUBSTITUTE  GOODS OR  SERVICES;  LOSS  OF USE,  DATA,  OR PROFITS;  OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
+WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--
 
 BSD-licensed dictionary of Polish (SGJP)
 http://sgjp.pl/morfeusz/
@@ -39,4 +59,4 @@ SUBSTITUTE  GOODS OR  SERVICES;  LOSS  O
 BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
 WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file

Modified: lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-polish-NOTICE.txt Thu Jun 28 07:35:36 2012
@@ -1,9 +1,6 @@
-This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
-(http://morfologik.blogspot.com/).
 
-This JAR contains and makes use of data from Polish ispell/myspell 
-dictionaries hosted at http://www.sjp.pl/slownik/en/ and is 
-licenced on the terms of (inter alia): GPL, LGPL, MPL or CC-SA licenses.
+This product includes data from BSD-licensed dictionary of Polish (Morfologik)
+(http://morfologik.blogspot.com/)
 
 This product includes data from BSD-licensed dictionary of Polish (SGJP)
 (http://sgjp.pl/morfeusz/)

Added: lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1?rev=1354840&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1 (added)
+++ lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-1.5.3.jar.sha1 Thu Jun 28 07:35:36 2012
@@ -0,0 +1 @@
+c4ead57b78fa71b00553ff21da6fb5a326e914e8

Modified: lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/lib/morfologik-stemming-LICENSE-BSD.txt Thu Jun 28 07:35:36 2012
@@ -1,6 +1,6 @@
 
 Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification, 

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java Thu Jun 28 07:35:36 2012
@@ -19,8 +19,7 @@ package org.apache.lucene.analysis.morfo
  */
 
 import java.io.IOException;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
 
 import morfologik.stemming.*;
 import morfologik.stemming.PolishStemmer.DICTIONARY;
@@ -30,13 +29,12 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.util.*;
 
 /**
  * {@link TokenFilter} using Morfologik library.
  *
- * MorfologikFilter contains a {@link MorphosyntacticTagAttribute}, which provides morphosyntactic
+ * MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
  * annotations for produced lemmas. See the Morfologik documentation for details.
  * 
  * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
@@ -44,7 +42,7 @@ import org.apache.lucene.util.Version;
 public class MorfologikFilter extends TokenFilter {
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final MorphosyntacticTagAttribute tagAtt = addAttribute(MorphosyntacticTagAttribute.class);
+  private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   private final CharsRef scratch = new CharsRef(0);
@@ -55,6 +53,8 @@ public class MorfologikFilter extends To
   private final IStemmer stemmer;
   
   private List<WordData> lemmaList;
+  private final ArrayList<StringBuilder> tagsList = new ArrayList<StringBuilder>();
+
   private int lemmaListIndex;
 
   /**
@@ -73,9 +73,43 @@ public class MorfologikFilter extends To
   }
 
   private void popNextLemma() {
-    final WordData lemma = lemmaList.get(lemmaListIndex++);
-    termAtt.setEmpty().append(lemma.getStem());
-    tagAtt.setTag(lemma.getTag());
+    // Collect all tags for the next unique lemma.
+    CharSequence currentStem;
+    int tags = 0;
+    do {
+      final WordData lemma = lemmaList.get(lemmaListIndex++);
+      currentStem = lemma.getStem();
+      final CharSequence tag = lemma.getTag();
+      if (tag != null) {
+        if (tagsList.size() <= tags) {
+          tagsList.add(new StringBuilder());
+        }
+
+        final StringBuilder buffer = tagsList.get(tags++);  
+        buffer.setLength(0);
+        buffer.append(lemma.getTag());
+      }
+    } while (lemmaListIndex < lemmaList.size() &&
+             equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
+
+    // Set the lemma's base form and tags as attributes.
+    termAtt.setEmpty().append(currentStem);
+    tagsAtt.setTags(tagsList.subList(0, tags));
+  }
+
+  /**
+   * Compare two char sequences for equality. Assumes non-null arguments. 
+   */
+  private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
+    int len1 = s1.length();
+    int len2 = s2.length();
+    if (len1 != len2) return false;
+    for (int i = len1; --i >= 0;) {
+      if (s1.charAt(i) != s2.charAt(i)) { 
+        return false; 
+      }
+    }
+    return true;
   }
 
   /**
@@ -101,7 +135,7 @@ public class MorfologikFilter extends To
         current = captureState();
         popNextLemma();
       } else {
-        tagAtt.clear();
+        tagsAtt.clear();
       }
       return true;
     } else {
@@ -130,6 +164,7 @@ public class MorfologikFilter extends To
   public void reset() throws IOException {
     lemmaListIndex = 0;
     lemmaList = Collections.emptyList();
+    tagsList.clear();
     super.reset();
   }
 }

Copied: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java (from r1354828, lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java?p2=lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java&p1=lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java&r1=1354828&r2=1354840&rev=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagAttribute.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttribute.java Thu Jun 28 07:35:36 2012
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.morfo
  * limitations under the License.
  */
 
+import java.util.List;
+
 import org.apache.lucene.util.Attribute;
 
 /** 
@@ -25,15 +27,18 @@ import org.apache.lucene.util.Attribute;
  * surface forms. For the exact format and description of these,
  * see the project's documentation (annotations vary by dictionary!).
  */
-public interface MorphosyntacticTagAttribute extends Attribute {
+public interface MorphosyntacticTagsAttribute extends Attribute {
   /** 
    * Set the POS tag. The default value (no-value) is null.
-   * @param pos POS tag corresponding to current lemma
+   * 
+   * @param tags A list of POS tags corresponding to current lemma.
    */
-  public void setTag(CharSequence pos);
+  public void setTags(List<StringBuilder> tags);
 
-  /** Returns the POS tag of the term. */
-  public CharSequence getTag();
+  /** 
+   * Returns the POS tag of the term.
+   */
+  public List<StringBuilder> getTags();
 
   /** Clear to default value. */
   public void clear();

Added: lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java?rev=1354840&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java (added)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorphosyntacticTagsAttributeImpl.java Thu Jun 28 07:35:36 2012
@@ -0,0 +1,96 @@
+// -*- c-basic-offset: 2 -*-
+package org.apache.lucene.analysis.morfologik;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * Morphosyntactic annotations for surface forms.
+ * @see MorphosyntacticTagsAttribute
+ */
+public class MorphosyntacticTagsAttributeImpl extends AttributeImpl 
+  implements MorphosyntacticTagsAttribute, Cloneable {
+  
+  /**
+   * A list of potential tag variants for the current token.
+   */
+  private List<StringBuilder> tags;
+
+  /**
+   * Returns the POS tag of the term. If you need a copy of this char sequence, copy
+   * its contents (and clone {@link StringBuilder}s) because it changes with 
+   * each new term to avoid unnecessary memory allocations.
+   */
+  @Override
+  public List<StringBuilder> getTags() {
+    return tags;
+  }
+
+  public void clear() {
+    tags = null;
+  }
+
+  public boolean equals(Object other) {
+    if (other instanceof MorphosyntacticTagsAttribute) {
+      return equal(this.getTags(), ((MorphosyntacticTagsAttribute) other).getTags());
+    }
+    return false;
+  }
+
+  private boolean equal(Object l1, Object l2) {
+    return l1 == null ? (l2 == null) : (l1.equals(l2));
+  }
+
+  public int hashCode() {
+    return this.tags == null ? 0 : tags.hashCode();
+  }
+
+  /**
+   * Sets the internal tags reference to the given list. The contents
+   * is not copied. 
+   */
+  @Override
+  public void setTags(List<StringBuilder> tags) {
+    this.tags = tags;
+  }
+
+  public void copyTo(AttributeImpl target) {
+    List<StringBuilder> cloned = null;
+    if (tags != null) {
+      cloned = new ArrayList<StringBuilder>(tags.size());
+      for (StringBuilder b : tags) {
+        cloned.add(new StringBuilder(b));
+      }
+    }
+    ((MorphosyntacticTagsAttribute) target).setTags(cloned);
+  }
+
+  public MorphosyntacticTagsAttributeImpl clone() {
+    MorphosyntacticTagsAttributeImpl cloned = new MorphosyntacticTagsAttributeImpl();
+    this.copyTo(cloned);
+    return cloned;
+  }
+  
+  @Override
+  public String toString() {
+    return tags == null ? "<no tags>" : tags.toString();
+  }
+}

Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Thu Jun 28 07:35:36 2012
@@ -1,4 +1,3 @@
-// -*- c-basic-offset: 2 -*-
 package org.apache.lucene.analysis.morfologik;
 
 /*
@@ -20,10 +19,9 @@ package org.apache.lucene.analysis.morfo
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.TreeSet;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
@@ -39,8 +37,8 @@ public class TestMorfologikAnalyzer exte
   public final void testSingleTokens() throws IOException {
     Analyzer a = getTestAnalyzer();
     assertAnalyzesToReuse(a, "a", new String[] { "a" });
-    assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
-    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+    assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
     assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
   }
 
@@ -50,10 +48,10 @@ public class TestMorfologikAnalyzer exte
     assertAnalyzesToReuse(
       a,
       "liście danych",
-      new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
-      new int[] { 0, 0, 0, 7, 7, 7 },
-      new int[] { 6, 6, 6, 13, 13, 13 },
-      new int[] { 1, 0, 0, 1, 0, 0 });
+      new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
+      new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
+      new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
+      new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
   }
 
   /** Test reuse of MorfologikFilter with leftover stems. */
@@ -63,7 +61,7 @@ public class TestMorfologikAnalyzer exte
     CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
     ts_1.reset();
     ts_1.incrementToken();
-    assertEquals("first stream", "liść", termAtt_1.toString());
+    assertEquals("first stream", "liście", termAtt_1.toString());
 
     TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
     CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
@@ -76,33 +74,61 @@ public class TestMorfologikAnalyzer exte
   public final void testCase() throws IOException {
     Analyzer a = getTestAnalyzer();
 
-    assertAnalyzesToReuse(a, "AGD",      new String[] { "artykuły gospodarstwa domowego" });
+    assertAnalyzesToReuse(a, "AGD",      new String[] { "AGD", "artykuły gospodarstwa domowego" });
     assertAnalyzesToReuse(a, "agd",      new String[] { "artykuły gospodarstwa domowego" });
 
     assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
-    assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+    assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });
 
     assertAnalyzesToReuse(a, "Aarona",   new String[] { "Aaron" });
     assertAnalyzesToReuse(a, "aarona",   new String[] { "aarona" });
 
-    assertAnalyzesToReuse(a, "Liście",   new String[] { "liść", "list", "lista" });
+    assertAnalyzesToReuse(a, "Liście",   new String[] { "liście", "liść", "list", "lista" });
   }
 
-  private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+  private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
     ts.incrementToken();
     assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
-    assertEquals(pos,  ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+    
+    TreeSet<String> actual = new TreeSet<String>();
+    TreeSet<String> expected = new TreeSet<String>();
+    for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
+      actual.add(b.toString());
+    }
+    for (String s : tags) {
+      expected.add(s);
+    }
+    
+    if (!expected.equals(actual)) {
+      System.out.println("Expected:\n" + expected);
+      System.out.println("Actual:\n" + actual);
+      assertEquals(expected, actual);
+    }
   }
 
   /** Test morphosyntactic annotations. */
   public final void testPOSAttribute() throws IOException {
     TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
 
-    assertPOSToken(ts, "liść",  "subst:pl:acc.nom.voc:m3");
-    assertPOSToken(ts, "list",  "subst:sg:loc.voc:m3");
-    assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+    assertPOSToken(ts, "liście",  
+        "subst:sg:acc:n2",
+        "subst:sg:nom:n2",
+        "subst:sg:voc:n2");
+
+    assertPOSToken(ts, "liść",  
+        "subst:pl:acc:m3",
+        "subst:pl:nom:m3",
+        "subst:pl:voc:m3");
+
+    assertPOSToken(ts, "list",  
+        "subst:sg:loc:m3",
+        "subst:sg:voc:m3");
+
+    assertPOSToken(ts, "lista", 
+        "subst:sg:dat:f",
+        "subst:sg:loc:f");
   }
-  
+
   /** blast some random strings through the analyzer */
   public void testRandom() throws Exception {
     checkRandomData(random(), getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER); 

Modified: lucene/dev/trunk/lucene/module-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/module-build.xml?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/module-build.xml (original)
+++ lucene/dev/trunk/lucene/module-build.xml Thu Jun 28 07:35:36 2012
@@ -312,9 +312,7 @@
   <property name="analyzers-morfologik.jar" value="${common.dir}/build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar"/>
   <fileset id="analyzers-morfologik.fileset" dir="${common.dir}">
     <include name="build/analysis/morfologik/lucene-analyzers-morfologik-${version}.jar" />
-    <include name="analysis/morfologik/lib/morfologik-fsa-1.5.2.jar" />
-    <include name="analysis/morfologik/lib/morfologik-polish-1.5.2.jar" />
-    <include name="analysis/morfologik/lib/morfologik-stemming-1.5.2.jar" />
+    <include name="analysis/morfologik/lib/morfologik-*.jar" />
   </fileset>
   <target name="check-analyzers-morfologik-uptodate" unless="analyzers-morfologik.uptodate">
     <module-uptodate name="analysis/morfologik" jarfile="${analyzers-morfologik.jar}" property="analyzers-morfologik.uptodate"/>

Modified: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java?rev=1354840&r1=1354839&r2=1354840&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java Thu Jun 28 07:35:36 2012
@@ -7,7 +7,6 @@ import java.util.Map;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.solr.schema.IndexSchema;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more



Mime
View raw message