lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jan...@apache.org
Subject svn commit: r1302833 [1/3] - in /lucene/dev/trunk: modules/analysis/ modules/analysis/common/src/java/org/apache/lucene/analysis/no/ modules/analysis/common/src/test/org/apache/lucene/analysis/no/ solr/ solr/core/src/java/org/apache/solr/analysis/ solr...
Date Tue, 20 Mar 2012 10:57:51 GMT
Author: janhoy
Date: Tue Mar 20 10:57:50 2012
New Revision: 1302833

URL: http://svn.apache.org/viewvc?rev=1302833&view=rev
Log:
SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer

Added:
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java
  (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java
  (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java
  (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java
  (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
  (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
  (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt
  (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt
  (with props)
    lucene/dev/trunk/solr/CHANGES.txt.orig
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/NorwegianLightStemFilterFactory.java
  (with props)
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/NorwegianMinimalStemFilterFactory.java
  (with props)
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestNorwegianLightStemFilterFactory.java
  (with props)
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestNorwegianMinimalStemFilterFactory.java
  (with props)
Modified:
    lucene/dev/trunk/modules/analysis/CHANGES.txt
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/example/solr/conf/schema.xml

Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1302833&r1=1302832&r2=1302833&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Tue Mar 20 10:57:50 2012
@@ -106,3 +106,6 @@ New Features
    All analyzers in contrib/analyzers and contrib/icu were moved to the
    analysis module.  The 'smartcn' and 'stempel' components now depend on 'common'.
    (Chris Male, Robert Muir)
+
+ * SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)
+   
\ No newline at end of file

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link NorwegianLightStemmer} to stem Norwegian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class NorwegianLightStemFilter extends TokenFilter {
+  private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public NorwegianLightStemFilter(TokenStream input) {
+    super(input);
+  }
+  
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) {
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen);
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,119 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ * 
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials 
+ * provided with the distribution. Neither the name of the author nor the names 
+ * of its contributors may be used to endorse or promote products derived from 
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Norwegian.
+ * <p>
+ * Parts of this stemmer are adapted from SwedishLightStemFilter, except
+ * that while the Swedish one has a pre-defined rule set and a corresponding
+ * corpus to validate against, the Norwegian one is hand crafted.
+ */
+public class NorwegianLightStemmer {
+  
+  public int stem(char s[], int len) {   
+    // Remove possessive -s (bilens -> bilen) and continue checking
+    if (len > 4 && s[len-1] == 's')
+      len--;
+
+    // Remove common endings, single-pass
+    if (len > 7 && 
+        (endsWith(s, len, "heter") ||  // general ending (hemmelig-heter -> hemmelig)
+         endsWith(s, len, "heten")))   // general ending (hemmelig-heten -> hemmelig)
+      return len - 5;
+
+    if (len > 5 &&
+        (endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
+         endsWith(s, len, "het")))  // general ending (hemmelig-het -> hemmelig)
+      return len - 3;
+    
+    if (len > 7 && 
+        (endsWith(s, len, "elser") ||   // general ending (føl-elser -> føl)
+         endsWith(s, len, "elsen")))    // general ending (føl-elsen -> føl)
+      return len - 5;
+    
+    if (len > 6 &&
+        (endsWith(s, len, "ende") ||  // (sov-ende -> sov)
+         endsWith(s, len, "else") ||  // general ending (føl-else -> føl)
+         endsWith(s, len, "este") ||  // adj (fin-este -> fin)
+         endsWith(s, len, "eren")))   // masc
+      return len - 4;
+    
+    if (len > 5 &&
+        (endsWith(s, len, "ere") || // adj (fin-ere -> fin)
+         endsWith(s, len, "est") || // adj (fin-est -> fin)
+         endsWith(s, len, "ene")    // masc/fem/neutr pl definite (hus-ene)
+         )) 
+      return len - 3;
+    
+    if (len > 4 &&
+        (endsWith(s, len, "er") ||  // masc/fem indefinite
+         endsWith(s, len, "en") ||  // masc/fem definite
+         endsWith(s, len, "et") ||  // neutr definite
+         endsWith(s, len, "st") ||  // adj (billig-st -> billig)
+         endsWith(s, len, "te")))
+      return len - 2;
+    
+    if (len > 3)
+      switch(s[len-1]) {
+        case 'a':     // fem definite
+        case 'e':     // to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
+        case 'n': 
+          return len - 1;
+      }
+    
+    return len;
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link NorwegianMinimalStemmer} to stem Norwegian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class NorwegianMinimalStemFilter extends TokenFilter {
+  private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public NorwegianMinimalStemFilter(TokenStream input) {
+    super(input);
+  }
+  
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) {
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen);
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,90 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ * 
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials 
+ * provided with the distribution. Neither the name of the author nor the names 
+ * of its contributors may be used to endorse or promote products derived from 
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Minimal Stemmer for Norwegian bokmål (no-nb)
+ * <p>
+ * Stems known plural forms for Norwegian nouns only, together with genitive -s
+ */
+public class NorwegianMinimalStemmer {
+  
+  public int stem(char s[], int len) {       
+    // Remove genitive -s
+    if (len > 4 && s[len-1] == 's')
+      len--;
+    
+    if (len > 5 &&
+         endsWith(s, len, "ene")    // masc/fem/neutr pl definite (hus-ene)
+        )
+      return len - 3;
+    
+    if (len > 4 &&
+        (endsWith(s, len, "er") ||  // masc/fem indefinite
+         endsWith(s, len, "en") ||  // masc/fem definite
+         endsWith(s, len, "et")     // neutr definite
+        ))
+      return len - 2;
+    
+    if (len > 3)
+      switch(s[len-1]) {
+        case 'a':     // fem definite
+        case 'e':     // to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
+          return len - 1;
+      }
+    
+    return len;
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import static org.apache.lucene.analysis.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link NorwegianLightStemFilter}
+ */
+public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new NorwegianLightStemFilter(source));
+    }
+  };
+  
+  /** Test against a vocabulary file */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import static org.apache.lucene.analysis.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link NorwegianMinimalStemFilter}
+ */
+public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
+    }
+  };
+  
+  /** Test against a vocabulary file */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,144 @@
+#
+# Tests for Norwegian Bokmål light stemmer
+# It should tackle nouns, adjectives, genitive and some general endings
+#
+# Nouns masculine
+bil	bil
+bilen	bil
+biler	bil
+bilene	bil
+bilens	bil
+bilenes	bil
+sekretæren	sekretær
+sekretær	sekretær
+sekretærene	sekretær
+kaker	kak
+kaken	kak
+kakene	kak
+kakenes	kak
+bibliotekar	bibliotekar
+bibliotekarer	bibliotekar
+bibliotekaren	bibliotekar
+bibliotekarens	bibliotekar
+bibliotekarene	bibliotekar
+bibliotekarenes	bibliotekar
+# Nouns feminine
+veske	vesk
+veska	vesk
+vesken	vesk
+veskene	vesk
+veskas	vesk
+# Nouns neutral
+huset	hus
+husene	hus
+husets	hus
+hus	hus
+huset	hus
+husene	hus
+husenes	hus
+flagg	flagg
+flagga	flagg
+flaggene	flagg
+flaggets	flagg
+flaggenes	flagg
+politi	politi
+politiet	politi
+politiets	politi
+politienes	politi
+# General endings
+god	god
+godhet	god
+godheten	god
+forelskelse	forelsk
+forelsket	forelsk
+forelskelsen	forelsk
+forelske	forelsk
+kristen	krist
+kristendom	kristen
+kristendommen	kristendomm
+kristendommens	kristendomm
+fattig	fattig
+fattigdom	fattig
+fattigdommen	fattigdomm
+fattigdommens	fattigdomm
+# -het (see http://no.wiktionary.org/wiki/Kategori:Ord_som_ender_p%C3%A5_%C2%AB-het%C2%BB)
+hemmelig	hemmelig
+hemmelighet	hemmelig
+hemmelighets	hemmelig
+hemmeligheter	hemmelig
+hemmeligheten	hemmelig
+hemmelighetens	hemmelig
+kjærlig	kjærlig
+kjærlighet	kjærlig
+kjærligheter	kjærlig
+kjærligheten	kjærlig
+forlegen	forleg
+forlegenhet	forlegen
+forlegenheten	forlegen
+forlegenhetens	forlegen
+tvetydig	tvetydig
+tvetydighet	tvetydig
+tvetydigheter	tvetydig
+tvetydigheten	tvetydig
+tvetydighetens	tvetydig
+virkelig	virkelig
+virkelighet	virkelig
+virkeligheten	virkelig
+virkelighetens	virkelig
+# Adjectives
+billig	billig
+billigere	billig
+billigst	billig
+billige	billig
+frisk	frisk
+friskere	frisk
+friskest	frisk
+syk	syk
+sykere	syk
+sykest	syk
+#########################################
+# Words that should not be stemmed
+#
+# Irregular masculine nouns (not supposed to be handled correctly)
+# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
+vaffel	vaffel
+vafler	vafl
+vaflene	vafl
+tittel	tittel
+titler	titl
+titlene	titl
+kam	kam
+kammer	kamm
+kammene	kamm
+kamrene	kamr
+# Irregular feminine nouns, not handled
+ku	ku
+ku	ku
+kyr	kyr
+kuer	kuer
+kyrne	kyrn
+kuene	kuen
+datter	datt
+døtre	døtr
+døtrene	døtr
+# Other words that should not be touched
+abc	abc
+123	123
+Jens	Jens
+# Adjectives
+billig	billig
+billigere	billig
+billigst	billig
+billige	billig
+frisk	frisk
+friskere	frisk
+friskest	frisk
+# Irregular adjectives that should not be stemmed
+god	god
+bedre	bedr
+best	best
+# Verbs, should not be stemmed
+føle	føl
+følte	føl
+følt	følt
+

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt
(added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt
Tue Mar 20 10:57:50 2012
@@ -0,0 +1,99 @@
+#
+# Tests for Norwegian Bokmål minimal stemmer
+# It only tries to stem nouns, i.e. it is deliberately not very aggressive
+#
+# Nouns masculine
+bil	bil
+bilen	bil
+biler	bil
+bilene	bil
+bilens	bil
+bilenes	bil
+sekretæren	sekretær
+sekretær	sekretær
+sekretærene	sekretær
+kaker	kak
+kaken	kak
+kakene	kak
+kakenes	kak
+bibliotekar	bibliotekar
+bibliotekarer	bibliotekar
+bibliotekaren	bibliotekar
+bibliotekarens	bibliotekar
+bibliotekarene	bibliotekar
+bibliotekarenes	bibliotekar
+# Nouns feminine
+veske	vesk
+veska	vesk
+vesken	vesk
+veskene	vesk
+veskas	vesk
+# Nouns neutral
+huset	hus
+husene	hus
+husets	hus
+hus	hus
+huset	hus
+husene	hus
+husenes	hus
+flagg	flagg
+flagga	flagg
+flaggene	flagg
+flaggets	flagg
+flaggenes	flagg
+politi	politi
+politiet	politi
+politiets	politi
+politienes	politi
+#########################################
+# Words that should not be stemmed
+#
+# Irregular masculine nouns (not supposed to be handled correctly)
+# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
+vaffel	vaffel
+vafler	vafl
+vaflene	vafl
+tittel	tittel
+titler	titl
+titlene	titl
+kam	kam
+kammer	kamm
+kammene	kamm
+kamrene	kamr
+# Irregular feminine nouns, not handled
+ku	ku
+ku	ku
+kyr	kyr
+kuer	kuer
+kyrne	kyrn
+kuene	kuen
+datter	datt
+døtre	døtr
+døtrene	døtr
+# Other words that should not be touched
+abc	abc
+123	123
+Jens	Jens
+# Adjective, should not be stemmed
+billig	billig
+billigere	billiger
+billigst	billigst
+billige	billig
+god	god
+bedre	bedr
+best	best
+# General endings, should not be stemmed
+god	god
+godhet	godh
+forelskelse	forelskels
+kristendom	kristendom
+# Verbs, should not be stemmed
+føle	føl
+følte	følt
+følt	følt
+hemmelig	hemmelig
+hemmelighet	hemmeligh
+hemmeligheten	hemmelighet
+kjærlig	kjærlig
+kjærlighet	kjærlig
+kjærligheten	kjærlig
\ No newline at end of file

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1302833&r1=1302832&r2=1302833&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Tue Mar 20 10:57:50 2012
@@ -561,6 +561,8 @@ New Features
 
 * SOLR-2826: URLClassify Update Processor (janhoy)
 
+* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)
+
 Optimizations
 ----------------------
 * SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter



Mime
View raw message