lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1400566 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/ lucene/suggest/ lucene/suggest/src/j...
Date Sun, 21 Oct 2012 03:45:41 GMT
Author: rmuir
Date: Sun Oct 21 03:45:40 2012
New Revision: 1400566

URL: http://svn.apache.org/viewvc?rev=1400566&view=rev
Log:
SOLR-3906: add factory for AnalyzingSuggester

Added:
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
      - copied unchanged from r1400565, lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
    lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt
      - copied unchanged from r1400565, lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt
    lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java
      - copied unchanged from r1400565, lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml
    lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1400566&r1=1400565&r2=1400566&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sun Oct 21 03:45:40 2012
@@ -69,6 +69,10 @@ Bug Fixes
 * LUCENE-4479: Highlighter works correctly for fields with term vector
   positions, but no offsets.  (Alan Woodward)
 
+* SOLR-3906: JapaneseReadingFormFilter in romaji mode will return
+  romaji even for out-of-vocabulary kana cases (e.g. half-width forms).
+  (Robert Muir)
+
 Optimizations
 
 * LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java?rev=1400566&r1=1400565&r2=1400566&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java Sun Oct 21 03:45:40 2012
@@ -35,6 +35,7 @@ public final class JapaneseReadingFormFi
   private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
   private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
 
+  private StringBuilder buffer = new StringBuilder();
   private boolean useRomaji;
 
   public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
@@ -50,10 +51,19 @@ public final class JapaneseReadingFormFi
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       String reading = readingAttr.getReading();
-      if (reading != null) {
-        if (useRomaji) {
-          ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+      
+      if (useRomaji) {
+        if (reading == null) {
+          // if its an OOV term, just try the term text
+          buffer.setLength(0);
+          ToStringUtil.getRomanization(buffer, termAttr);
+          termAttr.setEmpty().append(buffer);
         } else {
+          ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+        }
+      } else {
+        // just replace the term text with the reading, if it exists
+        if (reading != null) {
           termAttr.setEmpty().append(reading);
         }
       }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java?rev=1400566&r1=1400565&r2=1400566&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java Sun Oct 21 03:45:40 2012
@@ -19,7 +19,9 @@ package org.apache.lucene.analysis.ja;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 
 import java.io.IOException;
@@ -52,12 +54,40 @@ public class TestJapaneseReadingFormFilt
         new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
     );
   }
+  
+  public void testKatakanaReadingsHalfWidth() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+        TokenStream stream = new CJKWidthFilter(tokenizer);
+        return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
+      }
+    };
+    assertAnalyzesTo(a, "今夜はロバート先生と話した",
+        new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
+    );
+  }
 
   public void testRomajiReadings() throws IOException {
     assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
         new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
     );
   }
+  
+  public void testRomajiReadingsHalfWidth() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+        TokenStream stream = new CJKWidthFilter(tokenizer);
+        return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
+      }
+    };
+    assertAnalyzesTo(a, "今夜はロバート先生と話した",
+        new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
+    );
+  }
 
   public void testRandomData() throws IOException {
     Random random = random();

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1400566&r1=1400565&r2=1400566&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Sun Oct 21 03:45:40 2012
@@ -127,7 +127,7 @@ public class AnalyzingSuggester extends 
   private final boolean exactFirst;
   
   /** 
-   * True if separator between tokens should be preservered.
+   * True if separator between tokens should be preserved.
    */
   private final boolean preserveSep;
 

Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1400566&r1=1400565&r2=1400566&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Sun Oct 21 03:45:40 2012
@@ -41,6 +41,10 @@ New Features
 * SOLR-3929: Support configuring IndexWriter max thread count in solrconfig.
   (phunt via Mark Miller)
 
+* SOLR-3906: Add support for AnalyzingSuggester (LUCENE-3842), where the
+  underlying analyzed form used for suggestions is separate from the returned
+  text.  (Robert Muir)
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml?rev=1400566&r1=1400565&r2=1400566&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml Sun Oct 21 03:45:40 2012
@@ -40,6 +40,14 @@
 	    <filter class="solr.TrimFilterFactory"/>
 	  </analyzer>
 	</fieldtype>
+	
+	<fieldtype name="ja_suggest" class="solr.TextField">
+	  <analyzer>
+	    <tokenizer class="solr.JapaneseTokenizerFactory" mode="normal"/>
+	    <filter class="solr.CJKWidthFilterFactory"/>
+	    <filter class="solr.JapaneseReadingFormFilterFactory" useRomaji="true"/>
+	  </analyzer>
+	</fieldtype>
   </types>
 
   <fields>

Modified: lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml?rev=1400566&r1=1400565&r2=1400566&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml Sun Oct 21 03:45:40 2012
@@ -43,6 +43,27 @@
     <str name="queryAnalyzerFieldType">phrase_suggest</str>
   </searchComponent>
   
+  <!-- AnalyzingLookup suggest component -->
+  <searchComponent class="solr.SpellCheckComponent" name="suggest_analyzing">
+    <lst name="spellchecker">
+      <str name="name">suggest_analyzing</str>
+      <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
+      <str name="lookupImpl">org.apache.solr.spelling.suggest.fst.AnalyzingLookupFactory</str>
+      <str name="storeDir">suggest_analyzing</str>
+      <str name="buildOnCommit">false</str>
+
+      <!-- Suggester properties -->
+      <bool name="exactMatchFirst">true</bool>
+      <str name="suggestAnalyzerFieldType">ja_suggest</str>
+      <bool name="preserveSep">false</bool>
+      
+      <str name="sourceLocation">jasuggest.txt</str>
+    </lst>
+    
+    <!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
+    <str name="queryAnalyzerFieldType">phrase_suggest</str>
+  </searchComponent>
+  
   <!-- is this thing just configured globally or wtf is going on here?! -->
   <queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
   
@@ -60,4 +81,18 @@
     </arr>
   </requestHandler>
   
+  <!--  analyzing (finite state automaton based) -->
+  <requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_analyzing">
+    <lst name="defaults">
+      <str name="spellcheck">true</str>
+      <str name="spellcheck.dictionary">suggest_analyzing</str>
+      <str name="spellcheck.collate">false</str>
+      <!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
+      <str name="spellcheck.onlyMorePopular">true</str>
+    </lst>
+    <arr name="components">
+      <str>suggest_analyzing</str>
+    </arr>
+  </requestHandler>
+  
 </config>



Mime
View raw message