lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c.@apache.org
Subject svn commit: r1360592 - in /lucene/dev/trunk/solr: CHANGES.txt core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java example/solr/collection1/conf/schema.xml
Date Thu, 12 Jul 2012 09:19:02 GMT
Author: cm
Date: Thu Jul 12 09:19:02 2012
New Revision: 1360592

URL: http://svn.apache.org/viewvc?rev=1360592&view=rev
Log:
Made discarding punctuation configurable in JapaneseTokenizerFactory (SOLR-3524)

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
    lucene/dev/trunk/solr/example/solr/collection1/conf/schema.xml

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1360592&r1=1360591&r2=1360592&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Jul 12 09:19:02 2012
@@ -78,6 +78,10 @@ Bug Fixes
 
 Other Changes
 
+* SOLR-3524: Make discarding punctuation configurable in JapaneseTokenizerFactory.
+  The default is to discard punctuation, but this is overridable as an expert option.
+  (Kazuaki Hiraga, Jun Ohtani via Christian Moen)
+
 * SOLR-1770: Move the default core instance directory into a collection1 folder.
   (Mark Miller)
   

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java?rev=1360592&r1=1360591&r2=1360592&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java Thu Jul 12 09:19:02 2012
@@ -42,9 +42,10 @@ import org.apache.lucene.analysis.util.R
  * <fieldType name="text_ja" class="solr.TextField">
  *   <analyzer>
  *     <tokenizer class="solr.JapaneseTokenizerFactory"
- *       mode=NORMAL
- *       userDictionary=user.txt
- *       userDictionaryEncoding=UTF-8
+ *       mode="NORMAL"
+ *       userDictionary="user.txt"
+ *       userDictionaryEncoding="UTF-8"
+ *       discardPunctuation="true"
  *     />
  *     <filter class="solr.JapaneseBaseFormFilterFactory"/>
  *   </analyzer>
@@ -58,9 +59,14 @@ public class JapaneseTokenizerFactory ex
   
   private static final String USER_DICT_ENCODING = "userDictionaryEncoding";
 
+  private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option
+
   private UserDictionary userDictionary;
+
   private Mode mode;
-  
+
+  private boolean discardPunctuation;
+
   @Override
   public void inform(ResourceLoader loader) {
     mode = getMode(args);
@@ -83,11 +89,12 @@ public class JapaneseTokenizerFactory ex
     } catch (Exception e) {
       throw new InitializationException("Exception thrown while loading dictionary", e);
     }
+    discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
   }
   
   @Override
   public Tokenizer create(Reader input) {
-    return new JapaneseTokenizer(input, userDictionary, true, mode);
+    return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
   }
   
   private Mode getMode(Map<String, String> args) {

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java?rev=1360592&r1=1360591&r2=1360592&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java Thu Jul 12 09:19:02 2012
@@ -74,7 +74,10 @@ public class TestJapaneseTokenizerFactor
         new String[] { "シニアソフトウェアエンジニア" }
     );
   }
-  
+
+  /**
+   * Test user dictionary
+   */
   public void testUserDict() throws IOException {
     String userDict = 
         "# Custom segmentation for long entries\n" +
@@ -92,4 +95,25 @@ public class TestJapaneseTokenizerFactor
         new String[] { "関西", "国際", "空港", "に",  "行っ", "た" }
     );
   }
+
+  /**
+   * Test preserving punctuation
+   */
+  public void testPreservePunctuation() throws IOException {
+    JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("discardPunctuation", "false");
+    factory.init(args);
+    factory.inform(new SolrResourceLoader(null, null));
+    TokenStream ts = factory.create(
+        new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。")
+    );
+    System.out.println(ts.toString());
+    assertTokenStreamContents(ts,
+        new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
+            "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
+            "楽しみ", "に", "し", "て", "い", "ます", "!",
+            "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"}
+    );
+  }
 }

Modified: lucene/dev/trunk/solr/example/solr/collection1/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/collection1/conf/schema.xml?rev=1360592&r1=1360591&r2=1360592&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/collection1/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/example/solr/collection1/conf/schema.xml Thu Jul 12 09:19:02 2012
@@ -923,6 +923,8 @@
 
            See lang/userdict_ja.txt for a sample user dictionary file.
 
+           Punctuation characters are discarded by default.  Use discardPunctuation="false" to keep them.
+
            See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
         -->
         <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>



Mime
View raw message