lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c.@apache.org
Subject svn commit: r1656670 - in /lucene/dev/trunk/lucene: ./ analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/ analysis/kuromoji/src/resources/META-INF/services/ analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/
Date Tue, 03 Feb 2015 10:44:12 GMT
Author: cm
Date: Tue Feb  3 10:44:11 2015
New Revision: 1656670

URL: http://svn.apache.org/r1656670
Log:
Added JapaneseNumberFilter (LUCENE-3922)

Added:
    lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
  (with props)
    lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilterFactory.java
  (with props)
    lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
  (with props)
    lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java
  (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1656670&r1=1656669&r2=1656670&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Feb  3 10:44:11 2015
@@ -31,6 +31,11 @@ API Changes
 
 ======================= Lucene 5.1.0 =======================
 
+New Features
+
+* LUCENE-3922: Added JapaneseNumberFilter that normalizes Japanese numbers
+  in kansuji form to regular/Arabic numbers. (Gaute Lambertsen, Christian Moen)
+
 Bug Fixes
 
 * Spatial pointsOnly flag on PrefixTreeStrategy shouldn't switch all predicates to

Added: lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java?rev=1656670&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
(added)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilter.java
Tue Feb  3 10:44:11 2015
@@ -0,0 +1,618 @@
+package org.apache.lucene.analysis.ja;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.math.BigDecimal;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+/**
+ * A {@link TokenFilter} that normalizes Japanese numbers (kansūji) to regular Arabic
+ * decimal numbers in half-width characters.
+ * <p>
+ * Japanese numbers are often written using a combination of kanji and Arabic numbers with
+ * various kinds of punctuation. For example, 3.2千 means 3200. This filter does this kind
+ * of normalization and allows a search for 3200 to match 3.2千 in text, but can also be
+ * used to make range facets based on the normalized numbers and so on.
+ * <p>
+ * Notice that this analyzer uses a token composition scheme and relies on punctuation
+ * tokens being found in the token stream. Please make sure your {@link JapaneseTokenizer}
+ * has {@code discardPunctuation} set to false. In case punctuation characters, such as ．
+ * (U+FF0E FULLWIDTH FULL STOP), are removed from the token stream, this filter would find
+ * input tokens 3 and 2千 and give outputs 3 and 2000 instead of 3200, which is
+ * likely not the intended result. If you want to remove punctuation characters from your
+ * index that are not part of normalized numbers, add a
+ * {@link org.apache.lucene.analysis.core.StopFilter} with the punctuation you wish to
+ * remove after {@link JapaneseNumberFilter} in your analyzer chain.
+ * <p>
+ * Below are some examples of normalizations this filter supports. The input is untokenized
+ * text and the result is the single term attribute emitted for the input.
+ * <ul>
+ * <li>〇〇七 becomes 7</li>
+ * <li>一〇〇〇 becomes 1000</li>
+ * <li>三千2百2十三 becomes 3223</li>
+ * <li>兆六百万五千一 becomes 1000006005001</li>
+ * <li>3.2千 becomes 3200</li>
+ * <li>1.2万345.67 becomes 12345.67</li>
+ * <li>4,647.100 becomes 4647.1</li>
+ * <li>15,7 becomes 157 (be aware of this weakness)</li>
+ * </ul>
+ * <p>
+ * Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left
+ * untouched and emitted as-is.
+ * <p>
+ * This filter does not use any part-of-speech information for its normalization and
+ * the motivation for this is to also support n-grammed token streams in the future.
+ * <p>
+ * This filter may in some cases normalize tokens that are not numbers in their context.
+ * For example, 田中京一 is a name and means Tanaka Kyōichi, but 京一 (Kyōichi) out of
+ * context can strictly speaking also represent the number 10000000000000001. This filter
+ * respects the {@link KeywordAttribute}, which can be used to prevent specific
+ * normalizations from happening.
+ * <p>
+ * Also notice that token attributes such as
+ * {@link org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute},
+ * {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute},
+ * {@link org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute} and
+ * {@link org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute} are left
+ * unchanged and will inherit the values of the last token used to compose the normalized
+ * number and can be wrong. Hence, for 10万 (10000), we will have
+ * {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute}
+ * set to マン. This is a known issue and is subject to a future improvement.
+ * <p>
+ * Japanese formal numbers (daiji), accounting numbers and decimal fractions are currently
+ * not supported.
+ */
+public class JapaneseNumberFilter extends TokenFilter {
+
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+  private final PositionIncrementAttribute posIncrAttr = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAttr = addAttribute(PositionLengthAttribute.class);
+
+  private static char NO_NUMERAL = Character.MAX_VALUE;
+
+  private static char[] numerals;
+
+  private static char[] exponents;
+
+  private State state;
+
+  private StringBuilder numeral;
+
+  private int fallThroughTokens;
+
+  static {
+    numerals = new char[0x10000];
+    for (int i = 0; i < numerals.length; i++) {
+      numerals[i] = NO_NUMERAL;
+    }
+    numerals['〇'] = 0; // 〇 U+3007 0
+    numerals['一'] = 1; // 一 U+4E00 1
+    numerals['二'] = 2; // 二 U+4E8C 2
+    numerals['三'] = 3; // 三 U+4E09 3
+    numerals['四'] = 4; // 四 U+56DB 4
+    numerals['五'] = 5; // 五 U+4E94 5
+    numerals['六'] = 6; // 六 U+516D 6
+    numerals['七'] = 7; // 七 U+4E03 7
+    numerals['八'] = 8; // 八 U+516B 8
+    numerals['九'] = 9; // 九 U+4E5D 9
+
+    exponents = new char[0x10000];
+    for (int i = 0; i < exponents.length; i++) {
+      exponents[i] = 0;
+    }
+    exponents['十'] = 1;  // 十 U+5341 10
+    exponents['百'] = 2;  // 百 U+767E 100
+    exponents['千'] = 3;  // 千 U+5343 1,000
+    exponents['万'] = 4;  // 万 U+4E07 10,000
+    exponents['億'] = 8;  // 億 U+5104 100,000,000
+    exponents['兆'] = 12; // 兆 U+5146 1,000,000,000,000
+    exponents['京'] = 16; // 京 U+4EAC 10,000,000,000,000,000
+    exponents['垓'] = 20; // 垓 U+5793 100,000,000,000,000,000,000
+  }
+
+  public JapaneseNumberFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+
+    // Emit previously captured token we read past earlier
+    if (state != null) {
+      restoreState(state);
+      state = null;
+      return true;
+    }
+
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    if (keywordAttr.isKeyword()) {
+      return true;
+    }
+
+    if (fallThroughTokens > 0) {
+      fallThroughTokens--;
+      return true;
+    }
+
+    if (posIncrAttr.getPositionIncrement() == 0) {
+      fallThroughTokens = posLengthAttr.getPositionLength() - 1;
+      return true;
+    }
+
+    boolean moreTokens = true;
+    boolean composedNumberToken = false;
+    int startOffset = 0;
+    int endOffset = 0;
+    State preCompositionState = captureState();
+    String term = termAttr.toString();
+    boolean numeralTerm = isNumeral(term);
+    
+    while (moreTokens && numeralTerm) {
+
+      if (!composedNumberToken) {
+        startOffset = offsetAttr.startOffset();
+        composedNumberToken = true;
+      }
+
+      endOffset = offsetAttr.endOffset();
+      moreTokens = input.incrementToken();
+
+      if (posIncrAttr.getPositionIncrement() == 0) {
+        // This token is a stacked/synonym token, capture number of tokens "under" this token,
+        // except the first token, which we will emit below after restoring state
+        fallThroughTokens = posLengthAttr.getPositionLength() - 1;
+        state = captureState();
+        restoreState(preCompositionState);
+        return moreTokens;
+      }
+
+      numeral.append(term);
+
+      if (moreTokens) {
+        term = termAttr.toString();
+        numeralTerm = isNumeral(term) || isNumeralPunctuation(term);
+      }
+    }
+
+    if (composedNumberToken) {
+      if (moreTokens) {
+        // We have read past all numerals and there are still tokens left, so
+        // capture the state of this token and emit it on our next incrementToken()
+        state = captureState();
+      }
+
+      String normalizedNumber = normalizeNumber(numeral.toString());
+
+      termAttr.setEmpty();
+      termAttr.append(normalizedNumber);
+      offsetAttr.setOffset(startOffset, endOffset);
+
+      numeral = new StringBuilder();
+      return true;
+    }
+    return moreTokens;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    fallThroughTokens = 0;
+    numeral = new StringBuilder();
+    state = null;
+  }
+
+  /**
+   * Normalizes a Japanese number
+   *
+   * @param number number to normalize
+   * @return normalized number, or number to normalize on error (no op)
+   */
+  public String normalizeNumber(String number) {
+    try {
+      BigDecimal normalizedNumber = parseNumber(new NumberBuffer(number));
+      if (normalizedNumber == null) {
+        return number;
+      }
+      return normalizedNumber.stripTrailingZeros().toPlainString();
+    } catch (NumberFormatException | ArithmeticException e) {
+      // Return the source number in case of error, i.e. malformed input
+      return number;
+    }
+  }
+
+  /**
+   * Parses a Japanese number
+   *
+   * @param buffer buffer to parse
+   * @return parsed number, or null on error or end of input
+   */
+  private BigDecimal parseNumber(NumberBuffer buffer) {
+    BigDecimal sum = BigDecimal.ZERO;
+    BigDecimal result = parseLargePair(buffer);
+
+    if (result == null) {
+      return null;
+    }
+
+    while (result != null) {
+      sum = sum.add(result);
+      result = parseLargePair(buffer);
+    }
+
+    return sum;
+  }
+
+  /**
+   * Parses a pair of large numbers, i.e. large kanji factor is 10,000 (万) or larger
+   *
+   * @param buffer buffer to parse
+   * @return parsed pair, or null on error or end of input
+   */
+  private BigDecimal parseLargePair(NumberBuffer buffer) {
+    BigDecimal first = parseMediumNumber(buffer);
+    BigDecimal second = parseLargeKanjiNumeral(buffer);
+
+    if (first == null && second == null) {
+      return null;
+    }
+
+    if (second == null) {
+      // If there's no second factor, we return the first one
+      // This can happen if our number is smaller than 10,000 (万)
+      return first;
+    }
+
+    if (first == null) {
+      // If there's no first factor, just return the second one,
+      // which is the same as multiplying by 1, i.e. with 万
+      return second;
+    }
+
+    return first.multiply(second);
+  }
+
+  /**
+   * Parses a "medium sized" number, typically less than 10,000 (万), but might be larger
+   * due to a larger factor from {@link #parseBasicNumber(NumberBuffer)}.
+   *
+   * @param buffer buffer to parse
+   * @return parsed number, or null on error or end of input
+   */
+  private BigDecimal parseMediumNumber(NumberBuffer buffer) {
+    BigDecimal sum = BigDecimal.ZERO;
+    BigDecimal result = parseMediumPair(buffer);
+
+    if (result == null) {
+      return null;
+    }
+
+    while (result != null) {
+      sum = sum.add(result);
+      result = parseMediumPair(buffer);
+    }
+
+    return sum;
+  }
+
+  /**
+   * Parses a pair of "medium sized" numbers, i.e. large kanji factor is at most 1,000(千)
+   *
+   * @param buffer buffer to parse
+   * @return parsed pair, or null on error or end of input
+   */
+  private BigDecimal parseMediumPair(NumberBuffer buffer) {
+
+    BigDecimal first = parseBasicNumber(buffer);
+    BigDecimal second = parseMediumKanjiNumeral(buffer);
+
+    if (first == null && second == null) {
+      return null;
+    }
+
+    if (second == null) {
+      // If there's no second factor, we return the first one
+      // This can happen if we just have a plain number such as 五
+      return first;
+    }
+
+    if (first == null) {
+      // If there's no first factor, just return the second one,
+      // which is the same as multiplying by 1, i.e. with 千
+      return second;
+    }
+
+    // Return factors multiplied
+    return first.multiply(second);
+  }
+
+  /**
+   * Parse a basic number, which is a sequence of Arabic numbers or a sequence of 0-9 kanji
+   * numerals (〇 to 九).
+   *
+   * @param buffer buffer to parse
+   * @return parsed number, or null on error or end of input
+   */
+  private BigDecimal parseBasicNumber(NumberBuffer buffer) {
+    StringBuilder builder = new StringBuilder();
+    int i = buffer.position();
+
+    while (i < buffer.length()) {
+      char c = buffer.charAt(i);
+
+      if (isArabicNumeral(c)) {
+        // Arabic numerals; 0 to 9 or ０ to ９ (full-width)
+        builder.append(arabicNumeralValue(c));
+      } else if (isKanjiNumeral(c)) {
+        // Kanji numerals; 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九
+        builder.append(kanjiNumeralValue(c));
+      } else if (isDecimalPoint(c)) {
+        builder.append(".");
+      } else if (isThousandSeparator(c)) {
+        // Just skip and move to the next character
+      } else {
+        // We don't have an Arabic nor kanji numeral, nor separation or punctuation, so
+        // we'll stop.
+        break;
+      }
+
+      i++;
+      buffer.advance();
+    }
+
+    if (builder.length() == 0) {
+      // We didn't build anything, so we don't have a number
+      return null;
+    }
+
+    return new BigDecimal(builder.toString());
+  }
+
+  /**
+   * Parse large kanji numerals (ten thousands or larger)
+   *
+   * @param buffer buffer to parse
+   * @return parsed number, or null on error or end of input
+   */
+  public BigDecimal parseLargeKanjiNumeral(NumberBuffer buffer) {
+    int i = buffer.position();
+
+    if (i >= buffer.length()) {
+      return null;
+    }
+
+    char c = buffer.charAt(i);
+    int power = exponents[c];
+
+    if (power > 3) {
+      buffer.advance();
+      return BigDecimal.TEN.pow(power);
+    }
+
+    return null;
+  }
+
+  /**
+   * Parse medium kanji numerals (tens, hundreds or thousands)
+   *
+   * @param buffer buffer to parse
+   * @return parsed number or null on error
+   */
+  public BigDecimal parseMediumKanjiNumeral(NumberBuffer buffer) {
+    int i = buffer.position();
+
+    if (i >= buffer.length()) {
+      return null;
+    }
+
+    char c = buffer.charAt(i);
+    int power = exponents[c];
+
+    if (1 <= power && power <= 3) {
+      buffer.advance();
+      return BigDecimal.TEN.pow(power);
+    }
+
+    return null;
+  }
+
+  /**
+   * Numeral predicate
+   *
+   * @param input string to test
+   * @return true if and only if input is a numeral
+   */
+  public boolean isNumeral(String input) {
+    for (int i = 0; i < input.length(); i++) {
+      if (!isNumeral(input.charAt(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Numeral predicate
+   *
+   * @param c character to test
+   * @return true if and only if c is a numeral
+   */
+  public boolean isNumeral(char c) {
+    return isArabicNumeral(c) || isKanjiNumeral(c) || exponents[c] > 0;
+  }
+
+  /**
+   * Numeral punctuation predicate
+   *
+   * @param input string to test
+   * @return true if and only if input is a numeral punctuation string
+   */
+  public boolean isNumeralPunctuation(String input) {
+    for (int i = 0; i < input.length(); i++) {
+      if (!isNumeralPunctuation(input.charAt(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Numeral punctuation predicate
+   *
+   * @param c character to test
+   * @return true if and only if c is a numeral punctuation character
+   */
+  public boolean isNumeralPunctuation(char c) {
+    return isDecimalPoint(c) || isThousandSeparator(c);
+  }
+
+  /**
+   * Arabic numeral predicate. Both half-width and full-width characters are supported
+   *
+   * @param c character to test
+   * @return true if and only if c is an Arabic numeral
+   */
+  public boolean isArabicNumeral(char c) {
+    return isHalfWidthArabicNumeral(c) || isFullWidthArabicNumeral(c);
+  }
+
+  /**
+   * Arabic half-width numeral predicate
+   *
+   * @param c character to test
+   * @return true if and only if c is a half-width Arabic numeral
+   */
+  private boolean isHalfWidthArabicNumeral(char c) {
+    // 0 U+0030 - 9 U+0039
+    return '0' <= c && c <= '9';
+  }
+
+  /**
+   * Arabic full-width numeral predicate
+   *
+   * @param c character to test
+   * @return true if and only if c is a full-width Arabic numeral
+   */
+  private boolean isFullWidthArabicNumeral(char c) {
+    // ０ U+FF10 - ９ U+FF19
+    return '０' <= c && c <= '９';
+  }
+
+  /**
+   * Returns the numeric value for the specified character Arabic numeral.
+   * Behavior is undefined if a non-Arabic numeral is provided
+   *
+   * @param c arabic numeral character
+   * @return numeral value
+   */
+  private int arabicNumeralValue(char c) {
+    int offset;
+    if (isHalfWidthArabicNumeral(c)) {
+      offset = '0';
+    } else {
+      offset = '０';
+    }
+    return c - offset;
+  }
+
+  /**
+   * Kanji numeral predicate that tests if the provided character is one of 〇, 一, 二, 三,
+   * 四, 五, 六, 七, 八, or 九. Larger number kanji gives a false value.
+   *
+   * @param c character to test
+   * @return true if and only if the character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八,
+   *         or 九 (0 to 9)
+   */
+  private boolean isKanjiNumeral(char c) {
+    return numerals[c] != NO_NUMERAL;
+  }
+
+  /**
+   * Returns the value for the provided kanji numeral. Only numeric values for the characters
+   * where {@link #isKanjiNumeral(char)} returns true are supported - behavior is undefined
+   * for other characters.
+   *
+   * @param c kanji numeral character
+   * @return numeral value
+   * @see #isKanjiNumeral(char)
+   */
+  private int kanjiNumeralValue(char c) {
+    return numerals[c];
+  }
+
+  /**
+   * Decimal point predicate
+   *
+   * @param c character to test
+   * @return true if and only if c is a decimal point
+   */
+  private boolean isDecimalPoint(char c) {
+    return c == '.'   // U+002E FULL STOP 
+        || c == '．'; // U+FF0E FULLWIDTH FULL STOP
+  }
+
+  /**
+   * Thousand separator predicate
+   *
+   * @param c character to test
+   * @return true if and only if c is a thousand separator
+   */
+  private boolean isThousandSeparator(char c) {
+    return c == ','   // U+002C COMMA
+        || c == '，'; // U+FF0C FULLWIDTH COMMA
+  }
+
+  /**
+   * Buffer that holds a Japanese number string and a position index used as a parsed-to
+   * marker
+   */
+  public static class NumberBuffer {
+
+    private int position;
+
+    private String string;
+
+    public NumberBuffer(String string) {
+      this.string = string;
+      this.position = 0;
+    }
+
+    public char charAt(int index) {
+      return string.charAt(index);
+    }
+
+    public int length() {
+      return string.length();
+    }
+
+    public void advance() {
+      position++;
+    }
+
+    public int position() {
+      return position;
+    }
+  }
+}
\ No newline at end of file

Added: lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilterFactory.java?rev=1656670&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilterFactory.java
(added)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseNumberFilterFactory.java
Tue Feb  3 10:44:11 2015
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.ja;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link JapaneseNumberFilter}.
+ * <p>
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.JapaneseTokenizerFactory" discardPunctuation="false"/&gt;
+ *     &lt;filter class="solr.JapaneseNumberFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ * <p>
+ * It is important that punctuation is not discarded by the tokenizer so use
+ * {@code discardPunctuation="false"} in your {@link JapaneseTokenizerFactory}.
+ */
+public class JapaneseNumberFilterFactory extends TokenFilterFactory {
+
+  public JapaneseNumberFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new JapaneseNumberFilter(input);
+  }
+}

Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1656670&r1=1656669&r2=1656670&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
(original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
Tue Feb  3 10:44:11 2015
@@ -15,5 +15,6 @@
 
 org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory
 org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory
+org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory
 org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory
 org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory

Added: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java?rev=1656670&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
(added)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilter.java
Tue Feb  3 10:44:11 2015
@@ -0,0 +1,304 @@
+package org.apache.lucene.analysis.ja;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Tests for {@link JapaneseNumberFilter}, run on top of a {@link JapaneseTokenizer}
+ * in {@link JapaneseTokenizer.Mode#SEARCH} mode.
+ */
+public class TestJapaneseNumberFilter extends BaseTokenStreamTestCase {
+
+  /** Analyzer under test: a SEARCH-mode JapaneseTokenizer followed by a JapaneseNumberFilter. */
+  private final Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
+      return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(tokenizer));
+    }
+  };
+
+  /** Numbers embedded in ordinary sentences; also checks start and end offsets. */
+  @Test
+  public void testBasics() throws IOException {
+
+    assertAnalyzesTo(analyzer, "本日十万二千五百円のワインを買った",
+        new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"},
+        new int[]{0, 2, 8, 9, 10, 13, 14, 16},
+        new int[]{2, 8, 9, 10, 13, 14, 16, 17}
+    );
+
+    assertAnalyzesTo(analyzer, "昨日のお寿司は10万円でした。",
+        new String[]{"昨日", "の", "お", "寿司", "は", "100000", "円", "でし", "た", "。"},
+        new int[]{0, 2, 3, 4, 6, 7, 10, 11, 13, 14},
+        new int[]{2, 3, 4, 6, 7, 10, 11, 13, 14, 15}
+    );
+
+    assertAnalyzesTo(analyzer, "アティリカの資本金は600万円です",
+        new String[]{"アティリカ", "の", "資本", "金", "は", "6000000", "円", "です"},
+        new int[]{0, 5, 6, 8, 9, 10, 14, 15},
+        new int[]{5, 6, 8, 9, 10, 14, 15, 17}
+    );
+  }
+
+  /**
+   * Different spellings of the same number. The full-width digit inputs are written
+   * as unicode escapes so that they survive text normalization of this file.
+   */
+  @Test
+  public void testVariants() throws IOException {
+    // Test variants of three: half-width digit, full-width digit, kanji
+    assertAnalyzesTo(analyzer, "3", new String[]{"3"});
+    assertAnalyzesTo(analyzer, "\uFF13", new String[]{"3"});
+    assertAnalyzesTo(analyzer, "三", new String[]{"3"});
+
+    // Test variants of three with leading zeros
+    assertAnalyzesTo(analyzer, "03", new String[]{"3"});
+    assertAnalyzesTo(analyzer, "\uFF10\uFF13", new String[]{"3"});
+    assertAnalyzesTo(analyzer, "〇三", new String[]{"3"});
+    assertAnalyzesTo(analyzer, "003", new String[]{"3"});
+    assertAnalyzesTo(analyzer, "\uFF10\uFF10\uFF13", new String[]{"3"});
+    assertAnalyzesTo(analyzer, "〇〇三", new String[]{"3"});
+
+    // Test thousand variants
+    assertAnalyzesTo(analyzer, "千", new String[]{"1000"});
+    assertAnalyzesTo(analyzer, "1千", new String[]{"1000"});
+    assertAnalyzesTo(analyzer, "\uFF11千", new String[]{"1000"});
+    assertAnalyzesTo(analyzer, "一千", new String[]{"1000"});
+    assertAnalyzesTo(analyzer, "一〇〇〇", new String[]{"1000"});
+    assertAnalyzesTo(analyzer, "10百", new String[]{"1000"}); // Strange, but supported
+  }
+
+  /** Large numbers, including results that exceed the range of a long. */
+  @Test
+  public void testLargeVariants() throws IOException {
+    // Test large numbers
+    assertAnalyzesTo(analyzer, "三五七八九", new String[]{"35789"});
+    assertAnalyzesTo(analyzer, "六百二万五千一", new String[]{"6025001"});
+    assertAnalyzesTo(analyzer, "兆六百万五千一", new String[]{"1000006005001"});
+    assertAnalyzesTo(analyzer, "十兆六百万五千一", new String[]{"10000006005001"});
+    assertAnalyzesTo(analyzer, "一京一", new String[]{"10000000000000001"});
+    assertAnalyzesTo(analyzer, "十京十", new String[]{"100000000000000010"});
+    assertAnalyzesTo(analyzer, "垓京兆億万千百十一", new String[]{"100010001000100011111"});
+  }
+
+  /** The minus sign is emitted as its own token; only the digits are normalized. */
+  @Test
+  public void testNegative() throws IOException {
+    assertAnalyzesTo(analyzer, "-100万", new String[]{"-", "1000000"});
+  }
+
+  /** Arabic digits and kanji numerals mixed within one number. */
+  @Test
+  public void testMixed() throws IOException {
+    // Test mixed numbers
+    assertAnalyzesTo(analyzer, "三千2百2十三", new String[]{"3223"});
+    assertAnalyzesTo(analyzer, "32二三", new String[]{"3223"});
+  }
+
+  /** Numerals that are part of a compound word which the tokenizer emits as stacked tokens. */
+  @Test
+  public void testNininsankyaku() throws IOException {
+    // Unstacked tokens
+    assertAnalyzesTo(analyzer, "二", new String[]{"2"});
+    assertAnalyzesTo(analyzer, "二人", new String[]{"2", "人"});
+    assertAnalyzesTo(analyzer, "二人三", new String[]{"2", "人", "3"});
+    // Stacked tokens - emit tokens as they are
+    assertAnalyzesTo(analyzer, "二人三脚", new String[]{"二", "二人三脚", "人", "三", "脚"});
+  }
+
+  /** Stacked tokens containing a numeral, followed by a plain number. */
+  @Test
+  public void testFujiyaichinisanu() throws IOException {
+    // Stacked tokens with a numeral partial
+    assertAnalyzesTo(analyzer, "不二家一二三", new String[]{"不", "不二家", "二", "家", "123"});
+  }
+
+  /** Pins the current results for inconsistent input; the trailing comments give alternative readings. */
+  @Test
+  public void testFunny() throws IOException {
+    // Test some oddities for inconsistent input
+    assertAnalyzesTo(analyzer, "十十", new String[]{"20"}); // 100?
+    assertAnalyzesTo(analyzer, "百百百", new String[]{"300"}); // 10,000?
+    assertAnalyzesTo(analyzer, "千千千千", new String[]{"4000"}); // 1,000,000,000,000?
+  }
+
+  /** Kanji numerals used positionally, like Arabic digits. */
+  @Test
+  public void testKanjiArabic() throws IOException {
+    // Test kanji numerals used as Arabic numbers (with leading zero)
+    assertAnalyzesTo(analyzer, "〇一二三四五六七八九九八七六五四三二一〇",
+        new String[]{"1234567899876543210"}
+    );
+
+    // I'm Bond, James "normalized" Bond...
+    assertAnalyzesTo(analyzer, "〇〇七", new String[]{"7"});
+  }
+
+  /** Two zeros collapse into a single "0" token spanning both characters. */
+  @Test
+  public void testDoubleZero() throws IOException {
+    assertAnalyzesTo(analyzer, "〇〇",
+        new String[]{"0"},
+        new int[]{0},
+        new int[]{2},
+        new int[]{1}
+    );
+  }
+
+  /** A personal name made of numeral characters is normalized unless it is marked as a keyword. */
+  @Test
+  public void testName() throws IOException {
+    // Test name that normalises to number
+    assertAnalyzesTo(analyzer, "田中京一",
+        new String[]{"田中", "10000000000000001"}, // 京一 is normalized to a number
+        new int[]{0, 2},
+        new int[]{2, 4},
+        new int[]{1, 1}
+    );
+
+    // An analyzer that marks 京一 as a keyword
+    Analyzer keywordMarkingAnalyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        CharArraySet set = new CharArraySet(1, false);
+        set.add("京一");
+
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
+      }
+    };
+
+    assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一",
+        new String[]{"田中", "京一"}, // 京一 is not normalized
+        new int[]{0, 2},
+        new int[]{2, 4},
+        new int[]{1, 1}
+    );
+  }
+
+  /** Decimal Arabic numbers combined with a kanji multiplier. */
+  @Test
+  public void testDecimal() throws IOException {
+    // Test Arabic numbers with punctuation, i.e. 1.2万 (12000) followed by 345.67
+    assertAnalyzesTo(analyzer, "1.2万345.67",
+        new String[]{"12345.67"}
+    );
+  }
+
+  /** A decimal number with a kanji multiplier, followed by a non-numeric token. */
+  @Test
+  public void testDecimalPunctuation() throws IOException {
+    // Test Arabic numbers with punctuation, i.e. 3.2 thousands yen
+    assertAnalyzesTo(analyzer, "3.2千円",
+        new String[]{"3200", "円"}
+    );
+  }
+
+  /** A comma inside a number is dropped. */
+  @Test
+  public void testThousandSeparator() throws IOException {
+    assertAnalyzesTo(analyzer, "4,647",
+        new String[]{"4647"}
+    );
+  }
+
+  /** Comma removal combined with a decimal part; trailing fraction zeros are stripped. */
+  @Test
+  public void testDecimalThousandSeparator() throws IOException {
+    assertAnalyzesTo(analyzer, "4,647.0010",
+        new String[]{"4647.001"}
+    );
+  }
+
+  /** The comma is removed rather than treated as a decimal point, so "15,7" becomes "157". */
+  @Test
+  public void testCommaDecimalSeparator() throws IOException {
+    assertAnalyzesTo(analyzer, "15,7",
+        new String[]{"157"}
+    );
+  }
+
+  /** Trailing zeros of the fraction are removed, together with the point if nothing remains. */
+  @Test
+  public void testTrailingZeroStripping() throws IOException {
+    assertAnalyzesTo(analyzer, "1000.1000",
+        new String[]{"1000.1"}
+    );
+    assertAnalyzesTo(analyzer, "1000.0000",
+        new String[]{"1000"}
+    );
+  }
+
+  /** Empty input produces no tokens. */
+  @Test
+  public void testEmpty() throws IOException {
+    assertAnalyzesTo(analyzer, "", new String[]{});
+  }
+
+  @Test
+  public void testRandomHugeStrings() throws Exception {
+    checkRandomData(random(), analyzer, 50 * RANDOM_MULTIPLIER, 8192);
+  }
+
+  @Test
+  public void testRandomSmallStrings() throws Exception {
+    checkRandomData(random(), analyzer, 500 * RANDOM_MULTIPLIER, 128);
+  }
+
+  /** Consistency check on one fixed input: two zeros followed by assorted U+30xx characters. */
+  @Test
+  public void testFunnyIssue() throws Exception {
+    BaseTokenStreamTestCase.checkAnalysisConsistency(
+        random(), analyzer, true, "〇〇\u302f\u3029\u3039\u3023\u3033\u302bB", true
+    );
+  }
+
+  /**
+   * Development aid, ignored by default. Reads {@code /tmp/test.txt} and writes one token
+   * per line to {@code /tmp/test.tok.txt} (tokenizer only) and {@code /tmp/test.norm.txt}
+   * (tokenizer plus number filter) so the two outputs can be compared.
+   */
+  @Ignore("This test is used during development when analyze normalizations in large amounts of text")
+  @Test
+  public void testLargeData() throws IOException {
+    Path input = Paths.get("/tmp/test.txt");
+    Path tokenizedOutput = Paths.get("/tmp/test.tok.txt");
+    Path normalizedOutput = Paths.get("/tmp/test.norm.txt");
+
+    Analyzer plainAnalyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer);
+      }
+    };
+
+    analyze(
+        plainAnalyzer,
+        Files.newBufferedReader(input, StandardCharsets.UTF_8),
+        Files.newBufferedWriter(tokenizedOutput, StandardCharsets.UTF_8)
+    );
+
+    analyze(
+        analyzer,
+        Files.newBufferedReader(input, StandardCharsets.UTF_8),
+        Files.newBufferedWriter(normalizedOutput, StandardCharsets.UTF_8)
+    );
+  }
+
+  /**
+   * Analyzes everything available from {@code reader} and writes each term, followed by a
+   * newline, to {@code writer}. The reader, the writer and the token stream are always
+   * closed, even when analysis fails part-way through.
+   *
+   * @param analyzer analyzer used to produce the token stream
+   * @param reader   source text; closed by this method
+   * @param writer   destination for the terms; closed by this method
+   * @throws IOException if reading, analyzing or writing fails
+   */
+  public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
+    // Resources are closed in reverse order: token stream first, then writer, then reader.
+    try (Reader in = reader;
+         Writer out = writer;
+         TokenStream stream = analyzer.tokenStream("dummy", in)) {
+      CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
+
+      stream.reset();
+      while (stream.incrementToken()) {
+        out.write(termAttr.toString());
+        out.write("\n");
+      }
+      // Complete the consumer workflow before the stream is closed.
+      stream.end();
+    }
+  }
+}

Added: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java?rev=1656670&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseNumberFilterFactory.java Tue Feb  3 10:44:11 2015
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.ja;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Simple tests for {@link org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory}
+ */
+public class TestJapaneseNumberFilterFactory extends BaseTokenStreamTestCase {
+
+  /**
+   * Builds a tokenizer through {@link JapaneseTokenizerFactory} with
+   * {@code discardPunctuation=false}, wraps it with a filter created by the factory
+   * under test, and checks the emitted terms.
+   */
+  public void testBasics() throws IOException {
+
+    Map<String, String> args = new HashMap<>();
+    args.put("discardPunctuation", "false");
+
+    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(args);
+
+    tokenizerFactory.inform(new StringMockResourceLoader(""));
+    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
+    ((Tokenizer)tokenStream).setReader(new StringReader("昨日のお寿司は10万円でした。"));
+
+    JapaneseNumberFilterFactory factory = new JapaneseNumberFilterFactory(new HashMap<>());
+    tokenStream = factory.create(tokenStream);
+    assertTokenStreamContents(tokenStream,
+        new String[] { "昨日", "の", "お", "寿司", "は", "100000", "円", "でし", "た", "。" }
+    );
+  }
+  
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    // A plain map instead of double-brace initialization, which would create an
+    // anonymous HashMap subclass just to hold one entry.
+    Map<String, String> bogusArgs = new HashMap<>();
+    bogusArgs.put("bogusArg", "bogusValue");
+    try {
+      new JapaneseNumberFilterFactory(bogusArgs);
+      fail("expected IllegalArgumentException for unknown parameter");
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameters"));
+    }
+  }
+}



Mime
View raw message