lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r940447 [2/2] - in /lucene/dev/trunk/lucene/contrib: ./ icu/ icu/src/data/uax29/ icu/src/java/ icu/src/java/org/apache/lucene/analysis/icu/segmentation/ icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ icu/src/resources/org/apac...
Date Mon, 03 May 2010 13:20:10 GMT
Added: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
(added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
Mon May  3 13:20:09 2010
@@ -0,0 +1,225 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
+
+import java.util.Arrays;
+
+public class TestICUTokenizer extends BaseTokenStreamTestCase {
+  
+  /** Tokens that follow 4094 characters of leading whitespace must still be returned. */
+  public void testHugeDoc() throws IOException {
+    char[] padding = new char[4094];
+    Arrays.fill(padding, ' ');
+    String input = new String(padding) + "testing 1234";
+    String[] expected = { "testing", "1234" };
+    assertTokenStreamContents(new ICUTokenizer(new StringReader(input)), expected);
+  }
+  
+  /**
+   * Feeds one unbroken run of 40960 'a' characters and expects it back as
+   * ten tokens of 4096 characters each.
+   */
+  public void testHugeTerm2() throws IOException {
+    char[] run = new char[40960];
+    Arrays.fill(run, 'a');
+    String input = new String(run);
+
+    // every expected token is the same 4096-character slice of the input
+    String[] expected = new String[10];
+    Arrays.fill(expected, input.substring(0, 4096));
+
+    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+    assertTokenStreamContents(tokenizer, expected);
+  }
+  
+  private Analyzer a = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer tokenizer = new ICUTokenizer(reader);
+      TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
+      return new TokenStreamComponents(tokenizer, filter);
+    }
+  };
+
+  public void testArmenian() throws Exception { // Armenian sample: expected tokens are lowercased, punctuation yields none, "4,600" stays one token
+    assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն
հոդվածները (4,600` հայերեն վիքիպեդիայում)
գրվել են կամավորների կողմից
ու համարյա բոլոր հոդվածները
կարող է խմբագրել ցանկաց մարդ
ով կարող է բացել Վիքիպեդիայի
կայքը։",
+        new String[] { "վիքիպեդիայի", "13", "միլիոն",
"հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում",
"գրվել", "են", "կամավորների", "կողմից",

+        "ու", "համարյա", "բոլոր", "հոդվածները",
"կարող", "է", "խմբագրել", "ցանկաց",
"մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի",
"կայքը" } );
+  }
+  
+  public void testAmharic() throws Exception { // Amharic sample: parentheses and the sentence-final mark are expected to yield no tokens
+    assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ
የተሟላ ትክክለኛና ነጻ መዝገበ
ዕውቀት (ኢንሳይክሎፒዲያ) ነው።
ማንኛውም",
+        new String[] { "ዊኪፔድያ", "የባለ", "ብዙ",
"ቋንቋ", "የተሟላ", "ትክክለኛና",
"ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ",
"ነው", "ማንኛውም" } );
+  }
+  
+  public void testArabic() throws Exception { // Arabic with an embedded English title: quotes, colons and parentheses yield no tokens; the Latin words are expected lowercased
+    assertAnalyzesTo(a, "الفيلم الوثائقي الأول
عن ويكيبيديا يسمى \"الحقيقة
بالأرقام: قصة ويكيبيديا\" (بالإنجليزية:
Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في
2008.",
+        new String[] { "الفيلم", "الوثائقي", "الأول",
"عن", "ويكيبيديا", "يسمى", "الحقيقة",
"بالأرقام", "قصة", "ويكيبيديا",
+        "بالإنجليزية", "truth", "in", "numbers", "the",
"wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" }
); 
+  }
+  
+  public void testAramaic() throws Exception { // Aramaic sample with one Latin word (expected lowercased); the letters preceding each quoted word are expected as separate tokens
+    assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia)
ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ
ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ
ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia",
"ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ",
"ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ",
"ܐܝܢܣܩܠܘܦܕܝܐ"});
+  }
+  
+  public void testBengali() throws Exception { // Bengali sample: parentheses, the comma and the sentence-final marks are expected to yield no tokens
+    assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা
করে উইকিমিডিয়া ফাউন্ডেশন
(একটি অলাভজনক সংস্থা)।
উইকিপিডিয়ার শুরু
১৫ জানুয়ারি, ২০০১
সালে। এখন পর্যন্ত
২০০টিরও বেশী ভাষায়
উইকিপিডিয়া রয়েছে।",
+        new String[] { "এই", "বিশ্বকোষ",
"পরিচালনা", "করে", "উইকিমিডিয়া",
"ফাউন্ডেশন", "একটি", "অলাভজনক",
"সংস্থা", "উইকিপিডিয়ার",
+        "শুরু", "১৫", "জানুয়ারি",
"২০০১", "সালে", "এখন", "পর্যন্ত",
"২০০টিরও", "বেশী", "ভাষায়",
"উইকিপিডিয়া", "রয়েছে"
});
+  }
+  
+  public void testFarsi() throws Exception { // Farsi sample: each run of native digits is expected as a single token and the final "." yields none
+    assertAnalyzesTo(a, "ویکی پدیای انگلیسی
در تاریخ ۲۵ دی ۱۳۷۹ به صورت
مکملی برای دانشنامهٔ تخصصی
نوپدیا نوشته شد.",
+        new String[] { "ویکی", "پدیای", "انگلیسی",
"در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به",
"صورت", "مکملی",
+        "برای", "دانشنامهٔ", "تخصصی",
"نوپدیا", "نوشته", "شد" });
+  }
+  
+  public void testGreek() throws Exception { // Greek sample: expected tokens are lowercased and the final sigma is expected folded ("εθελοντές" -> "εθελοντέσ")
+    assertAnalyzesTo(a, "Γράφεται σε συνεργασία
από εθελοντές με το λογισμικό
wiki, κάτι που σημαίνει ότι άρθρα
μπορεί να προστεθούν ή να αλλάξουν
από τον καθένα.",
+        new String[] { "γράφεται", "σε", "συνεργασία",
"από", "εθελοντέσ", "με", "το", "λογισμικό",
"wiki", "κάτι", "που",
+        "σημαίνει", "ότι", "άρθρα", "μπορεί",
"να", "προστεθούν", "ή", "να", "αλλάξουν",
"από", "τον", "καθένα" });
+  }
+  
+  public void testLao() throws Exception {
+    assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ",
"ດອກ" });
+  }
+  
+  public void testThai() throws Exception { // Thai sample: unspaced text is expected to split into words; "." and "?" yield no tokens and the Thai-digit run stays whole
+    assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี.
แล้วเธอจะไปไหน? ๑๒๓๔",
+        new String[] { "การ", "ที่", "ได้",
"ต้อง", "แสดง", "ว่า", "งาน",
"ดี", "แล้ว", "เธอ", "จะ", "ไป",
"ไหน", "๑๒๓๔"});
+  }
+  
+  public void testTibetan() throws Exception { // Tibetan sample: each syllable is expected as its own token, without the separator marks or the closing punctuation
+    assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ།
།",
+        new String[] { "སྣོན", "མཛོད", "དང",
"ལས", "འདིས", "བོད", "ཡིག", "མི",
"ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང",
"བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས",
"སོ" });
+  }
+  
+  /*
+   * For Chinese, tokenize per character (these can later form bigrams or whatever).
+   * Note that the numeric run in the input is not among the expected tokens.
+   * TODO: why do full-width numerics have no word-break prop?
+   */
+  public void testChinese() throws Exception {
+    assertAnalyzesTo(a, "我是中国人。 1234 Tests
",
+        new String[] { "我", "是", "中", "国", "人", "tests"});
+  }
+  
+  public void testEmpty() throws Exception {
+    assertAnalyzesTo(a, "", new String[] {});
+    assertAnalyzesTo(a, ".", new String[] {});
+    assertAnalyzesTo(a, " ", new String[] {});
+  }
+  
+  /* test various jira issues this analyzer is related to */
+  
+  public void testLUCENE1545() throws Exception {
+    /*
+     * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING
LATIN SMALL LETTER E.
+     * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character
is lost.
+     * Expected result is only one token "moͤchte".
+     */
+    assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
+  }
+  
+  /* Tests from StandardAnalyzer, just to show behavior is similar */
+  public void testAlphanumericSA() throws Exception {
+    // alphanumeric tokens
+    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+    assertAnalyzesTo(a, "2B", new String[]{"2b"});
+  }
+
+  public void testDelimitersSA() throws Exception {
+    // other delimiters: "-", "/", ","
+    assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+  }
+
+  public void testApostrophesSA() throws Exception {
+    // internal apostrophes: O'Reilly, you're, O'Reilly's
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+    assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
+  }
+
+  /**
+   * Numeric forms: floating point, serial and model numbers, IP addresses.
+   * Every other segment must have at least one digit.
+   */
+  public void testNumericSA() throws Exception {
+    assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+    // The IP address case used to be asserted twice, verbatim, against the
+    // same analyzer; the second copy added no coverage and is dropped.
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+  }
+
+  public void testTextWithNumbersSA() throws Exception {
+    // numbers
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+  }
+
+  public void testVariousTextSA() throws Exception {
+    // various
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers",
"wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo",
"bar"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+  }
+
+  public void testKoreanSA() throws Exception {
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다",
new String[]{"안녕하세요", "한글입니다"});
+  }
+  
+  public void testReusableTokenStream() throws Exception { // Tibetan sample checked through assertAnalyzesToReuse rather than assertAnalyzesTo
+    assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ།
།",
+        new String[] { "སྣོན", "མཛོད", "དང",
"ལས", "འདིས", "བོད", "ཡིག", "མི",
"ཉམས", "གོང", 
+                      "འཕེལ", "དུ", "གཏོང",
"བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས",
"སོ" });
+  }
+  
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(a, "David has 5000 bones", 
+        new String[] {"david", "has", "5000", "bones"},
+        new int[] {0, 6, 10, 15},
+        new int[] {5, 9, 14, 20});
+  }
+  
+  public void testTypes() throws Exception {
+    assertAnalyzesTo(a, "David has 5000 bones", 
+        new String[] {"david", "has", "5000", "bones"},
+        new String[] { "<WORD>", "<WORD>", "<NUM>", "<WORD>" });
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java
(added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java
Mon May  3 13:20:09 2010
@@ -0,0 +1,90 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.InputStream;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * Tests LaoBreakIterator and its RBBI rules
+ */
+public class TestLaoBreakIterator extends LuceneTestCase {
+  private BreakIterator wordIterator;
+  
+  /**
+   * Builds the iterator under test from the compiled "Lao.brk" rules found
+   * on the classpath.
+   *
+   * @throws Exception if the rules cannot be loaded
+   */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    InputStream is = getClass().getResourceAsStream("Lao.brk");
+    // fail with a readable message instead of a NullPointerException further down
+    assertNotNull("Lao.brk resource not found", is);
+    try {
+      wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
+    } finally {
+      // close the stream even when the rules fail to load
+      is.close();
+    }
+  }
+  
+  /**
+   * Asserts that the segments of {@code sourceText} accepted by
+   * {@link #isWord} are exactly {@code tokens}, in order, and that the
+   * iterator is exhausted afterwards.
+   */
+  private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
+    char text[] = sourceText.toCharArray();
+    CharArrayIterator chars = new CharArrayIterator();
+    chars.setText(text, 0, text.length);
+    iterator.setText(chars);
+
+    for (String expected : tokens) {
+      // step segment by segment until one counts as a word or the text runs out
+      int start = iterator.current();
+      int end = iterator.next();
+      while (end != BreakIterator.DONE && !isWord(text, start, end)) {
+        start = iterator.current();
+        end = iterator.next();
+      }
+      assertTrue(start != BreakIterator.DONE);
+      assertTrue(end != BreakIterator.DONE);
+      assertEquals(expected, new String(text, start, end - start));
+    }
+
+    assertTrue(iterator.next() == BreakIterator.DONE);
+  }
+  
+  /**
+   * Tells whether the segment {@code [start, end)} of {@code text} contains
+   * at least one letter or digit code point.
+   *
+   * @param text  characters being segmented
+   * @param start index of the first char of the segment
+   * @param end   index one past the last char of the segment
+   * @return true if any code point in the segment is a letter or digit
+   */
+  protected boolean isWord(char text[], int start, int end) {
+    int codepoint;
+    for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
+      // Read at the loop cursor i. Reading at start, as before, re-examined
+      // the first code point on every pass, so a segment whose first
+      // character was not a letter or digit was never recognised as a word.
+      codepoint = UTF16.charAt(text, 0, end, i);
+
+      if (UCharacter.isLetterOrDigit(codepoint)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+  
+  public void testBasicUsage() throws Exception { // plain Lao words, plus the empty string which must yield no segments
+    assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[]
{ "ກວ່າ", "ດອກ" });
+    assertBreaksTo(wordIterator, "ຜູ້​ເຂົ້າ",
new String[] { "ຜູ້", "ເຂົ້າ" });
+    assertBreaksTo(wordIterator, "", new String[] {});
+    assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[]
{ "ສະ", "ບາຍ", "ດີ" });
+  }
+  
+  public void testNumerics() throws Exception {
+    assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓"
});
+    assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] {
"໐໑໒໓.໕໖" });
+  }
+ 
+  public void testTextAndNumerics() throws Exception {
+    assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓",
new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓"
});
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java
(added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java
Mon May  3 13:20:09 2010
@@ -0,0 +1,101 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.lang.reflect.Method;
+
+import com.ibm.icu.text.RuleBasedBreakIterator;
+
+/**
+ * Command-line utility to convert RuleBasedBreakIterator (.rbbi) files into
+ * binary compiled form (.brk).
+ */
+public class RBBIRuleCompiler {
+  
+  /**
+   * Reads a rule file as UTF-8 and returns its text with every line that
+   * starts with "#" blanked out. Each input line, blanked or not,
+   * contributes one '\n' to the result.
+   *
+   * @param ruleFile rule source file to read
+   * @return the rule text with "#" lines emptied
+   * @throws IOException if the file cannot be opened or read
+   */
+  static String getRules(File ruleFile) throws IOException {
+    StringBuilder rules = new StringBuilder();
+    InputStream in = new FileInputStream(ruleFile);
+    try {
+      BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+      String line;
+      while ((line = cin.readLine()) != null) {
+        if (!line.startsWith("#")) {
+          rules.append(line);
+        }
+        rules.append('\n');
+      }
+    } finally {
+      // release the file handle even if reading fails part-way
+      in.close();
+    }
+    return rules.toString();
+  }
+  
+  /**
+   * Compiles every file in {@code srcDir} whose name ends in "rbbi" into a
+   * file of the same name with the "rbbi" suffix replaced by "brk" in
+   * {@code destDir}. Progress is reported on stderr. If a rule set is
+   * rejected, its message is printed and the JVM exits with status 1.
+   *
+   * @param srcDir  directory holding the rule sources
+   * @param destDir directory receiving the compiled output
+   * @throws Exception if {@code srcDir} cannot be listed, or if reading,
+   *         writing or the reflective compile call fails
+   */
+  static void compile(File srcDir, File destDir) throws Exception {
+    File files[] = srcDir.listFiles(new FilenameFilter() {
+      public boolean accept(File dir, String name) {
+        return name.endsWith("rbbi");
+      }});
+    if (files == null) throw new IOException("Path does not exist: " + srcDir);
+    for (int i = 0; i < files.length; i++) {
+      File file = files[i];
+      File outputFile = new File(destDir, 
+          file.getName().replaceAll("rbbi$", "brk"));
+      String rules = getRules(file);
+      System.err.print("Compiling " + file.getName() + " to "
+          + outputFile.getName() + ": ");
+      /*
+       * if there is a syntax error, compileRules() may succeed. the way to
+       * check is to try to instantiate from the string. additionally if the
+       * rules are invalid, you can get a useful syntax error.
+       */
+      try {
+        new RuleBasedBreakIterator(rules);
+      } catch (IllegalArgumentException e) {
+        /*
+         * do this intentionally, so you don't get a massive stack trace
+         * instead, get a useful syntax error!
+         */
+        System.err.println(e.getMessage());
+        System.exit(1);
+      }
+      FileOutputStream os = new FileOutputStream(outputFile);
+      try {
+        // Equivalent to RBBIRuleBuilder.compileRules(rules, os); called
+        // reflectively with setAccessible(true), presumably because the
+        // method is not publicly accessible.
+        Class<?> builderClass = Class.forName("com.ibm.icu.text.RBBIRuleBuilder");
+        Method method = builderClass.getDeclaredMethod("compileRules", String.class, OutputStream.class);
+        method.setAccessible(true);
+        method.invoke(null, rules, os);
+      } finally {
+        // close the output file even when the compile call throws
+        os.close();
+      }
+      System.err.println(outputFile.length() + " bytes.");
+    }
+  }
+  
+  /**
+   * Command-line entry point. Expects a source directory and a destination
+   * directory; exits with status 1 when fewer than two arguments are given
+   * and with status 0 after a successful run.
+   *
+   * @param args {@code <sourcedir> <destdir>}
+   * @throws Exception if compilation fails
+   */
+  public static void main(String args[]) throws Exception {
+    if (args.length < 2) {
+      // the usage text previously named a non-existent "RBBIRuleComputer"
+      System.err.println("Usage: RBBIRuleCompiler <sourcedir> <destdir>");
+      System.exit(1);
+    }
+    compile(new File(args[0]), new File(args[1]));
+    System.exit(0);
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message