lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r940447 [1/2] - in /lucene/dev/trunk/lucene/contrib: ./ icu/ icu/src/data/uax29/ icu/src/java/ icu/src/java/org/apache/lucene/analysis/icu/segmentation/ icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ icu/src/resources/org/apac...
Date Mon, 03 May 2010 13:20:10 GMT
Author: rmuir
Date: Mon May  3 13:20:09 2010
New Revision: 940447

URL: http://svn.apache.org/viewvc?rev=940447&view=rev
Log:
LUCENE-2414: Add ICUTokenizer, tailorable impl of Unicode Text Segmentation

Added:
    lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/
    lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi
    lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Khmer.rbbi
    lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Lao.rbbi
    lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/
    lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/tools/
    lucene/dev/trunk/lucene/contrib/icu/src/tools/java/
    lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/
    lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/
    lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/
    lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/
    lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/
    lucene/dev/trunk/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java   (with props)
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/lucene/contrib/icu/build.xml
    lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=940447&r1=940446&r2=940447&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Mon May  3 13:20:09 2010
@@ -140,6 +140,11 @@ New features
  * LUCENE-2298: Add analyzers/stempel, an algorithmic stemmer with support for
    the Polish language.  (Andrzej Bialecki via Robert Muir)
 
+ * LUCENE-2414: Add ICUTokenizer, a tailorable tokenizer that implements Unicode
+   Text Segmentation. This tokenizer is useful for documents or collections with
+   multiple languages.  The default configuration includes special support for
+   Thai, Lao, Myanmar, and Khmer.  (Robert Muir, Uwe Schindler)
+
 Build
 
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 

Modified: lucene/dev/trunk/lucene/contrib/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/build.xml?rev=940447&r1=940446&r2=940447&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/build.xml (original)
+++ lucene/dev/trunk/lucene/contrib/icu/build.xml Mon May  3 13:20:09 2010
@@ -43,7 +43,39 @@
   <target name="gennorm2">
   	<echo>Warning: only works on a big-endian platform!</echo>
     <exec executable="gennorm2" failonerror="true">
-      <arg line="-v -s ${gennorm2.src.dir} ${gennorm2.src.files} -o ${gennorm2.dst}"/>
+      <arg value="-v"/>
+      <arg value="-s"/>
+      <arg value="${gennorm2.src.dir}"/>
+      <arg value="${gennorm2.src.files}"/>
+      <arg value="-o"/>
+      <arg value="${gennorm2.dst}"/>
     </exec>
   </target>
+  
+  <property name="rbbi.src.dir" location="src/data/uax29"/>
+  <property name="rbbi.dst.dir" location="src/resources/org/apache/lucene/analysis/icu/segmentation"/>
+		
+  <target name="genrbbi" depends="compile-tools">
+    <mkdir dir="${rbbi.dst.dir}"/>
+    <java
+      classname="org.apache.lucene.analysis.icu.RBBIRuleCompiler"
+      dir="."
+      fork="true"
+      failonerror="true">
+      <classpath>
+      	<path refid="additional.dependencies"/>
+      	<pathelement location="${build.dir}/classes/tools"/>
+      </classpath>
+      <arg value="${rbbi.src.dir}"/>
+      <arg value="${rbbi.dst.dir}"/>
+    </java>
+  </target>
+			
+  <target name="compile-tools">
+    <compile
+      srcdir="src/tools/java"
+      destdir="${build.dir}/classes/tools">
+      <classpath refid="classpath"/>
+    </compile>
+  </target>
 </project>

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi Mon May  3 13:20:09 2010
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# This is an example of rule tailoring for Hebrew.
+# In this example the single-quote is added to the Extend category, and
+# the double-quote is added to the MidLetter category.
+#
+!!chain;
+$CR           = [\p{Word_Break = CR}];
+$LF           = [\p{Word_Break = LF}];
+$Newline      = [\p{Word_Break = Newline}];
+$Extend       = [\p{Word_Break = Extend}\u0027];
+$Format       = [\p{Word_Break = Format}];
+$ALetter      = [\p{Word_Break = ALetter}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidLetter    = [\p{Word_Break = MidLetter}\u0022];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$dictionary   = [:LineBreak = Complex_Context:];
+$Control        = [\p{Grapheme_Cluster_Break = Control}]; 
+$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]]; 
+                                                              
+$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+!!forward;
+
+$CR $LF;
+[^$CR $LF $Newline]? ($Extend |  $Format)+;
+$NumericEx {100};
+$ALetterEx {200};    
+$ALetterEx $ALetterEx {200};
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+$NumericEx $NumericEx {100};
+$ALetterEx $NumericEx {200};
+$NumericEx $ALetterEx {200};
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$ALetterEx      $ExtendNumLetEx {200};    
+$NumericEx      $ExtendNumLetEx {100};      
+$ExtendNumLetEx $ExtendNumLetEx {200};    
+$ExtendNumLetEx $ALetterEx  {200};    
+$ExtendNumLetEx $NumericEx  {100};    

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Khmer.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Khmer.rbbi?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Khmer.rbbi (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Khmer.rbbi Mon May  3 13:20:09 2010
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# 
+# Parses Khmer text, with orthographic syllable as token.
+#
+# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
+#
+# B = base character (consonant, independent vowel, etc)
+$KhmerBase = [\u1780-\u17B3];
+# R = robat
+$KhmerRobat = [\u17CC];
+# C = consonant shifter
+$KhmerShifter = [\u17C9\u17CA];
+# S = subscript consonant or independent vowel sign
+$KhmerSub = ([\u17D2] $KhmerBase);
+# V = dependent vowel sign
+$KhmerVowel = [\u17B4-\u17C5];
+# Z = zero-width joiner or non-joiner
+$KhmerZWC = [\u200C\u200D];
+# O = any other sign
+$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; 
+
+$WordJoin = [:Line_Break=Word_Joiner:];
+
+$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
+
+$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
+
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+!!forward;
+$KhmerJoinedSyllableEx {200};
+
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)*  {100};

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Lao.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Lao.rbbi?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Lao.rbbi (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Lao.rbbi Mon May  3 13:20:09 2010
@@ -0,0 +1,192 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Parses Lao text, with syllable as token.
+#
+# The definition of Lao syllable is based on:
+#
+#   Syllabification of Lao Script for Line Breaking
+#   Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 
+#     Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
+#   http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
+#	http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
+#
+# NOTE:
+# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
+# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
+#
+# Syllable structure, where X is the nuclear consonant:
+#
+#           +----+
+#           | X5 |
+#           +----+
+#           | X4 |
+# +----+----+----+----+----+----+----+-----+
+# | X0 | X1 | X  | X6 | X7 | X8 | X9 | X10 |
+# +----+----+----+----+----+----+----+-----+
+#           | X2 |
+#           +----+
+#           | X3 |
+#           +----+
+#
+# X0 represents a vowel which occurs before the nuclear consonant. 
+# It can always define the beginning of syllable.
+$X0 = [\u0EC0-\u0EC4];
+# X1 is a combination consonant which comes before the nuclear consonant, 
+# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
+$X1 = [\u0EAB];
+# X represents the nuclear consonant.
+$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
+# X2 is a combination consonant which comes after the nuclear consonant, 
+# which is placed under or next to the nuclear consonant.
+$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
+# X3 represents a vowel which occurs under the nuclear consonant.
+$X3 = [\u0EB8\u0EB9];
+# X4 represents a vowel which occurs above the nuclear consonant. 
+$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
+# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
+$X5 = [\u0EC8-\u0ECB];
+# X6 represents a consonant vowel, which occurs after the nuclear consonant. 
+# It functions when the syllable doesn’t have any vowels, and it always exists with X8.
+$X6 = [\u0EA7\u0EAD\u0EBD];
+# X7 represents a final vowel. 
+# However X7_1 always represents the end of syllable and it never exists with tone mark.
+$X7 = [\u0EB0\u0EB2\u0EB3];
+# X8 represents an alternate consonant.
+$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
+# X9 represents alternate consonants used to pronounce foreign terms; they always exist with X10_3.
+$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
+# X10 represents a sign mark. 
+# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
+$X10 = [\u0EAF\u0EC6\u0ECC];
+
+# Section 1
+$X0_1 = [\u0EC0];
+$X4_1_2 = [\u0EB4\u0EB5];
+$X4_3_4 = [\u0EB6\u0EB7];
+$X4_6 = [\u0EBB];
+$X4_7 = [\u0EB1];
+$X6_2 = [\u0EAD];
+$X6_3 = [\u0EBD];
+$X7_1 = [\u0EB0];
+$X7_2 = [\u0EB2];
+$X10_1 = [\u0EAF];
+$X10_2 = [\u0EC6];
+$X10_3 = [\u0ECC];
+
+$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
+$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
+
+# Section 2
+$X0_2 = [\u0EC1];
+
+$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
+$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; 
+
+$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
+
+# Section 3
+$X0_3 = [\u0EC2];
+$X8_3 = [\u0E8D];
+$X8_8 = [\u0EA7];
+
+$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
+$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
+
+$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
+
+# Section 4
+$X0_4 = [\u0EC4];
+$X6_1 = [\u0EA7];
+
+$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 5
+$X0_5 = [\u0EC3];
+
+$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 6
+$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 7
+$X4_1_4 = [\u0EB4-\u0EB7];
+
+$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 8
+$X4_5 = [\u0ECD];
+
+$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 9
+
+$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
+
+$Rule9 = ($Rule9_1 | $Rule9_2);
+
+# Section 10
+$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 11
+$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 12
+$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
+
+# Section 13
+$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+# Section 14
+$X7_3 = [\u0EB3];
+
+$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
+
+$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
+
+$WordJoin = [:Line_Break=Word_Joiner:];
+
+$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
+
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+!!forward;
+
+$LaoJoinedSyllableEx {200};
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)*  {100};

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi Mon May  3 13:20:09 2010
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# 
+# Parses Myanmar text, with syllable as token. 
+#
+
+$Cons = [[:Other_Letter:]&[:Myanmar:]];
+$Virama = [\u1039];
+$Asat = [\u103A];
+
+$WordJoin = [:Line_Break=Word_Joiner:]; 
+
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+$ConsEx = $Cons ($Extend | $Format)*;
+$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
+$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
+$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
+
+!!forward;
+$MyanmarJoinedSyllableEx {200};
+
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)*  {100};

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java Mon May  3 13:20:09 2010
@@ -0,0 +1,171 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.DictionaryBasedBreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * Contain all the issues surrounding BreakIterators in ICU in one place.
+ * Basically this boils down to the fact that they aren't very friendly to any
+ * sort of OO design.
+ * <p>
+ * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
+ * BreakIterator from RuleBasedBreakIterator
+ * <p>
+ * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
+ * doesn't actually behave as a subclass: it always returns 0 for
+ * getRuleStatus(): 
+ * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
+ * tags
+ * @lucene.experimental
+ */
+abstract class BreakIteratorWrapper {
+  protected final CharArrayIterator textIterator = new CharArrayIterator();
+  protected char text[]; // buffer given to the last setText(char[], int, int)
+  protected int start;   // offset of the examined region within text
+  protected int length;  // number of chars in the examined region
+
+  abstract int next();
+  abstract int current();
+  abstract int getRuleStatus();
+  abstract void setText(CharacterIterator text);
+  /** Records the text region, then hands it to the wrapped iterator via textIterator. */
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.length = length;
+    textIterator.setText(text, start, length);
+    setText(textIterator);
+  }
+
+  /**
+   * Wraps a BreakIterator. A RuleBasedBreakIterator that is not a
+   * DictionaryBasedBreakIterator gets an RBBIWrapper, so its own rule status
+   * can be used for token type. Everything else (a DictionaryBasedBreakIterator,
+   * or a BreakIterator with no rulestatus method) gets a BIWrapper, which
+   * calculates a rule status itself.
+   */
+  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
+    if (breakIterator instanceof RuleBasedBreakIterator
+        && !(breakIterator instanceof DictionaryBasedBreakIterator))
+      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
+    else
+      return new BIWrapper(breakIterator);
+  }
+
+  /**
+   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
+   * a DictionaryBasedBreakIterator) behaves correctly, so every call delegates.
+   */
+  static final class RBBIWrapper extends BreakIteratorWrapper {
+    private final RuleBasedBreakIterator rbbi;
+
+    RBBIWrapper(RuleBasedBreakIterator rbbi) {
+      this.rbbi = rbbi;
+    }
+
+    @Override
+    int current() {
+      return rbbi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return rbbi.getRuleStatus();
+    }
+
+    @Override
+    int next() {
+      return rbbi.next();
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      rbbi.setText(text);
+    }
+  }
+
+  /**
+   * Generic BreakIterator wrapper: Either the rulestatus method is not
+   * available or always returns 0. Calculate a rulestatus here so it behaves
+   * like RuleBasedBreakIterator: WORD_NUMBER or WORD_LETTER according to the
+   * first digit or letter in the segment, WORD_NONE if it contains neither.
+   * Note: This is slower than RuleBasedBreakIterator.
+   */
+  static final class BIWrapper extends BreakIteratorWrapper {
+    private final BreakIterator bi;
+    private int status;
+
+    BIWrapper(BreakIterator bi) {
+      this.bi = bi;
+    }
+
+    @Override
+    int current() {
+      return bi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return status;
+    }
+
+    @Override
+    int next() {
+      int current = bi.current();
+      int next = bi.next();
+      status = calcStatus(current, next);
+      return next;
+    }
+    /** Calculates the rule status of the segment between two break positions. */
+    private int calcStatus(int current, int next) {
+      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
+        return RuleBasedBreakIterator.WORD_NONE;
+      // translate the break positions into indexes of the text array
+      int begin = start + current;
+      int end = start + next;
+      // scan code point by code point until a digit or a letter decides the type
+      int codepoint;
+      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
+        codepoint = UTF16.charAt(text, 0, end, i);
+
+        if (UCharacter.isDigit(codepoint))
+          return RuleBasedBreakIterator.WORD_NUMBER;
+        else if (UCharacter.isLetter(codepoint)) {
+          // TODO: try to separately specify ideographic, kana? 
+          // [currently all bundled as letter for this case]
+          return RuleBasedBreakIterator.WORD_LETTER;
+        }
+      }
+
+      return RuleBasedBreakIterator.WORD_NONE;
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      bi.setText(text);
+      status = RuleBasedBreakIterator.WORD_NONE;
+    }
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java Mon May  3 13:20:09 2010
@@ -0,0 +1,118 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+/**
+ * Adapts a region of a char[] to the CharacterIterator interface so that it
+ * can be processed by a BreakIterator.
+ * <p>
+ * Indexes exposed through the CharacterIterator methods are relative to the
+ * start of the region: the begin index is always 0 and the end index is the
+ * region length.
+ * @lucene.experimental
+ */
+final class CharArrayIterator implements CharacterIterator {
+  private char array[];
+  private int start;
+  private int index;
+  private int length;
+  private int limit;
+
+  /** Returns the backing buffer itself (not a copy). */
+  public char [] getText() {
+    return array;
+  }
+  
+  /** Returns the offset into the buffer at which the region begins. */
+  public int getStart() {
+    return start;
+  }
+  
+  /** Returns the number of chars in the region. */
+  public int getLength() {
+    return length;
+  }
+  
+  /**
+   * Point this iterator at a new region of text, positioned at its first char.
+   * 
+   * @param array text buffer to examine
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  void setText(final char array[], int start, int length) {
+    this.array = array;
+    this.start = start;
+    this.length = length;
+    this.limit = start + length;
+    this.index = start;
+  }
+
+  public char current() {
+    if (index == limit)
+      return DONE;
+    return array[index];
+  }
+
+  public char first() {
+    index = start;
+    return current();
+  }
+
+  public int getBeginIndex() {
+    return 0;
+  }
+
+  public int getEndIndex() {
+    return length;
+  }
+
+  public int getIndex() {
+    return index - start;
+  }
+
+  public char last() {
+    // an empty region has no last char: stay at the limit so current() is DONE
+    index = limit;
+    if (limit != start)
+      index--;
+    return current();
+  }
+
+  public char next() {
+    index++;
+    if (index < limit)
+      return current();
+    // never move past the limit
+    index = limit;
+    return DONE;
+  }
+
+  public char previous() {
+    index--;
+    if (index >= start)
+      return current();
+    // never move before the start of the region
+    index = start;
+    return DONE;
+  }
+
+  public char setIndex(int position) {
+    final boolean inRange = position >= getBeginIndex() && position <= getEndIndex();
+    if (!inRange)
+      throw new IllegalArgumentException("Illegal Position: " + position);
+    index = start + position;
+    return current();
+  }
+
+  /** Returns a new iterator over the same buffer and region, at the same position. */
+  @Override
+  public Object clone() {
+    final CharArrayIterator copy = new CharArrayIterator();
+    copy.setText(array, start, length);
+    copy.index = index;
+    return copy;
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java Mon May  3 13:20:09 2010
@@ -0,0 +1,126 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+
+/**
+ * An internal BreakIterator for multilingual text, following recommendations
+ * from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
+ * <p>
+ * See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
+ * design.
+ * <p>
+ * Text is first divided into script boundaries. The processing is then
+ * delegated to the appropriate break iterator for that specific script.
+ * <p>
+ * This break iterator also allows you to retrieve the ISO 15924 script code
+ * associated with a piece of text.
+ * <p>
+ * See also UAX #29, UTR #24
+ * @lucene.experimental
+ */
+final class CompositeBreakIterator {
+  private final ICUTokenizerConfig config;
+  private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
+
+  private BreakIteratorWrapper rbbi;
+  private final ScriptIterator scriptIterator = new ScriptIterator();
+
+  private char text[];
+
+  CompositeBreakIterator(ICUTokenizerConfig config) {
+    this.config = config;
+  }
+
+  /**
+   * Retrieve the next break position. If the RBBI range is exhausted within the
+   * script boundary, examine the next script boundary.
+   * 
+   * @return the next break position or BreakIterator.DONE
+   */
+  int next() {
+    int next = rbbi.next();
+    while (next == BreakIterator.DONE && scriptIterator.next()) {
+      rbbi = getBreakIterator(scriptIterator.getScriptCode());
+      rbbi.setText(text, scriptIterator.getScriptStart(), 
+          scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
+      next = rbbi.next();
+    }
+    return (next == BreakIterator.DONE) ? BreakIterator.DONE : next
+        + scriptIterator.getScriptStart();
+  }
+
+  /**
+   * Retrieve the current break position.
+   * 
+   * @return the current break position or BreakIterator.DONE
+   */
+  int current() {
+    final int current = rbbi.current();
+    return (current == BreakIterator.DONE) ? BreakIterator.DONE : current
+        + scriptIterator.getScriptStart();
+  }
+
+  /**
+   * Retrieve the rule status code (token type) from the underlying break
+   * iterator
+   * 
+   * @return rule status code (see RuleBasedBreakIterator constants)
+   */
+  int getRuleStatus() {
+    return rbbi.getRuleStatus();
+  }
+
+  /**
+   * Retrieve the UScript script code for the current token. This code can be
+   * decoded with UScript into a name or ISO 15924 code.
+   * 
+   * @return UScript script code for the current token.
+   */
+  int getScriptCode() {
+    return scriptIterator.getScriptCode();
+  }
+
+  /**
+   * Set a new region of text to be examined by this iterator
+   * 
+   * @param text buffer of text
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  void setText(final char text[], int start, int length) {
+    this.text = text;
+    scriptIterator.setText(text, start, length);
+    if (scriptIterator.next()) {
+      rbbi = getBreakIterator(scriptIterator.getScriptCode());
+      rbbi.setText(text, scriptIterator.getScriptStart(), 
+          scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
+    } else {
+      rbbi = getBreakIterator(UScript.COMMON);
+      rbbi.setText(text, 0, 0);
+    }
+  }
+  
+  private BreakIteratorWrapper getBreakIterator(int scriptCode) {
+    if (wordBreakers[scriptCode] == null)
+      wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
+    return wordBreakers[scriptCode];
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java Mon May  3 13:20:09 2010
@@ -0,0 +1,112 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Default {@link ICUTokenizerConfig} that is generally applicable
+ * to many languages.
+ * <p>
+ * Generally tokenizes Unicode text according to UAX#29 
+ * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), 
+ * but with the following tailorings:
+ * <ul>
+ *   <li>Thai text is broken into words with a 
+ *   {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
+ *   <li>Lao, Myanmar, and Khmer text is broken into syllables
+ *   based on custom BreakIterator rules.
+ *   <li>Hebrew text has custom tailorings to handle special cases
+ *   involving punctuation.
+ * </ul>
+ * @lucene.experimental
+ */
+public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
+  /** Token type for words containing ideographic characters */
+  public static final String WORD_IDEO = "<IDEO>";
+  /** Token type for words containing Japanese kana */
+  public static final String WORD_KANA = "<KANA>";
+  /** Token type for words that contain letters */
+  public static final String WORD_LETTER = "<WORD>";
+  /** Token type for words that appear to be numbers */
+  public static final String WORD_NUMBER = "<NUM>";
+  
+  /*
+   * the default breakiterators in use. these can be expensive to
+   * instantiate, cheap to clone.
+   */  
+  private static final BreakIterator rootBreakIterator = 
+    BreakIterator.getWordInstance(ULocale.ROOT);
+  private static final BreakIterator thaiBreakIterator = 
+    BreakIterator.getWordInstance(new ULocale("th_TH"));
+  private static final BreakIterator hebrewBreakIterator = 
+    readBreakIterator("Hebrew.brk");
+  private static final BreakIterator khmerBreakIterator = 
+    readBreakIterator("Khmer.brk");
+  private static final BreakIterator laoBreakIterator = 
+    new LaoBreakIterator(readBreakIterator("Lao.brk"));
+  private static final BreakIterator myanmarBreakIterator = 
+    readBreakIterator("Myanmar.brk");
+  
+  /**
+   * Returns a fresh clone of the shared break iterator for the given script;
+   * scripts without a tailoring get a clone of the root word iterator.
+   *
+   * @param script UScript script code
+   */
+  @Override
+  public BreakIterator getBreakIterator(int script) {
+    switch(script) {
+      case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
+      case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
+      case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
+      case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
+      case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
+      default: return (BreakIterator)rootBreakIterator.clone();
+    }
+  }
+
+  /**
+   * Maps a break iterator rule status to one of the token type strings
+   * declared by this class. The script is not consulted.
+   *
+   * @param script UScript script code (unused by this implementation)
+   * @param ruleStatus rule status reported by the break iterator
+   * @return the matching token type, or "&lt;OTHER&gt;" for any other status
+   */
+  @Override
+  public String getType(int script, int ruleStatus) {
+    switch (ruleStatus) {
+      case RuleBasedBreakIterator.WORD_IDEO:
+        return WORD_IDEO;
+      case RuleBasedBreakIterator.WORD_KANA:
+        return WORD_KANA;
+      case RuleBasedBreakIterator.WORD_LETTER:
+        return WORD_LETTER;
+      case RuleBasedBreakIterator.WORD_NUMBER:
+        return WORD_NUMBER;
+      default: /* some other custom code */
+        return "<OTHER>";
+    }
+  }
+
+  /**
+   * Loads a break iterator from a compiled rules resource located relative
+   * to this class.
+   *
+   * @param filename resource name of the compiled rules
+   * @return the break iterator built from the compiled rules
+   * @throws RuntimeException if the resource does not exist or cannot be read
+   */
+  private static RuntimeException missingResource(String filename) {
+    return new RuntimeException("Break iterator rules not found: " + filename);
+  }
+
+  private static RuleBasedBreakIterator readBreakIterator(String filename) {
+    InputStream is = 
+      DefaultICUTokenizerConfig.class.getResourceAsStream(filename);
+    // fail with a descriptive message instead of a NullPointerException
+    if (is == null)
+      throw missingResource(filename);
+    try {
+      try {
+        return RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
+      } finally {
+        // always release the stream, even when loading the rules fails
+        is.close();
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java Mon May  3 13:20:09 2010
@@ -0,0 +1,196 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+
+/**
+ * Breaks text into words according to UAX #29: Unicode Text Segmentation
+ * (http://www.unicode.org/reports/tr29/)
+ * <p>
+ * Words are broken across script boundaries, then segmented according to
+ * the BreakIterator and typing provided by the {@link ICUTokenizerConfig}
+ * </p>
+ * @see ICUTokenizerConfig
+ * @lucene.experimental
+ */
+public final class ICUTokenizer extends Tokenizer {
+  private static final int IOBUFFER = 4096;
+  private final char buffer[] = new char[IOBUFFER];
+  /** true length of text in the buffer */
+  private int length = 0; 
+  /** length in buffer that can be evaluated safely, up to a safe end point */
+  private int usableLength = 0; 
+  /** accumulated offset of previous buffers for this reader, for offsetAtt */
+  private int offset = 0; 
+
+  private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */
+  private final ICUTokenizerConfig config;
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class);
+
+  /**
+   * Construct a new ICUTokenizer that breaks text into words from the given
+   * Reader.
+   * <p>
+   * The default script-specific handling is used.
+   * 
+   * @param input Reader containing text to tokenize.
+   * @see DefaultICUTokenizerConfig
+   */
+  public ICUTokenizer(Reader input) {
+    this(input, new DefaultICUTokenizerConfig());
+  }
+
+  /**
+   * Construct a new ICUTokenizer that breaks text into words from the given
+   * Reader, using a tailored BreakIterator configuration.
+   *
+   * @param input Reader containing text to tokenize.
+   * @param config Tailored BreakIterator configuration 
+   */
+  public ICUTokenizer(Reader input, ICUTokenizerConfig config) {
+    super(input);
+    this.config = config;
+    breaker = new CompositeBreakIterator(config);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    clearAttributes();
+    if (length == 0)
+      refill();
+    while (!incrementTokenBuffer()) {
+      refill();
+      if (length <= 0) // no more bytes to read;
+        return false;
+    }
+    return true;
+  }
+  
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    breaker.setText(buffer, 0, 0);
+    length = usableLength = offset = 0;
+  }
+
+  @Override
+  public void reset(Reader input) throws IOException {
+    super.reset(input);
+    reset();
+  }
+  
+  @Override
+  public void end() throws IOException {
+    final int finalOffset = (length < 0) ? offset : offset + length;
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }  
+
+  /*
+   * This tokenizes text based upon the longest matching rule, and because of 
+   * this, isn't friendly to a Reader.
+   * 
+   * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
+   * text, the last unambiguous break point is found (in this implementation:
+   * white space character) Any remaining characters represent possible partial
+   * words, so are appended to the front of the next chunk.
+   * 
+   * There is the possibility that there are no unambiguous break points within
+   * an entire 4kB chunk of text (binary data). So there is a maximum word limit
+   * of 4kB since it will not try to grow the buffer in this case.
+   */
+
+  /**
+   * Returns the last unambiguous break position in the text.
+   * 
+   * @return position of character, or -1 if one does not exist
+   */
+  private int findSafeEnd() {
+    for (int i = length - 1; i >= 0; i--)
+      if (UCharacter.isWhitespace(buffer[i]))
+        return i + 1;
+    return -1;
+  }
+
+  /**
+   * Refill the buffer, accumulating the offset and setting usableLength to the
+   * last unambiguous break position
+   * 
+   * @throws IOException
+   */
+  private void refill() throws IOException {
+    offset += usableLength;
+    int leftover = length - usableLength;
+    System.arraycopy(buffer, usableLength, buffer, 0, leftover);
+    int requested = buffer.length - leftover;
+    int returned = input.read(buffer, leftover, requested);
+    length = returned < 0 ? leftover : returned + leftover;
+    if (returned < requested) /* reader has been emptied, process the rest */
+      usableLength = length;
+    else { /* still more data to be read, find a safe-stopping place */
+      usableLength = findSafeEnd();
+      if (usableLength < 0)
+        usableLength = length; /*
+                                * more than IOBUFFER of text without space,
+                                * gonna possibly truncate tokens
+                                */
+    }
+
+    breaker.setText(buffer, 0, Math.max(0, usableLength));
+  }
+
+  /*
+   * return true if there is a token from the buffer, or null if it is
+   * exhausted.
+   */
+  private boolean incrementTokenBuffer() {
+    int start = breaker.current();
+    if (start == BreakIterator.DONE)
+      return false; // BreakIterator exhausted
+
+    // find the next set of boundaries, skipping over non-tokens (rule status 0)
+    int end = breaker.next();
+    while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
+      start = end;
+      end = breaker.next();
+    }
+
+    if (start == BreakIterator.DONE)
+      return false; // BreakIterator exhausted
+
+    termAtt.copyBuffer(buffer, start, end - start);
+    offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
+    typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus()));
+    scriptAtt.setCode(breaker.getScriptCode());
+
+    return true;
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java Mon May  3 13:20:09 2010
@@ -0,0 +1,33 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.ibm.icu.text.BreakIterator;
+
+/**
+ * Class that allows for tailored Unicode Text Segmentation on
+ * a per-writing system basis.
+ * <p>
+ * An implementation supplies the BreakIterator used to segment text of a
+ * given script, and the token type string reported for a given script and
+ * rule status.
+ * @lucene.experimental
+ */
+public abstract class ICUTokenizerConfig {
+  /**
+   * Return a breakiterator capable of processing a given script.
+   *
+   * @param script script code of the text to be processed
+   *        (presumably a com.ibm.icu.lang.UScript code -- confirm against callers)
+   * @return a BreakIterator for text in that script
+   */
+  public abstract BreakIterator getBreakIterator(int script);
+  /**
+   * Return a token type value for a given script and BreakIterator
+   * rule status.
+   *
+   * @param script script code of the token
+   * @param ruleStatus rule status reported by the BreakIterator for the token
+   * @return the token type string
+   */
+  public abstract String getType(int script, int ruleStatus);
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java Mon May  3 13:20:09 2010
@@ -0,0 +1,226 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Syllable iterator for Lao text.
+ * <p>
+ * This breaks Lao text into syllables according to:
+ * <i>Syllabification of Lao Script for Line Breaking</i>
+ * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 
+ * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
+ * <ul>
+ *  <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
+ *  <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
+ * </ul>
+ * <p>
+ * Most work is accomplished with RBBI rules, however some additional special logic is needed
+ * that cannot be coded in a grammar, and this is implemented here.
+ * <p>
+ * For example, what appears to be a final consonant might instead be part of the next syllable.
+ * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
+ * <p>
+ * Take for instance the text ກວ່າດອກ
+ * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
+ * What LaoBreakIterator does, according to the paper:
+ * <ol>
+ *  <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
+ *  <li>verify the modified previous syllable (ກວ່າ ) is still legal.
+ *  <li>verify the modified current syllable (ດອກ) is now legal.
+ *  <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
+ * </ol>
+ * <p>
+ * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
+ * This is the issue of combining marks being in the wrong order (typos).
+ * <p>
+ * Only forward iteration is supported: following(int), last() and previous() throw
+ * UnsupportedOperationException, and the text must be supplied as a CharArrayIterator
+ * (or as a String, which is wrapped in one).
+ * @lucene.experimental
+ */
+public class LaoBreakIterator extends BreakIterator {
+  /** rule-based iterator doing the primary syllable matching; it iterates over 'working' */
+  RuleBasedBreakIterator rules;
+  /** the text region most recently supplied through setText */
+  CharArrayIterator text;
+  
+  /** window onto the text handed to 'rules'; moved forward after a successful push-back */
+  CharArrayIterator working = new CharArrayIterator();
+  /** offset of the working window within the text region; added to positions reported by 'rules' */
+  int workingOffset = 0;
+  
+  /** scratch window used by verifyPushBack */
+  CharArrayIterator verifyText = new CharArrayIterator();
+  /** second copy of the rules, used only to validate candidate syllables in verifyPushBack */
+  RuleBasedBreakIterator verify;
+  
+  /** frozen set built from the pattern [:Lao:] */
+  private static final UnicodeSet laoSet;
+  static {
+    laoSet = new UnicodeSet("[:Lao:]");
+    laoSet.compact();
+    laoSet.freeze();
+  }
+  
+  /**
+   * Creates a Lao syllable iterator. The supplied rules are cloned (twice: once for
+   * iteration, once for verification), so the argument itself is not retained.
+   *
+   * @param rules rule-based break iterator holding the Lao syllable rules
+   */
+  public LaoBreakIterator(RuleBasedBreakIterator rules) {
+    this.rules = (RuleBasedBreakIterator) rules.clone();
+    this.verify = (RuleBasedBreakIterator) rules.clone();
+  }
+
+  /**
+   * Returns the current position of the rules iterator, shifted by the working
+   * offset, or BreakIterator.DONE.
+   */
+  @Override
+  public int current() {
+    int current = rules.current();
+    return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
+  }
+
+  /**
+   * Resets the working window to the whole text region and returns the first
+   * boundary reported by the rules.
+   */
+  @Override
+  public int first() {
+    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+    rules.setText(working);
+    workingOffset = 0;
+    int first = rules.first();
+    return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
+  }
+
+  /** Not supported; always throws UnsupportedOperationException. */
+  @Override
+  public int following(int offset) {
+    throw new UnsupportedOperationException();
+  }
+
+  /** Returns the CharArrayIterator most recently supplied through setText. */
+  @Override
+  public CharacterIterator getText() {
+    return text;
+  }
+
+  /** Not supported; always throws UnsupportedOperationException. */
+  @Override
+  public int last() {
+    throw new UnsupportedOperationException();
+  }
+  
+  /**
+   * Returns the next syllable boundary.
+   * <p>
+   * After the rules report a boundary, one further boundary is looked ahead. If the
+   * looked-ahead segment has rule status 0, the character read from the working window
+   * is in the Lao set, and verifyPushBack succeeds, the boundary is moved back by one
+   * char and the working window is moved to begin at that new boundary. Otherwise the
+   * lookahead is undone and the boundary from the rules is returned unchanged.
+   *
+   * @return the next boundary, or BreakIterator.DONE
+   */
+  @Override
+  public int next() {
+    int current = current();
+    int next = rules.next();
+    if (next == BreakIterator.DONE)
+      return next;
+    else
+      next += workingOffset;
+    
+    // NOTE(review): presumably the char at the boundary just found, i.e. wherever
+    // 'rules' left the shared 'working' iterator -- confirm
+    char c = working.current();
+    int following = rules.next(); // lookahead
+    if (following != BreakIterator.DONE) {
+      following += workingOffset;
+      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
+        // push the last char of the previous syllable onto the following one:
+        // move the working window so that it begins at the new boundary
+        workingOffset = next - 1;
+        working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
+        return next - 1;
+      }
+    rules.previous(); // undo the lookahead
+    }
+    
+    return next;
+  }
+
+  /**
+   * Advances n boundaries by calling next() n times.
+   *
+   * @param n number of boundaries to advance; must not be negative
+   * @return the result of the last next() call, or current() when n is 0
+   * @throws UnsupportedOperationException if n is negative
+   */
+  @Override
+  public int next(int n) {
+    if (n < 0)
+      throw new UnsupportedOperationException("Backwards traversal is unsupported");
+
+    int result = current();
+    while (n > 0) {
+        result = next();
+        --n;
+    }
+    return result;
+  }
+
+  /** Not supported; always throws UnsupportedOperationException. */
+  @Override
+  public int previous() {
+    throw new UnsupportedOperationException("Backwards traversal is unsupported");
+  }
+
+  /**
+   * Sets the text to be segmented.
+   * <p>
+   * Note that the chars of the region are reordered in place, in the iterator's
+   * backing array, by ccReorder before segmentation begins.
+   *
+   * @param text the text; must be a CharArrayIterator
+   * @throws UnsupportedOperationException if text is not a CharArrayIterator
+   */
+  @Override
+  public void setText(CharacterIterator text) {
+    if (!(text instanceof CharArrayIterator))
+      throw new UnsupportedOperationException("unsupported CharacterIterator");
+    this.text = (CharArrayIterator) text;
+    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
+    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+    rules.setText(working);
+    workingOffset = 0;
+  }
+  
+  /** Copies the string into a new char[] wrapped in a CharArrayIterator and sets that as the text. */
+  @Override
+  public void setText(String newText) {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText(newText.toCharArray(), 0, newText.length());
+    setText(ci);
+  }
+  
+  /**
+   * Checks whether the last char of the syllable [current, next) may be moved onto
+   * the text that follows it.
+   *
+   * @param current start of the syllable, relative to the text region
+   * @param next end of the syllable, relative to the text region
+   * @return true if both the shortened syllable and the text starting at the moved
+   *         char are matched by the verification rules with a non-zero rule status
+   */
+  private boolean verifyPushBack(int current, int next) {
+    int shortenedSyllable = next - current - 1;
+
+    // the syllable without its last char must match as exactly one segment
+    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
+    verify.setText(verifyText);
+    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
+      return false;
+    
+
+    // the text beginning at the moved char must begin with a matching segment
+    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
+    verify.setText(verifyText);
+
+    return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
+  }
+
+  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
+  /**
+   * Reorders chars of the region in place: in repeated passes, a char with a non-zero
+   * combining class that is lower than the combining class tracked from the preceding
+   * chars is swapped with its predecessor, until a pass makes no swap.
+   *
+   * @param text buffer to reorder (modified in place)
+   * @param start offset of the region within the buffer
+   * @param length length of the region
+   */
+  private void ccReorder(char[] text, int start, int length) {
+    boolean reordered;
+    do {
+      int prevCC = 0;
+      reordered = false;
+      for (int i = start; i < start + length; i++) {
+        final char c = text[i];
+        final int cc = UCharacter.getCombiningClass(c);
+        if (cc > 0 && cc < prevCC) {
+          // swap
+          text[i] = text[i - 1];
+          text[i - 1] = c;
+          reordered = true;
+          // prevCC is left unchanged: the char now at position i is the one it describes
+        } else {
+          prevCC = cc;
+        }
+      }
+
+    } while (reordered == true);
+  }
+  
+  /**
+   * Clone method.  Creates another LaoBreakIterator with the same behavior 
+   * and current state as this one.
+   * @return The clone.
+   */
+  @Override
+  public Object clone() {
+    LaoBreakIterator other = (LaoBreakIterator) super.clone();
+    // give the clone its own copies of the iterators
+    other.rules = (RuleBasedBreakIterator) rules.clone();
+    other.verify = (RuleBasedBreakIterator) verify.clone();
+    if (text != null)
+      other.text = (CharArrayIterator) text.clone();
+    if (working != null)
+      other.working = (CharArrayIterator) working.clone();
+    if (verifyText != null)
+      other.verifyText = (CharArrayIterator) verifyText.clone();
+    return other;
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java Mon May  3 13:20:09 2010
@@ -0,0 +1,170 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/** 
+ * Copyright (C) 1999-2010, International Business Machines
+ * Corporation and others.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy 
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights 
+ * to use, copy, modify, merge, publish, distribute, and/or sell copies of the 
+ * Software, and to permit persons to whom the Software is furnished to do so, 
+ * provided that the above copyright notice(s) and this permission notice appear 
+ * in all copies of the Software and that both the above copyright notice(s) and
+ * this permission notice appear in supporting documentation.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE 
+ * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR 
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 
+ * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall not 
+ * be used in advertising or otherwise to promote the sale, use or other 
+ * dealings in this Software without prior written authorization of the 
+ * copyright holder.
+ */
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * Walks a region of text and reports its ISO 15924 script runs one at a time.
+ * <p>
+ * Script boundaries cannot be read straight off the Unicode block or the
+ * Script property: some characters are 'common' to several scripts, while
+ * others 'inherit' the script of the text around them.
+ * <p>
+ * The approach resembles ICU's (internal-only) UScriptRun, except that:
+ * <ul>
+ *  <li>Paired punctuation is not matched. Tokenization does not need it, and
+ *  it is also quite expensive.
+ *  <li>Following the recommendations of UTR #24, a non-spacing mark takes the
+ *  script of its base character.
+ * </ul>
+ * @lucene.experimental
+ */
+final class ScriptIterator {
+  private char[] text;
+  private int start;
+  private int limit;
+  private int index;
+
+  /** bounds and script of the run most recently produced by next() */
+  private int scriptStart;
+  private int scriptLimit;
+  private int scriptCode;
+
+  /**
+   * Where the current script run begins.
+   *
+   * @return offset of the first character of the run
+   */
+  int getScriptStart() {
+    return scriptStart;
+  }
+
+  /**
+   * Where the current script run ends (exclusive).
+   *
+   * @return offset of the first character following the run
+   */
+  int getScriptLimit() {
+    return scriptLimit;
+  }
+
+  /**
+   * Which script the current run belongs to.
+   *
+   * @return UScript code of the current run
+   */
+  int getScriptCode() {
+    return scriptCode;
+  }
+
+  /**
+   * Advances to the following script run, if there is one.
+   *
+   * @return false once the whole region has been consumed, true otherwise
+   */
+  boolean next() {
+    if (scriptLimit >= limit) {
+      return false;
+    }
+
+    // the new run starts where the previous one stopped, initially as Common
+    scriptStart = scriptLimit;
+    scriptCode = UScript.COMMON;
+
+    while (index < limit) {
+      final int codepoint = UTF16.charAt(text, start, limit, index - start);
+      final int script = getScript(codepoint);
+
+      /*
+       * UTR #24 asks implementations never to place a script boundary between
+       * a non-spacing mark and its base character, so such a mark always
+       * joins the current run regardless of its own script value.
+       */
+      final boolean joinsRun = isSameScript(scriptCode, script)
+          || UCharacter.getType(codepoint) == UCharacter.NON_SPACING_MARK;
+      if (!joinsRun) {
+        break;
+      }
+      index += UTF16.getCharCount(codepoint);
+
+      // a run that has no specific script yet adopts the first one it meets
+      if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
+        scriptCode = script;
+      }
+    }
+
+    scriptLimit = index;
+    return true;
+  }
+
+  /**
+   * Decides whether two script codes may share a run: they may when they are
+   * equal, or when either one is no greater than UScript.INHERITED.
+   */
+  private static boolean isSameScript(int scriptOne, int scriptTwo) {
+    if (scriptOne <= UScript.INHERITED) {
+      return true;
+    }
+    return scriptTwo <= UScript.INHERITED || scriptTwo == scriptOne;
+  }
+
+  /**
+   * Points this iterator at a new region of text and rewinds it.
+   *
+   * @param text buffer holding the text
+   * @param start offset in the buffer where the region begins
+   * @param length maximum number of chars to examine
+   */
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.limit = start + length;
+    // no run has been produced yet: all positions sit at the region start
+    this.index = this.scriptStart = this.scriptLimit = start;
+    this.scriptCode = UScript.INVALID_CODE;
+  }
+
+  /** script of each codepoint below 128, filled in by the static initializer */
+  private static final int[] basicLatin = new int[128];
+
+  static {
+    for (int codepoint = 0; codepoint < basicLatin.length; codepoint++) {
+      basicLatin[codepoint] = UScript.getScript(codepoint);
+    }
+  }
+
+  /** UScript.getScript() with an array-lookup fast path for Basic Latin */
+  private static int getScript(int codepoint) {
+    final boolean cached = codepoint >= 0 && codepoint < basicLatin.length;
+    return cached ? basicLatin[codepoint] : UScript.getScript(codepoint);
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html Mon May  3 13:20:09 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+Tokenizer that breaks text into words with the Unicode Text Segmentation algorithm.
+</body>
+</html>

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java Mon May  3 13:20:09 2010
@@ -0,0 +1,51 @@
+package org.apache.lucene.analysis.icu.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Attribute;
+
+import com.ibm.icu.lang.UScript; // javadoc @link
+
+/**
+ * Attribute holding the UTR #24 script value of a token of text.
+ * @lucene.experimental
+ */
+public interface ScriptAttribute extends Attribute {
+  /**
+   * Returns the script of the token as a numeric code.
+   * The code is one of the constants defined in {@link UScript}.
+   * @return numeric script code
+   */
+  int getCode();
+  /**
+   * Assigns the script of the token by numeric code.
+   * The code is one of the constants defined in {@link UScript}.
+   * @param code numeric script code
+   */
+  void setCode(int code);
+  /**
+   * Returns the full name of the script.
+   * @return UTR #24 full name.
+   */
+  String getName();
+  /**
+   * Returns the abbreviated name of the script.
+   * @return UTR #24 abbreviated name.
+   */
+  String getShortName();
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java Mon May  3 13:20:09 2010
@@ -0,0 +1,83 @@
+package org.apache.lucene.analysis.icu.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.util.AttributeImpl;
+
+import com.ibm.icu.lang.UScript;
+
+/**
+ * {@link ScriptAttribute} implementation that keeps the script as its
+ * integer code.
+ * @lucene.experimental
+ */
+public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribute, Cloneable, Serializable {
+  /** current script code; starts out, and is reset to, UScript.COMMON */
+  private int code = UScript.COMMON;
+
+  public int getCode() {
+    return this.code;
+  }
+
+  public void setCode(int code) {
+    this.code = code;
+  }
+
+  public String getName() {
+    return UScript.getName(this.code);
+  }
+
+  public String getShortName() {
+    return UScript.getShortName(this.code);
+  }
+
+  @Override
+  public void clear() {
+    this.code = UScript.COMMON;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    // the target is expected to be a ScriptAttribute; the cast fails otherwise
+    ((ScriptAttribute) target).setCode(this.code);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ScriptAttributeImpl)) {
+      return false;
+    }
+    // same instance, or a different instance carrying the same script code
+    return this == other || ((ScriptAttributeImpl) other).code == this.code;
+  }
+
+  /** The script code doubles as the hash, keeping it consistent with equals. */
+  @Override
+  public int hashCode() {
+    return this.code;
+  }
+
+  /** Renders the attribute as "script=" followed by the full script name. */
+  @Override
+  public String toString() {
+    final String name = getName();
+    return "script=" + name;
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html?rev=940447&r1=940446&r2=940447&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html (original)
+++ lucene/dev/trunk/lucene/contrib/icu/src/java/overview.html Mon May  3 13:20:09 2010
@@ -30,6 +30,8 @@ performance, keeping current with the Un
 APIs. This module exposes the following functionality:
 </p>
 <ul>
+  <li><a href="#segmentation">Text Segmentation</a>: Tokenizes text based on 
+  properties and rules defined in Unicode.</li>
   <li><a href="#collation">Collation</a>: Compare strings according to the 
   conventions and standards of a particular language, region or country.</li>
   <li><a href="#normalization">Normalization</a>: Converts text to a unique,
@@ -42,6 +44,35 @@ APIs. This module exposes the following 
   a context-sensitive fashion: e.g. mapping Traditional to Simplified Chinese</li>
 </ul>
 <hr/>
+<h1><a name="segmentation">Text Segmentation</a></h1>
+<p>
+Text Segmentation (Tokenization) divides document and query text into index terms
+(typically words). Unicode provides special properties and rules so that this can
+be done in a manner that works well with most languages.
+</p>
+<p>
+Text Segmentation implements the word segmentation specified in
+<a href="http://unicode.org/reports/tr29/">Unicode Text Segmentation</a>.
+Additionally the algorithm can be tailored based on writing system, for example
+text in the Thai script is automatically delegated to a dictionary-based segmentation 
+algorithm.
+</p>
+<h2>Use Cases</h2>
+<ul>
+  <li>
+    As a more thorough replacement for StandardTokenizer that works well for
+    most languages. 
+  </li>
+</ul>
+<h2>Example Usages</h2>
+<h3>Tokenizing multilanguage text</h3>
+<code><pre>
+  /**
+   * This tokenizer will work well in general for most languages.
+   */
+  Tokenizer tokenizer = new ICUTokenizer(reader);
+</pre></code>
+<hr/>
 <h1><a name="collation">Collation</a></h1>
 <p>
   <code>ICUCollationKeyFilter</code>

Added: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk?rev=940447&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk?rev=940447&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk?rev=940447&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk?rev=940447&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java?rev=940447&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java Mon May  3 13:20:09 2010
@@ -0,0 +1,109 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCharArrayIterator extends LuceneTestCase {
+  public void testBasicUsage() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    assertEquals(0, ci.getBeginIndex());
+    assertEquals(7, ci.getEndIndex());
+    assertEquals(0, ci.getIndex());
+    assertEquals('t', ci.current());
+    assertEquals('e', ci.next());
+    assertEquals('g', ci.last());
+    assertEquals('n', ci.previous());
+    assertEquals('t', ci.first());
+    assertEquals(CharacterIterator.DONE, ci.previous());
+  }
+  
+  public void testFirst() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    ci.next();
+    // Sets the position to getBeginIndex() and returns the character at that position. 
+    assertEquals('t', ci.first());
+    assertEquals(ci.getBeginIndex(), ci.getIndex());
+    // or DONE if the text is empty
+    ci.setText(new char[] {}, 0, 0);
+    assertEquals(CharacterIterator.DONE, ci.first());
+  }
+  
+  public void testLast() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) 
+    // and returns the character at that position. 
+    assertEquals('g', ci.last());
+    assertEquals(ci.getIndex(), ci.getEndIndex() - 1);
+    // or DONE if the text is empty
+    ci.setText(new char[] {}, 0, 0);
+    assertEquals(CharacterIterator.DONE, ci.last());
+    assertEquals(ci.getEndIndex(), ci.getIndex());
+  }
+  
+  public void testCurrent() {
+    CharArrayIterator ci = new CharArrayIterator();
+    // Gets the character at the current position (as returned by getIndex()). 
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    assertEquals('t', ci.current());
+    ci.last();
+    ci.next();
+    // or DONE if the current position is off the end of the text.
+    assertEquals(CharacterIterator.DONE, ci.current());
+  }
+  
+  public void testNext() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("te".toCharArray(), 0, 2);
+    // Increments the iterator's index by one and returns the character at the new index.
+    assertEquals('e', ci.next());
+    assertEquals(1, ci.getIndex());
+    // or DONE if the new position is off the end of the text range.
+    assertEquals(CharacterIterator.DONE, ci.next());
+    assertEquals(ci.getEndIndex(), ci.getIndex());
+  }
+  
+  public void testSetIndex() {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText("test".toCharArray(), 0, "test".length());
+    try {
+      ci.setIndex(5);
+      fail();
+    } catch (Exception e) {
+      assertTrue(e instanceof IllegalArgumentException);
+    }
+  }
+  
+  public void testClone() {
+    char text[] = "testing".toCharArray();
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText(text, 0, text.length);
+    ci.next();
+    CharArrayIterator ci2 = (CharArrayIterator) ci.clone();
+    assertEquals(ci.getIndex(), ci2.getIndex());
+    assertEquals(ci.next(), ci2.next());
+    assertEquals(ci.last(), ci2.last());
+  }
+  
+
+}

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message