lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r936657 [1/3] - in /lucene/dev/trunk/lucene: ./ contrib/ contrib/icu/ contrib/icu/src/data/ contrib/icu/src/data/utr30/ contrib/icu/src/java/org/apache/lucene/analysis/icu/ contrib/icu/src/resources/ contrib/icu/src/resources/org/ contrib/i...
Date Thu, 22 Apr 2010 08:47:50 GMT
Author: rmuir
Date: Thu Apr 22 08:47:49 2010
New Revision: 936657

URL: http://svn.apache.org/viewvc?rev=936657&view=rev
Log:
LUCENE-1343: Add ICUFoldingFilter, normalizes unicode text for search

Added:
    lucene/dev/trunk/lucene/contrib/icu/src/data/
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/BasicFoldings.txt   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DingbatFolding.txt   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/HanRadicalFolding.txt   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/NativeDigitFolding.txt   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/nfkc.txt   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/nfkc_cf.txt   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/resources/
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/
    lucene/dev/trunk/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm   (with props)
    lucene/dev/trunk/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java   (with props)
Modified:
    lucene/dev/trunk/lucene/NOTICE.txt
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/lucene/contrib/icu/build.xml

Modified: lucene/dev/trunk/lucene/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/NOTICE.txt?rev=936657&r1=936656&r2=936657&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/NOTICE.txt (original)
+++ lucene/dev/trunk/lucene/NOTICE.txt Thu Apr 22 08:47:49 2010
@@ -47,6 +47,10 @@ ICU4J, (under contrib/icu) is licensed u
 (contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 
 International Business Machines Corporation and others
 
+Some data files (under contrib/icu/src/data) are derived from Unicode data such
+as the Unicode Character Database. See http://unicode.org/copyright.html for more
+details.
+
 Brics Automaton (under src/java/org/apache/lucene/util/automaton) is 
 BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
 

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=936657&r1=936656&r2=936657&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Apr 22 08:47:49 2010
@@ -129,6 +129,10 @@ New features
    mappings in addition to standard normalization, and normalization combined
    with unicode case folding.  (Robert Muir)
 
+ * LUCENE-1343: Add ICUFoldingFilter, a replacement for ASCIIFoldingFilter that
+   does a more thorough job of normalizing unicode text for search.
+   (Robert Haschart, Robert Muir)
+
 Build
 
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 

Modified: lucene/dev/trunk/lucene/contrib/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/build.xml?rev=936657&r1=936656&r2=936657&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/build.xml (original)
+++ lucene/dev/trunk/lucene/contrib/icu/build.xml Thu Apr 22 08:47:49 2010
@@ -36,4 +36,14 @@
 
   <import file="../contrib-build.xml"/>
 
+  <property name="gennorm2.src.dir" value="src/data/utr30"/>
+  <property name="gennorm2.src.files" 
+  	value="nfkc.txt nfkc_cf.txt BasicFoldings.txt DiacriticFolding.txt DingbatFolding.txt HanRadicalFolding.txt NativeDigitFolding.txt"/>
+  <property name="gennorm2.dst" value="src/resources/org/apache/lucene/analysis/icu/utr30.nrm"/>
+  <target name="gennorm2">
+  	<echo>Warning: only works on a big-endian platform!</echo>
+    <exec executable="gennorm2" failonerror="true">
+      <arg line="-v -s ${gennorm2.src.dir} ${gennorm2.src.files} -o ${gennorm2.dst}"/>
+    </exec>
+  </target>
 </project>

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/BasicFoldings.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/BasicFoldings.txt?rev=936657&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/BasicFoldings.txt (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/BasicFoldings.txt Thu Apr 22 08:47:49 2010
@@ -0,0 +1,111 @@
+# Copyright 2001-2010 Unicode, Inc.
+# 
+# Disclaimer
+# 
+# This source code is provided as is by Unicode, Inc. No claims are
+# made as to fitness for any particular purpose. No warranties of any
+# kind are expressed or implied. The recipient agrees to determine
+# applicability of information provided. If this file has been
+# purchased on magnetic or optical media from Unicode, Inc., the
+# sole remedy for any claim will be exchange of defective media
+# within 90 days of receipt.
+# 
+# Limitations on Rights to Redistribute This Code
+# 
+# Unicode, Inc. hereby grants the right to freely use the information
+# supplied in this file in the creation of products supporting the
+# Unicode Standard, and to make copies of this file in any form
+# for internal or external distribution as long as this notice
+# remains attached.
+
+### Custom Normalization mappings for UTR#30 
+### (http://www.unicode.org/reports/tr30/tr30-4.html)
+###
+### Created from Unicode 5.2 UCD
+###
+
+## Accent removal
+# See DiacriticFolding.txt
+## Case Folding (done by cf)
+## Canonical Duplicates Folding (done by cd)
+## Dashes folding
+# [[:Dash:][:Pd:]]-2053(swung dash) > U+002D
+058A>002D
+05BE>002D
+1400>002D
+1806>002D
+2010..2015>002D
+2E17>002D
+2E1A>002D
+301C>002D
+3030>002D
+30A0>002D
+#2053>002D
+2212>002D
+# FE31,FE32,FE58,FE63,FF0D done by kd
+
+## Greek letterforms folding (done by kd)
+## Hebrew alternates folding (done by kd)
+## Jamo folding (done by kd)
+## Math symbol folding (done by kd)
+## Native digit folding
+# See NativeDigitFolding.txt
+## Nobreak folding (done by kd)
+## Overline Folding
+FE49..FE4C>203E
+## Positional forms folding (done by kd)
+## Small forms folding (done by kd)
+## Space Folding
+# [:Zs:] > U+0020
+1680>0020
+180E>0020
+# 00A0, 2000..200A,202F,205F,3000 done by kd
+## Spacing Accents folding (done by kd)
+## Subscript folding (done by kd)
+## Symbol folding (done by kd)
+## Underline Folding
+2017>005F
+FE4D..FE4F>005F
+## Diacritic Folding
+# See DiacriticFolding.txt
+
+## Vertical forms folding (done by kd)
+## Han Radical Folding
+# See HanRadicalFolding.txt
+## Letter Form Folding (done by kd)
+## Superscript folding
+# Additions to kd:
+02C0>0294
+02C1>0295
+06E5>0648
+06E6>064A
+## Suzhou Numeral Folding
+# Additions to kd:
+3021>4E00
+3022>4E8C
+3023>4E09
+3024>56DB
+3025>4E94
+3026>516D
+3027>4E03
+3028>516B
+3029>4E5D
+## Width Folding (done by kd)
+## Punctuation Folding
+00AB>0022
+00BB>0022
+201C..201E>0022
+2018..201B>0027
+2032>0027
+2035>0027
+2039..203A>0027
+2045>005B
+2046>005D
+2E28>0028 0028
+2E29>0029 0029
+2052>0025
+204E>002A
+2044>002F
+204F>003B
+2038>005E
+2053>007E

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/BasicFoldings.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt?rev=936657&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt Thu Apr 22 08:47:49 2010
@@ -0,0 +1,562 @@
+# Copyright 2001-2010 Unicode, Inc.
+# 
+# Disclaimer
+# 
+# This source code is provided as is by Unicode, Inc. No claims are
+# made as to fitness for any particular purpose. No warranties of any
+# kind are expressed or implied. The recipient agrees to determine
+# applicability of information provided. If this file has been
+# purchased on magnetic or optical media from Unicode, Inc., the
+# sole remedy for any claim will be exchange of defective media
+# within 90 days of receipt.
+# 
+# Limitations on Rights to Redistribute This Code
+# 
+# Unicode, Inc. hereby grants the right to freely use the information
+# supplied in this file in the creation of products supporting the
+# Unicode Standard, and to make copies of this file in any form
+# for internal or external distribution as long as this notice
+# remains attached.
+
+### Custom Normalization mappings for UTR#30 
+### (http://www.unicode.org/reports/tr30/tr30-4.html)
+###
+### Created from Unicode 5.2 UCD
+###
+
+# Removes diacritics, as defined by [:Diacritic:]
+# These may or may not be combining marks
+005E>
+0060>
+00B7>
+02B9..02D7>
+02DE>
+02DF>
+02E5..033F>
+0342>
+0346..034E>
+0350..0357>
+035D..0362>
+0375>
+0483..0487>
+0559>
+0591..05A1>
+05A3..05BD>
+05BF>
+05C1>
+05C2>
+05C4>
+064B..0652>
+0657>
+0658>
+06DF>
+06E0>
+06E5>
+06E6>
+06EA..06EC>
+0730..074A>
+07A6..07B0>
+07EB..07F5>
+0818>
+0819>
+093C>
+094D>
+0951..0954>
+0971>
+09BC>
+09CD>
+0A3C>
+0A4D>
+0ABC>
+0ACD>
+0B3C>
+0B4D>
+0BCD>
+0C4D>
+0CBC>
+0CCD>
+0D4D>
+0DCA>
+0E47..0E4C>
+0E4E>
+0EC8..0ECC>
+0F18>
+0F19>
+0F35>
+0F37>
+0F39>
+0F3E>
+0F3F>
+0F82..0F84>
+0F86>
+0F87>
+0FC6>
+1037>
+1039>
+103A>
+1087..108D>
+108F>
+109A>
+109B>
+17C9..17D3>
+17DD>
+1939..193B>
+1A75..1A7C>
+1A7F>
+1B34>
+1B44>
+1B6B..1B73>
+1BAA>
+1C36>
+1C37>
+1C78..1C7D>
+1CD0..1CE8>
+1CED>
+1D2F>
+1D3B>
+1D4E>
+1DC4..1DCF>
+1DFD..1DFF>
+2CEF..2CF1>
+2E2F>
+302A..302F>
+3099>
+309A>
+30FC>
+A66F>
+A67C>
+A67D>
+A67F>
+A6F0>
+A6F1>
+A717..A721>
+A788>
+A8C4>
+A8E0..A8F1>
+A92B..A92E>
+A953>
+A9B3>
+A9C0>
+AA7B>
+AABF..AAC2>
+ABEC>
+ABED>
+FB1E>
+FE20..FE26>
+110B9>
+110BA>
+1D167..1D169>
+1D16D..1D172>
+1D17B..1D182>
+1D185..1D18B>
+1D1AA..1D1AD>
+
+# Latin script "composed" characters that do not decompose any further, so decompose them here
+# These are from ASCIIFoldingFilter
+00E6>0061 0065
+00F0>0064
+00F8>006F
+00FE>0074 0068
+0111>0064
+0127>0068
+0131>0069
+0138>0071
+0142>006C
+014B>006E
+0153>006F 0065
+0167>0074
+0180>0062
+0183>0062
+0185>0062
+0188>0063
+018C>0064
+018D>0064
+0192>0066
+0195>0068 0076
+0199>006B
+019A>006C
+#019B>
+019E>006E
+#01A3>
+01A5>0070
+#01A8>
+#01AA>
+01AB>0074
+01AD>0074
+01B4>0079
+01B6>007A
+#01B9>
+#01BA>
+01BB>0032
+01BD>0035
+#01BE>
+01BF>0077
+01C0>007C
+01C1>007C 007C
+#01C2>
+01C3>0021
+01DD>0065
+01E5>0047
+021D>007A
+0221>0064
+0223>006F 0075
+0225>007A
+0234>006C
+0235>006E
+0236>0074
+0237>006A
+0238>0064 0062
+0239>0071 0070
+023C>0063
+023F>0073
+0240>007A
+#0242>
+0247>0065
+0249>006A
+024B>0071
+024D>0072
+024F>0079
+0250>0061
+0251>0061
+0252>0061
+0253>0062
+0254>006F
+0255>0063
+0256>0064
+0257>0064
+0258>0065
+0259>0061
+025A>0061
+025B>0065
+025C>0065
+025D>0065
+025E>0065
+025F>006A
+0260>0067
+0261>0067
+0262>0047
+#0263>
+#0264>
+0265>0068
+0266>0068
+#0267>
+0268>0069
+0269>0069
+026A>0049
+026B>006C
+026C>006C
+026D>006C
+#026E>
+026F>006D
+0270>006D
+0271>006D
+0272>006E
+0273>006E
+0274>004E
+0275>006F
+0276>004F 0045
+#0277>
+#0278>
+#0279>
+#027A>
+#027B>
+027C>0072
+027D>0072
+027E>0072
+027F>0072
+0280>0052
+0281>0052
+0282>0073
+#0283>
+0284>006A
+#0285>
+#0286>
+0287>0074
+0288>0074
+0289>0075
+#028A>
+028B>0076
+028C>0076
+028D>0077
+028E>0079
+028F>0059
+0290>007A
+0291>007A
+#0292>
+#0293>
+#0294>
+#0295>
+#0296>
+0297>0043
+0298>006F
+0299>0042
+029A>0065
+029B>0047
+029C>0048
+029D>006A
+029E>006B
+029F>004C
+02A0>0071
+#02A1>
+#02A2>
+02A3>0064 007A
+#02A4>
+02A5>0064 007A
+02A6>0074 0073
+#02A7>
+02A8>0074 0063
+02A9>0066 006E
+02AA>006C 0073
+02AB>006C 007A
+02AC>0077 0077
+#02AD>
+02AE>0068
+02AF>0068
+1D00>0041
+1D01>0041 0045
+1D02>0061 0065
+1D03>0042
+1D04>0043
+1D05>0044
+1D06>0044
+1D07>0045
+1D08>0065
+1D09>0069
+1D0A>004A
+1D0B>004B
+1D0C>004C
+1D0D>004D
+1D0E>004E
+1D0F>004F
+1D10>004F
+1D11>006F
+#1D12>
+1D13>006F
+1D14>006F 0065
+1D15>004F 0055
+1D16>006F
+1D17>006F
+1D18>0050
+1D19>0052
+1D1A>0052
+1D1B>0054
+1D1C>0055
+1D1D>0075
+1D1E>0075
+1D1F>006D
+1D20>0056
+1D21>0057
+1D22>005A
+#1D23>
+#1D24>
+#1D25>
+1D6B>0075 0065
+1D6C>0062
+1D6D>0064
+1D6E>0066
+1D6F>006D
+1D70>006E
+1D71>0070
+1D72>0072
+1D73>0072
+1D74>0073
+1D75>0074
+1D76>007A
+1D77>0067
+1D79>0067
+1D7A>0074 0068
+1D7B>0049
+1D7C>0069
+1D7D>0070
+1D7E>0055
+#1D7F>
+1D80>0062
+1D81>0064
+1D82>0066
+1D83>0067
+1D84>006B
+1D85>006C
+1D86>006D
+1D87>006E
+1D88>0070
+1D89>0072
+1D8A>0073
+#1D8B>
+1D8C>0076
+1D8D>0078
+1D8E>007A
+1D8F>0061
+1D90>0061
+1D91>0064
+1D92>0065
+1D93>0065
+1D94>0065
+1D95>0061
+1D96>0069
+1D97>006F
+#1D98>
+1D99>0075
+#1D9A>
+1E9C>0073
+1E9D>0073
+1E9F>0064
+1EFB>006C 006C
+1EFD>0076
+1EFF>0079
+214E>0066
+#2180>
+#2181>
+#2182>
+2184>0063
+#2185>
+#2186>
+#2187>
+#2188>
+2C61>006C
+2C65>0061
+2C66>0074
+2C68>0068
+2C6A>006B
+2C6C>007A
+2C71>0076
+2C73>0077
+2C74>0076
+2C76>0068
+#2C77>
+2C78>0065
+#2C79>
+2C7A>006F
+2C7B>0045
+#A723>
+#A725>
+#A727>
+A729>0074 007A
+#A72B>
+#A72D>
+#A72F>
+A730>0046
+A731>0053
+A733>0061 0061
+A735>0061 006F
+A737>0061 0075
+A739>0061 0076
+A73B>0061 0076
+A73D>0061 0079
+A73F>0063
+A741>006B
+A743>006B
+A745>006B
+A747>006C
+A749>006C
+A74B>006F
+A74D>006F
+A74F>006F 006F
+A751>0070
+A753>0070
+A755>0070
+A757>0071
+A759>0071
+A75B>0072
+#A75D>
+A75F>0076
+A761>0076 0079
+A763>007A
+A765>0074 0068
+A767>0074 0068
+A769>0076
+#A76B>
+#A76D>
+#A76F>
+#A771>
+#A772>
+#A773>
+#A774>
+#A775>
+#A776>
+#A777>
+#A778>
+A77A>0064
+A77C>0066
+A77F>0067
+A781>006C
+A783>0072
+A785>0053
+A787>0074
+A78C>0027
+A7FB>0046
+A7FC>0070
+A7FD>004D
+A7FE>0049
+A7FF>004D
+
+# Cyrillic script "composed" characters that do not decompose any further, so decompose them here
+# These are from UTR#30 DiacriticFolding.txt
+
+047D>0461
+048B>0439
+048F>0440
+0491>0433
+0493>0433
+0495>0433
+0497>0436
+0499>0437
+049B>043A
+049D>043A
+049F>043A
+04A3>043D
+04A7>043F
+04AB>0441
+04AD>0442
+04B1>04AF
+04B3>0425
+04B7>04BC
+04B9>0447
+04BF>04BC
+04C4>043A
+04C6>043B
+04C8>043D
+04CA>043D
+04CC>04BC
+04CE>043C
+
+# Additional signs and diacritics, from examination of [:Mark:]&[:Lm:]
+0358..035C>
+05A2>
+05C5>
+05C7>
+0610..061A>
+0640>
+06D6..06DE>
+06E1..06E4>
+06E7..06E9>
+06ED>
+0653..0656>
+0659..065E>
+0670>
+0711>
+07FA>
+0816..0817>
+081B..0823>
+0825..0827>
+0829>
+082A..082D>
+0900>0901
+1714>
+1734>
+1DC0..1DC3>
+1DD0..1DE6>
+20D0..20F0>
+2DE0..2DFF>
+A670..A672>
+A802>
+10A3F>
+1D165..1D166>
+1D242..1D244>
+
+# Additional Arabic/Hebrew decompositions
+05F3>0027
+05F4>0022
+0629>0647
+0649>064A
+06A9>0643
+06CC>064A

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DingbatFolding.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DingbatFolding.txt?rev=936657&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DingbatFolding.txt (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DingbatFolding.txt Thu Apr 22 08:47:49 2010
@@ -0,0 +1,97 @@
+# Copyright 2001-2010 Unicode, Inc.
+# 
+# Disclaimer
+# 
+# This source code is provided as is by Unicode, Inc. No claims are
+# made as to fitness for any particular purpose. No warranties of any
+# kind are expressed or implied. The recipient agrees to determine
+# applicability of information provided. If this file has been
+# purchased on magnetic or optical media from Unicode, Inc., the
+# sole remedy for any claim will be exchange of defective media
+# within 90 days of receipt.
+# 
+# Limitations on Rights to Redistribute This Code
+# 
+# Unicode, Inc. hereby grants the right to freely use the information
+# supplied in this file in the creation of products supporting the
+# Unicode Standard, and to make copies of this file in any form
+# for internal or external distribution as long as this notice
+# remains attached.
+
+### Custom Normalization mappings for UTR#30 
+### (http://www.unicode.org/reports/tr30/tr30-4.html)
+###
+### Created from Unicode 5.2 UCD
+###
+
+# Folds dingbats and other adorned forms
+# Generated from ASCIIFoldingFilter
+24EB>0031 0031
+24EC>0031 0032
+24ED>0031 0033
+24EE>0031 0034
+24EF>0031 0035
+24F0>0031 0036
+24F1>0031 0037
+24F2>0031 0038
+24F3>0031 0039
+24F4>0032 0030
+24F5>0031
+24F6>0032
+24F7>0033
+24F8>0034
+24F9>0035
+24FA>0036
+24FB>0037
+24FC>0038
+24FD>0039
+24FE>0031 0030
+24FF>0030
+275B>0027
+275C>0027
+275D>0022
+275E>0022
+2768>0028
+2769>0029
+276A>0028
+276B>0029
+276C>003C
+276D>003E
+276E>0022
+276F>0022
+2770>003C
+2771>003E
+2772>005B
+2773>005D
+2774>007B
+2775>007D
+2776>0031
+2777>0032
+2778>0033
+2779>0034
+277A>0035
+277B>0036
+277C>0037
+277D>0038
+277E>0039
+277F>0031 0030
+2780>0031
+2781>0032
+2782>0033
+2783>0034
+2784>0035
+2785>0036
+2786>0037
+2787>0038
+2788>0039
+2789>0031 0030
+278A>0031
+278B>0032
+278C>0033
+278D>0034
+278E>0035
+278F>0036
+2790>0037
+2791>0038
+2792>0039
+2793>0031 0030

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/DingbatFolding.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/HanRadicalFolding.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/HanRadicalFolding.txt?rev=936657&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/HanRadicalFolding.txt (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/HanRadicalFolding.txt Thu Apr 22 08:47:49 2010
@@ -0,0 +1,143 @@
+# Copyright 2001-2010 Unicode, Inc.
+# 
+# Disclaimer
+# 
+# This source code is provided as is by Unicode, Inc. No claims are
+# made as to fitness for any particular purpose. No warranties of any
+# kind are expressed or implied. The recipient agrees to determine
+# applicability of information provided. If this file has been
+# purchased on magnetic or optical media from Unicode, Inc., the
+# sole remedy for any claim will be exchange of defective media
+# within 90 days of receipt.
+# 
+# Limitations on Rights to Redistribute This Code
+# 
+# Unicode, Inc. hereby grants the right to freely use the information
+# supplied in this file in the creation of products supporting the
+# Unicode Standard, and to make copies of this file in any form
+# for internal or external distribution as long as this notice
+# remains attached.
+
+### Custom Normalization mappings for UTR#30 
+### (http://www.unicode.org/reports/tr30/tr30-4.html)
+###
+### Created from UTR#30 HanRadicalFolding.txt
+###
+
+# CJK Radicals
+2E81>5382
+2E82>4E5B
+2E83>4E5A
+2E84>4E59
+2E85>4EBB
+2E86>5182
+2E87>51E0
+2E88>5200
+2E89>5202
+2E8A>535C
+2E8B>353E
+2E8C>5C0F
+2E8D>5C0F
+2E8E>5C22
+2E8F>5C23
+2E90>5C22
+2E91>5C23
+2E92>5DF3
+2E93>5E7A
+2E94>5F51
+2E95>5F50
+2E96>5FC4
+2E97>5FC3
+2E98>624C
+2E99>6535
+2E9B>65E1
+2E9C>65E5
+2E9D>6708
+2E9E>6B7A
+2E9F>6BCD
+2EA0>6C11
+2EA1>6C35
+2EA2>6C3A
+2EA3>706C
+2EA4>722B
+2EA5>722B
+2EA6>4E2C
+2EA7>725B
+2EA8>72AD
+2EA9>738B
+2EAA>758B
+2EAB>76EE
+2EAC>793A
+2EAD>793B
+2EAE>7AF9
+2EAF>7CF9
+2EB0>7E9F
+2EB1>7F53
+2EB2>7F52
+2EB3>7F51
+2EB4>7F51
+2EB5>2626B
+2EB6>7F8A
+2EB7>7F8A
+2EB8>7F8B
+2EB9>8002
+2EBA>8080
+2EBB>807F
+2EBC>8089
+2EBD>81FC
+2EBE>8279
+2EBF>8279
+2EC0>8279
+2EC1>864E
+2EC2>8864
+2EC3>8980
+2EC4>897F
+2EC5>89C1
+2EC6>89D2
+2EC7>278B2
+2EC8>8BA0
+2EC9>8D1D
+2ECA>8DB3
+2ECB>8F66
+2ECC>8FB6
+2ECD>8FB6
+2ECE>8FB6
+2ECF>9091
+2ED0>9485
+2ED1>9577
+2ED2>9578
+2ED3>957F
+2ED4>95E8
+2ED5>961C
+2ED6>961D
+2ED7>96E8
+2ED8>9752
+2ED9>97E6
+2EDA>9875
+2EDB>98CE
+2EDC>98DE
+2EDD>98DF
+2EDE>2967F
+2EDF>98E0
+2EE0>9963
+2EE1>29810
+2EE2>9A6C
+2EE3>9AA8
+2EE4>9B3C
+2EE5>9C7C
+2EE6>9E1F
+2EE7>9E75
+2EE8>9EA6
+2EE9>9EC4
+2EEA>9EFE
+2EEB>6589
+2EEC>9F50
+2EED>6B6F
+2EEE>9F7F
+2EEF>9F8D
+2EF0>9F99
+2EF1>9F9C
+2EF2>4E80
+2EF3>9F9F
+
+# KangXi Radicals (done by kd)

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/HanRadicalFolding.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/NativeDigitFolding.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/NativeDigitFolding.txt?rev=936657&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/NativeDigitFolding.txt (added)
+++ lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/NativeDigitFolding.txt Thu Apr 22 08:47:49 2010
@@ -0,0 +1,460 @@
+# Copyright 2001-2010 Unicode, Inc.
+# 
+# Disclaimer
+# 
+# This source code is provided as is by Unicode, Inc. No claims are
+# made as to fitness for any particular purpose. No warranties of any
+# kind are expressed or implied. The recipient agrees to determine
+# applicability of information provided. If this file has been
+# purchased on magnetic or optical media from Unicode, Inc., the
+# sole remedy for any claim will be exchange of defective media
+# within 90 days of receipt.
+# 
+# Limitations on Rights to Redistribute This Code
+# 
+# Unicode, Inc. hereby grants the right to freely use the information
+# supplied in this file in the creation of products supporting the
+# Unicode Standard, and to make copies of this file in any form
+# for internal or external distribution as long as this notice
+# remains attached.
+
+### Custom Normalization mappings for UTR#30 
+### (http://www.unicode.org/reports/tr30/tr30-4.html)
+###
+### Created from Unicode 5.2 UCD
+###
+
+## Native digit folding
+# [:Nd:] > Ascii digit equivalent
+# Arabic-Indic
+0660>0030
+0661>0031
+0662>0032
+0663>0033
+0664>0034
+0665>0035
+0666>0036
+0667>0037
+0668>0038
+0669>0039
+# Eastern Arabic-Indic
+06F0>0030
+06F1>0031
+06F2>0032
+06F3>0033
+06F4>0034
+06F5>0035
+06F6>0036
+06F7>0037
+06F8>0038
+06F9>0039
+# NKo
+07C0>0030
+07C1>0031
+07C2>0032
+07C3>0033
+07C4>0034
+07C5>0035
+07C6>0036
+07C7>0037
+07C8>0038
+07C9>0039
+# Devanagari
+0966>0030
+0967>0031
+0968>0032
+0969>0033
+096A>0034
+096B>0035
+096C>0036
+096D>0037
+096E>0038
+096F>0039
+# Bengali
+09E6>0030
+09E7>0031
+09E8>0032
+09E9>0033
+09EA>0034
+09EB>0035
+09EC>0036
+09ED>0037
+09EE>0038
+09EF>0039
+# Gurmukhi
+0A66>0030
+0A67>0031
+0A68>0032
+0A69>0033
+0A6A>0034
+0A6B>0035
+0A6C>0036
+0A6D>0037
+0A6E>0038
+0A6F>0039
+# Gujarati
+0AE6>0030
+0AE7>0031
+0AE8>0032
+0AE9>0033
+0AEA>0034
+0AEB>0035
+0AEC>0036
+0AED>0037
+0AEE>0038
+0AEF>0039
+# Oriya
+0B66>0030
+0B67>0031
+0B68>0032
+0B69>0033
+0B6A>0034
+0B6B>0035
+0B6C>0036
+0B6D>0037
+0B6E>0038
+0B6F>0039
+# Tamil
+0BE6>0030
+0BE7>0031
+0BE8>0032
+0BE9>0033
+0BEA>0034
+0BEB>0035
+0BEC>0036
+0BED>0037
+0BEE>0038
+0BEF>0039
+# Telugu
+0C66>0030
+0C67>0031
+0C68>0032
+0C69>0033
+0C6A>0034
+0C6B>0035
+0C6C>0036
+0C6D>0037
+0C6E>0038
+0C6F>0039
+# Kannada
+0CE6>0030
+0CE7>0031
+0CE8>0032
+0CE9>0033
+0CEA>0034
+0CEB>0035
+0CEC>0036
+0CED>0037
+0CEE>0038
+0CEF>0039
+# Malayalam
+0D66>0030
+0D67>0031
+0D68>0032
+0D69>0033
+0D6A>0034
+0D6B>0035
+0D6C>0036
+0D6D>0037
+0D6E>0038
+0D6F>0039
+# Thai
+0E50>0030
+0E51>0031
+0E52>0032
+0E53>0033
+0E54>0034
+0E55>0035
+0E56>0036
+0E57>0037
+0E58>0038
+0E59>0039
+# Lao
+0ED0>0030
+0ED1>0031
+0ED2>0032
+0ED3>0033
+0ED4>0034
+0ED5>0035
+0ED6>0036
+0ED7>0037
+0ED8>0038
+0ED9>0039
+# Tibetan
+0F20>0030
+0F21>0031
+0F22>0032
+0F23>0033
+0F24>0034
+0F25>0035
+0F26>0036
+0F27>0037
+0F28>0038
+0F29>0039
+# Myanmar
+1040>0030
+1041>0031
+1042>0032
+1043>0033
+1044>0034
+1045>0035
+1046>0036
+1047>0037
+1048>0038
+1049>0039
+# Myanmar Shan
+1090>0030
+1091>0031
+1092>0032
+1093>0033
+1094>0034
+1095>0035
+1096>0036
+1097>0037
+1098>0038
+1099>0039
+# Khmer
+17E0>0030
+17E1>0031
+17E2>0032
+17E3>0033
+17E4>0034
+17E5>0035
+17E6>0036
+17E7>0037
+17E8>0038
+17E9>0039
+# Mongolian
+1810>0030
+1811>0031
+1812>0032
+1813>0033
+1814>0034
+1815>0035
+1816>0036
+1817>0037
+1818>0038
+1819>0039
+# Limbu
+1946>0030
+1947>0031
+1948>0032
+1949>0033
+194A>0034
+194B>0035
+194C>0036
+194D>0037
+194E>0038
+194F>0039
+# New Tai Lue
+19D0>0030
+19D1>0031
+19D2>0032
+19D3>0033
+19D4>0034
+19D5>0035
+19D6>0036
+19D7>0037
+19D8>0038
+19D9>0039
+# New Tai Lue Tham Digit One
+19DA>0031
+# Tai Tham Hora
+1A80>0030
+1A81>0031
+1A82>0032
+1A83>0033
+1A84>0034
+1A85>0035
+1A86>0036
+1A87>0037
+1A88>0038
+1A89>0039
+# Tai Tham Tham
+1A90>0030
+1A91>0031
+1A92>0032
+1A93>0033
+1A94>0034
+1A95>0035
+1A96>0036
+1A97>0037
+1A98>0038
+1A99>0039
+# Balinese
+1B50>0030
+1B51>0031
+1B52>0032
+1B53>0033
+1B54>0034
+1B55>0035
+1B56>0036
+1B57>0037
+1B58>0038
+1B59>0039
+# Sundanese
+1BB0>0030
+1BB1>0031
+1BB2>0032
+1BB3>0033
+1BB4>0034
+1BB5>0035
+1BB6>0036
+1BB7>0037
+1BB8>0038
+1BB9>0039
+# Lepcha
+1C40>0030
+1C41>0031
+1C42>0032
+1C43>0033
+1C44>0034
+1C45>0035
+1C46>0036
+1C47>0037
+1C48>0038
+1C49>0039
+# Ol Chiki
+1C50>0030
+1C51>0031
+1C52>0032
+1C53>0033
+1C54>0034
+1C55>0035
+1C56>0036
+1C57>0037
+1C58>0038
+1C59>0039
+# Vai
+A620>0030
+A621>0031
+A622>0032
+A623>0033
+A624>0034
+A625>0035
+A626>0036
+A627>0037
+A628>0038
+A629>0039
+# Saurashtra
+A8D0>0030
+A8D1>0031
+A8D2>0032
+A8D3>0033
+A8D4>0034
+A8D5>0035
+A8D6>0036
+A8D7>0037
+A8D8>0038
+A8D9>0039
+# Kayah Li
+A900>0030
+A901>0031
+A902>0032
+A903>0033
+A904>0034
+A905>0035
+A906>0036
+A907>0037
+A908>0038
+A909>0039
+# Javanese
+A9D0>0030
+A9D1>0031
+A9D2>0032
+A9D3>0033
+A9D4>0034
+A9D5>0035
+A9D6>0036
+A9D7>0037
+A9D8>0038
+A9D9>0039
+# Cham
+AA50>0030
+AA51>0031
+AA52>0032
+AA53>0033
+AA54>0034
+AA55>0035
+AA56>0036
+AA57>0037
+AA58>0038
+AA59>0039
+# Meetei Mayek
+ABF0>0030
+ABF1>0031
+ABF2>0032
+ABF3>0033
+ABF4>0034
+ABF5>0035
+ABF6>0036
+ABF7>0037
+ABF8>0038
+ABF9>0039
+# Halfwidth and Fullwidth Forms (done by kd)
+# Osmanya
+104A0>0030
+104A1>0031
+104A2>0032
+104A3>0033
+104A4>0034
+104A5>0035
+104A6>0036
+104A7>0037
+104A8>0038
+104A9>0039
+# Mathematical Alphanumeric Symbols - Bold digits
+1D7CE>0030
+1D7CF>0031
+1D7D0>0032
+1D7D1>0033
+1D7D2>0034
+1D7D3>0035
+1D7D4>0036
+1D7D5>0037
+1D7D6>0038
+1D7D7>0039
+# Mathematical Alphanumeric Symbols - Double-struck digits
+1D7D8>0030
+1D7D9>0031
+1D7DA>0032
+1D7DB>0033
+1D7DC>0034
+1D7DD>0035
+1D7DE>0036
+1D7DF>0037
+1D7E0>0038
+1D7E1>0039
+# Mathematical Alphanumeric Symbols - Sans-serif digits
+1D7E2>0030
+1D7E3>0031
+1D7E4>0032
+1D7E5>0033
+1D7E6>0034
+1D7E7>0035
+1D7E8>0036
+1D7E9>0037
+1D7EA>0038
+1D7EB>0039
+# Mathematical Alphanumeric Symbols - Sans-serif bold digits
+1D7EC>0030
+1D7ED>0031
+1D7EE>0032
+1D7EF>0033
+1D7F0>0034
+1D7F1>0035
+1D7F2>0036
+1D7F3>0037
+1D7F4>0038
+1D7F5>0039
+# Mathematical Alphanumeric Symbols - Monospace digits
+1D7F6>0030
+1D7F7>0031
+1D7F8>0032
+1D7F9>0033
+1D7FA>0034
+1D7FB>0035
+1D7FC>0036
+1D7FD>0037
+1D7FE>0038
+1D7FF>0039

Propchange: lucene/dev/trunk/lucene/contrib/icu/src/data/utr30/NativeDigitFolding.txt
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message