lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r682766 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java
Date Tue, 05 Aug 2008 15:47:33 GMT
Author: gsingers
Date: Tue Aug  5 08:47:33 2008
New Revision: 682766

URL: http://svn.apache.org/viewvc?rev=682766&view=rev
Log:
LUCENE-1351: clean additional ligatures

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=682766&r1=682765&r2=682766&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Tue Aug  5 08:47:33 2008
@@ -147,6 +147,8 @@
 14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more 
     than twice in the query. (Doron Cohen)
 
+15. LUCENE-1351: ISOLatin1AccentFilter now cleans additional ligatures (Cedrik Lime via Grant Ingersoll)
+
 New features
 
  1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java?rev=682766&r1=682765&r2=682766&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java Tue Aug  5 08:47:33 2008
@@ -41,7 +41,7 @@
       // just return token as-is:
       for(int i=0;i<length;i++) {
         final char c = buffer[i];
-        if (c >= '\u00c0' && c <= '\u0178') {
+        if (c >= '\u00c0' && c <= '\uFB06') {
           removeAccents(buffer, length);
           result.setTermBuffer(output, 0, outputPos);
           break;
@@ -76,7 +76,7 @@
 
       // Quick test: if it's not in range then just keep
       // current character
-      if (c < '\u00c0')
+      if (c < '\u00c0' || c > '\uFB06')
         output[outputPos++] = c;
       else {
         switch (c) {
@@ -107,6 +107,10 @@
         case '\u00CF' : // Ï
           output[outputPos++] = 'I';
           break;
+        case '\u0132' : // IJ
+            output[outputPos++] = 'I';
+            output[outputPos++] = 'J';
+            break;
         case '\u00D0' : // Ð
           output[outputPos++] = 'D';
           break;
@@ -166,6 +170,10 @@
         case '\u00EF' : // ï
           output[outputPos++] = 'i';
           break;
+        case '\u0133' : // ij
+            output[outputPos++] = 'i';
+            output[outputPos++] = 'j';
+            break;
         case '\u00F0' : // ð
           output[outputPos++] = 'd';
           break;
@@ -202,6 +210,37 @@
         case '\u00FF' : // ÿ
           output[outputPos++] = 'y';
           break;
+        case '\uFB00': // ff
+            output[outputPos++] = 'f';
+            output[outputPos++] = 'f';
+            break;
+        case '\uFB01': // fi
+            output[outputPos++] = 'f';
+            output[outputPos++] = 'i';
+            break;
+        case '\uFB02': // fl
+            output[outputPos++] = 'f';
+            output[outputPos++] = 'l';
+            break;
+        // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
+//        case '\uFB03': // ffi
+//            output[outputPos++] = 'f';
+//            output[outputPos++] = 'f';
+//            output[outputPos++] = 'i';
+//            break;
+//        case '\uFB04': // ffl
+//            output[outputPos++] = 'f';
+//            output[outputPos++] = 'f';
+//            output[outputPos++] = 'l';
+//            break;
+        case '\uFB05': // ſt
+            output[outputPos++] = 'f';
+            output[outputPos++] = 't';
+            break;
+        case '\uFB06': // st
+            output[outputPos++] = 's';
+            output[outputPos++] = 't';
+        	break;
         default :
           output[outputPos++] = c;
           break;

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java?rev=682766&r1=682765&r2=682766&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java Tue Aug  5 08:47:33 2008
@@ -23,7 +23,7 @@
 
 public class TestISOLatin1AccentFilter extends LuceneTestCase {
   public void testU() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA
CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï
Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á
â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó
ô õ ö ø œ ß þ ù ú û ü ý ÿ"));
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA
CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï
IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à
á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ
ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
     ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
     assertEquals("Des", filter.next().termText());
     assertEquals("mot", filter.next().termText());
@@ -47,6 +47,7 @@
     assertEquals("I", filter.next().termText());
     assertEquals("I", filter.next().termText());
     assertEquals("I", filter.next().termText());
+    assertEquals("IJ", filter.next().termText());
     assertEquals("D", filter.next().termText());
     assertEquals("N", filter.next().termText());
     assertEquals("O", filter.next().termText());
@@ -79,6 +80,7 @@
     assertEquals("i", filter.next().termText());
     assertEquals("i", filter.next().termText());
     assertEquals("i", filter.next().termText());
+    assertEquals("ij", filter.next().termText());
     assertEquals("d", filter.next().termText());
     assertEquals("n", filter.next().termText());
     assertEquals("o", filter.next().termText());
@@ -96,6 +98,8 @@
     assertEquals("u", filter.next().termText());
     assertEquals("y", filter.next().termText());
     assertEquals("y", filter.next().termText());
+    assertEquals("fi", filter.next().termText());
+    assertEquals("fl", filter.next().termText());
     assertNull(filter.next());
   }
 }



Mime
View raw message