abdera-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jmsn...@apache.org
Subject svn commit: r591706 [1/2] - in /incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode: Generator.java Normalizer.java UnicodeCharacterDatabase.java
Date Sat, 03 Nov 2007 22:52:45 GMT
Author: jmsnell
Date: Sat Nov  3 15:52:43 2007
New Revision: 591706

URL: http://svn.apache.org/viewvc?rev=591706&view=rev
Log:
This is a reworking of the UnicodeCharacterDatabase support that underlies the IRI implementation.
The previous version required a serialized java class to be decompressed and loaded at runtime
which
was expensive and annoying.  This approach uses a Generator class that produces static constants
that 
improve performance significantly.  Maintaining the code still requires several manual steps.
 I am
planning on automating the process soon.  

Over the next few days I plan on revisiting the entire IRI implementation to improve overall
performance 
and code quality.

Added:
    incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Generator.java
Modified:
    incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Normalizer.java
    incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/UnicodeCharacterDatabase.java

Added: incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Generator.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Generator.java?rev=591706&view=auto
==============================================================================
--- incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Generator.java
(added)
+++ incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Generator.java
Sat Nov  3 15:52:43 2007
@@ -0,0 +1,341 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.abdera.i18n.unicode;
+
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Scanner;
+import java.util.regex.MatchResult;
+
+/**
+ * Tool for parsing the Unicode Character Database file format and generating
+ * the constants for the UnicodeCharacterDatabase file.
+ */
+public class Generator {
+
+  public static void main(String... args) {    
+    PrintWriter pw = new PrintWriter(System.out);
+    BitSet exclusions = getExclusions(args[0]);
+    writeDecomposition(pw, args[1], exclusions);
+  }
+  
+  private static void writeDecomposition(PrintWriter pw, String file, BitSet excluded) {
+    Scanner s = read(file);
+    
+    BitSet compat = new BitSet();
+    
+    List<Integer> cc_idx = new ArrayList<Integer>();
+    List<Integer> cc_data = new ArrayList<Integer>();
+    
+    List<Integer> decomp_idx = new ArrayList<Integer>();
+    List<Integer[]> decomp_data = new ArrayList<Integer[]>();
+
+    List<Integer[]> comps = new ArrayList<Integer[]>();    
+    
+    List<Integer[]> hanguls = new ArrayList<Integer[]>();
+    
+    while(s.hasNextLine() && s.hasNext()) {
+      if (s.findInLine("([^;\\s]*);[^;]*;[^;]*;([^;]*);[^;]*;([^;]*);.*") != null) {
+        MatchResult result = s.match();
+        int codepoint = Integer.parseInt(result.group(1),16);
+        int cc = Integer.parseInt(result.group(2));
+        if (cc != 0) {
+          cc_idx.add(codepoint);
+          cc_data.add(cc);
+        }
+        String dc = result.group(3).trim();
+        if (dc.length() > 0) {
+          if (dc.charAt(0) == '<') compat.set(codepoint);
+          dc = dc.substring(dc.indexOf('>') + 1).trim();
+          String[] points = dc.split("\\s");
+          List<Integer> list = new ArrayList<Integer>();
+          for (int n = 0; n < points.length; n++)
+            list.add(Integer.parseInt(points[n],16));
+          decomp_idx.add(codepoint);
+          decomp_data.add(list.toArray(new Integer[list.size()]));
+           
+          if (!compat.get(codepoint) && 
+              !excluded.get(codepoint)) {
+              char f = (list.size() > 1) ?
+                (char)list.get(0).intValue() : '\u0000';
+              char l = (list.size() > 1) ?
+                (char)list.get(1).intValue() : (char)list.get(0).intValue();
+             comps.add(new Integer[] {(f << 16) | l,codepoint});
+           }
+          
+          
+        }
+      }
+    }
+        
+    // Hanguls
+    for (int z = 0; z < 0x2BA4; ++z) {
+      int t = z % 0x001C;
+      char f = (t != 0) ? 
+        (char)(0xAC00 + z - t) : 
+        (char)(0x1100 + z / 0x024C);
+      char e = (t != 0) ? 
+        (char)(0x11A7 + t) : 
+        (char)(0x1161 + (z % 0x024C) / 0x001C);
+      int pair = (f << 16) | e;
+      int value = z + 0xAC00;
+      hanguls.add(new Integer[] {pair,value});
+    }
+
+    Comparator<Integer[]> comp = new Comparator<Integer[]>() {
+      public int compare(Integer[] o1, Integer[] o2) {
+        int i1 = o1[0];
+        int i2 = o2[0];
+        return i1 < i2 ? -1 :
+               i1 > i2 ?  1 : 0;
+      }
+    };
+    Collections.sort(comps,comp);
+    Collections.sort(hanguls, comp);
+    
+    pw.print("  private static int[] getCompat() { return new int[] {");
+    int i = compat.nextSetBit(0), n = 0;
+    pw.print(i);
+    for (i = compat.nextSetBit(i); i>=0; i = compat.nextSetBit(i+1), n++) {
+      pw.print(',');
+      pw.print(i);
+      if (n % 20 == 0) {
+        pw.print("\n    ");
+        n = 0;
+      }
+    }
+    pw.print("};}\n\n");
+    pw.flush();
+    
+    
+    pw.print("  private static int[] getCCIdx() { return new int[] {");
+    for (i = 0, n = 0; i < cc_idx.size(); i++, n++) {
+      pw.print(cc_idx.get(i));
+      if (n % 20 == 0) {
+        pw.print("\n    ");
+        n = 0;
+      }
+      if (i < cc_idx.size() - 1) pw.print(',');
+    }
+    pw.print("};}\n\n");
+    pw.flush();
+
+    
+    pw.print("  private static int[] getCCData() { return new int[] {");
+    for (i = 0, n = 0; i < cc_data.size(); i++, n++) {
+      pw.print(cc_data.get(i));
+      if (n % 20 == 0) {
+        pw.print("\n    ");
+        n = 0;
+      }
+      if (i < cc_data.size() - 1) pw.print(',');
+    }
+    pw.print("};}\n\n");
+    pw.flush();
+    
+    
+    pw.print("  private static int[] getComposeIdx() { return new int[] {");
+    for (i = 0, n = 0; i < comps.size(); i++, n++) {
+      pw.print(comps.get(i)[0]);
+      if (n % 20 == 0) {
+        pw.print("\n    ");
+        n = 0;
+      }
+      if (i < comps.size() - 1) pw.print(',');
+    }
+    pw.print("};}\n\n");
+    pw.flush();
+
+    
+    pw.print("  private static int[] getComposeData() { return new int[] {");
+    for (i = 0, n = 0; i < comps.size(); i++, n++) {
+      pw.print(comps.get(i)[1]);
+      if (n % 20 == 0) {
+        pw.print("\n    ");
+        n = 0;
+      }
+      if (i < comps.size() - 1) pw.print(',');
+    }
+    pw.print("};}\n\n");
+    pw.flush();
+
+    
+    pw.print("  private static int[] getDecompIdx() { return new int[] {"); 
+    for (i = 0, n = 0; i < decomp_idx.size(); i++, n++) {
+      pw.print(decomp_idx.get(i));
+      if (n % 20 == 0) {
+        pw.print("\n    ");
+        n = 0;
+      }
+      if (i < decomp_idx.size() - 1) pw.print(',');
+    }
+    pw.print("};}\n\n");
+    
+    int sets = 2;
+    int size = decomp_idx.size() / sets;
+    i = 0;
+    for (int a = 0; a < sets; a++) {
+      pw.print("  private static int[][] getDecompData" + (a+1) + "() { return new int[][]
{");
+      for (i = a*i, n = 0; i < size * (a+1); i++, n++) {
+        Integer[] data = decomp_data.get(i);
+        pw.print('{');
+        for (int q = 0; q < data.length; q++) {
+          pw.print(data[q]);
+          if (q < data.length - 1) pw.print(',');
+        }
+        pw.print('}');
+        if (n % 20 == 0) {
+          pw.print("\n    ");
+          n = 0;
+        }
+        if (i < decomp_idx.size() - 1) pw.print(',');
+      }    
+      pw.print("};}\n\n");
+    }
+    
+    pw.println("  private static int[][] getDecompData() {");
+    for (n = 0; n < sets; n++)
+      pw.println("    int[][] d" + (n+1) + " = getDecompData" + (n+1) + "();");
+    
+    pw.print("    int[][] d = new int[");
+    for (n = 0; n < sets; n++) {
+      pw.print("d" + (n+1) + ".length");
+      if (n < sets - 1) pw.print('+');
+    }
+    pw.println("][];");
+    
+    String len = "0";
+    for (n = 0; n < sets; n++) {
+      pw.println("    System.arraycopy(d" + (n+1) + ",0,d," + len + ",d" + (n+1) + ".length);");
+      len = "d" + (n+1) + ".length";
+    }
+    pw.println("    return d;}");
+    
+    pw.flush();
+    
+    sets = 2;
+    i = 0;
+    int e = 0;
+    size = hanguls.size() / sets;
+    for (int a = 0; a < sets; a++) {
+      pw.print("  private static int[] getHangulPairs" + (a+1) + "() { return new int[] {");
+      for (i = a*i, n = 0; i < size * (a+1); i++, n++) {
+        pw.print(hanguls.get(i)[0]);
+        if (n % 20 == 0) {
+          pw.print("\n    ");
+          n = 0;
+        }
+        if (i < hanguls.size() - 1) pw.print(',');
+      }
+      pw.print("};}\n\n");
+      pw.flush();
+  
+      pw.print("  private static int[] getHangulCodepoints" + (a+1) + "() { return new int[]
{");
+      for (e = a*e, n = 0; e < size * (a+1); e++, n++) {
+        pw.print(hanguls.get(e)[1]);
+        if (n % 20 == 0) {
+          pw.print("\n    ");
+          n = 0;
+        }
+        if (e < hanguls.size() - 1) pw.print(',');
+      }
+      pw.print("};}\n\n");
+      pw.flush();
+      
+    }
+    
+    pw.println("  private static int[] getHangulPairs() {");
+    for (n = 0; n < sets; n++)
+      pw.println("    int[] d" + (n+1) + " = getHangulPairs" + (n+1) + "();");
+    
+    pw.print("    int[] d = new int[");
+    for (n = 0; n < sets; n++) {
+      pw.print("d" + (n+1) + ".length");
+      if (n < sets - 1) pw.print('+');
+    }
+    pw.println("];");
+    
+    len = "0";
+    for (n = 0; n < sets; n++) {
+      pw.println("    System.arraycopy(d" + (n+1) + ",0,d," + len + ",d" + (n+1) + ".length);");
+      len = "d" + (n+1) + ".length";
+    }
+    pw.println("    return d;}");
+    
+    pw.flush();
+
+    
+    pw.println("  private static int[] getHangulCodepoints() {");
+    for (n = 0; n < sets; n++)
+      pw.println("    int[] d" + (n+1) + " = getHangulCodepoints" + (n+1) + "();");
+    
+    pw.print("    int[] d = new int[");
+    for (n = 0; n < sets; n++) {
+      pw.print("d" + (n+1) + ".length");
+      if (n < sets - 1) pw.print('+');
+    }
+    pw.println("];");
+    
+    len = "0";
+    for (n = 0; n < sets; n++) {
+      pw.println("    System.arraycopy(d" + (n+1) + ",0,d," + len + ",d" + (n+1) + ".length);");
+      len = "d" + (n+1) + ".length";
+    }
+    pw.println("    return d;}\n\n");
+    
+    pw.flush();
+  }
+  
+  private static BitSet getExclusions(String file) {
+    Scanner s = read(file).useDelimiter("\\s*#.*");
+    BitSet set = new BitSet();
+    while(s.hasNext()) {
+      String exc = s.next().trim();
+      if (exc.length() > 0) {
+        int i = Integer.parseInt(exc,16);
+        set.set(i);
+      }
+    }
+    return set;
+  }
+  
+  private static Scanner read(String f) {
+    ClassLoader cl = Thread.currentThread().getContextClassLoader();
+    InputStream in = cl.getResourceAsStream(f);
+    if (in == null) {
+      try {
+        in = new FileInputStream(f);
+      } catch (Exception e) {}
+    }
+    if (in == null) {
+      try {
+        URL url = new URL(f);
+        in = url.openStream();
+      } catch (Exception e) {}
+    }
+    return in != null ? new Scanner(in) : null;
+  }
+  
+}

Modified: incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Normalizer.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Normalizer.java?rev=591706&r1=591705&r2=591706&view=diff
==============================================================================
--- incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Normalizer.java
(original)
+++ incubator/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/unicode/Normalizer.java
Sat Nov  3 15:52:43 2007
@@ -21,8 +21,6 @@
 
 import org.apache.abdera.i18n.io.CharUtils;
 import org.apache.abdera.i18n.io.CodepointIterator;
-import org.apache.abdera.i18n.unicode.UnicodeCharacterDatabase;
-
 
 /**
  * Performs Unicode Normalization (Form D,C,KD and KC)
@@ -89,16 +87,14 @@
     Form form, 
     StringBuffer buf) 
       throws IOException {
-      UnicodeCharacterDatabase ucd = UnicodeCharacterDatabase.getInstance();
-      if (source.length() != 0 && ucd != null) {
-        decompose(ucd, source, form, buf);
-        compose(ucd, form, buf);
+      if (source.length() != 0) {
+        decompose(source, form, buf);
+        compose(form, buf);
       }
       return buf;
   }
   
   private static void decompose(
-    UnicodeCharacterDatabase ucd,
     String source, 
     Form form, 
     StringBuffer buf) 
@@ -109,34 +105,32 @@
       while (ci.hasNext()) {
         int c = ci.next();
         internal.setLength(0);
-        ucd.decompose(c, canonical, internal);
+        UnicodeCharacterDatabase.decompose(c, canonical, internal);
         CodepointIterator ii = CodepointIterator.forCharSequence(internal);
         while(ii.hasNext()) {
           int ch = ii.next();
-          int i = findInsertionPoint(ucd, buf, ch);
+          int i = findInsertionPoint(buf, ch);
           buf.insert(i,CharUtils.toString(ch));
         }
       }
     
   }
   
-  private static int findInsertionPoint(
-    UnicodeCharacterDatabase ucd, 
+  private static int findInsertionPoint( 
     StringBuffer buf, int c) {
-    int cc = ucd.getCanonicalClass(c);
+    int cc = UnicodeCharacterDatabase.getCanonicalClass(c);
     int i = buf.length();
     if (cc != 0) {
       int ch;
       for (; i > 0; i -= CharUtils.size(c)) {
         ch = CharUtils.charAt(buf, i-1);
-        if (ucd.getCanonicalClass(ch) <= cc) break;
+        if (UnicodeCharacterDatabase.getCanonicalClass(ch) <= cc) break;
       }
     }
     return i;
   }
   
   private static void compose(
-    UnicodeCharacterDatabase ucd,
     Form form, 
     StringBuffer buf) 
       throws IOException {
@@ -144,14 +138,14 @@
     int pos = 0;
     int lc = CharUtils.charAt(buf, pos);
     int cpos = CharUtils.size(lc);    
-    int lcc = ucd.getCanonicalClass(lc);
+    int lcc = UnicodeCharacterDatabase.getCanonicalClass(lc);
     if (lcc != 0) lcc = 256;
     int len = buf.length();
     int c;
     for (int dpos = cpos; dpos < buf.length(); dpos += CharUtils.size(c)) {
       c = CharUtils.charAt(buf,dpos);
-      int cc = ucd.getCanonicalClass(c);
-      int composite = ucd.getPairComposition(lc, c);
+      int cc = UnicodeCharacterDatabase.getCanonicalClass(c);
+      int composite = UnicodeCharacterDatabase.getPairComposition(lc, c);
       if (composite != '\uFFFF' && (lcc < cc || lcc == 0)) {
         CharUtils.setChar(buf, pos, composite);
         lc = composite;
@@ -172,9 +166,4 @@
     buf.setLength(cpos);
   }
   
-  public static void main(String... args) throws Exception {
-    
-    UnicodeCharacterDatabase.main("i18n/src/main/resources/org/apache/abdera/i18n/unicode/data/ucd.res");
-    
-  }
 }



Mime
View raw message