commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject svn commit: r995859 [30/30] - in /commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan: ./ color/ common/ common/byteSources/ common/mylzw/ formats/bmp/ formats/bmp/pixelparsers/ formats/bmp/writers/ formats/gif/ formats/ico/ formats/jpeg/ f...
Date Fri, 10 Sep 2010 16:33:42 GMT
Modified: commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java
URL: http://svn.apache.org/viewvc/commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java?rev=995859&r1=995858&r2=995859&view=diff
==============================================================================
--- commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java (original)
+++ commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java Fri Sep 10 16:33:35 2010
@@ -23,442 +23,442 @@ import org.apache.sanselan.common.Binary
 
 public abstract class UnicodeUtils implements BinaryConstants
 {
-	/**
-	 * This class should never be instantiated.
-	 */
-	private UnicodeUtils()
-	{
-	}
-	
-	public static class UnicodeException extends Exception
-	{
-		public UnicodeException(String message)
-		{
-			super(message);
-		}
-	}
-
-	// A default single-byte charset.
-	public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
-	public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
-	public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
-	public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
-	public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
-	public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
-	public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;
-
-	// /*
-	// * Guess the character encoding of arbitrary character data in a data
-	// * buffer.
-	// *
-	// * The data may not run to the end of the buffer; it may be terminated.
-	// This
-	// * makes the problem much harder, since the character data may be followed
-	// * by arbitrary data.
-	// */
-	// public static int guessCharacterEncoding(byte bytes[], int index)
-	// {
-	// int length = bytes.length - index;
-	//
-	// if (length < 1)
-	// return CHAR_ENCODING_CODE_AMBIGUOUS;
-	//
-	// if (length >= 2)
-	// {
-	// // look for BOM.
-	//
-	// int c1 = 0xff & bytes[index];
-	// int c2 = 0xff & bytes[index + 1];
-	// if (c1 == 0xFF && c2 == 0xFE)
-	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
-	// else if (c1 == 0xFE && c2 == 0xFF)
-	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
-	// }
-	//
-	// }
-	//
-	// /*
-	// * Guess the character encoding of arbitrary character data in a data
-	// * buffer.
-	// *
-	// * The data fills the entire buffer. If it is terminated, the terminator
-	// * byte(s) will be the last bytes in the buffer.
-	// *
-	// * This makes the problem a bit easier.
-	// */
-	// public static int guessCharacterEncodingSimple(byte bytes[], int index)
-	// throws UnicodeException
-	// {
-	// int length = bytes.length - index;
-	//
-	// if (length < 1)
-	// return CHAR_ENCODING_CODE_AMBIGUOUS;
-	//
-	// if (length >= 2)
-	// {
-	// // identify or eliminate UTF-16 with a BOM.
-	//
-	// int c1 = 0xff & bytes[index];
-	// int c2 = 0xff & bytes[index + 1];
-	// if (c1 == 0xFF && c2 == 0xFE)
-	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
-	// else if (c1 == 0xFE && c2 == 0xFF)
-	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
-	// }
-	//
-	// if (length >= 2)
-	// {
-	// // look for optional double-byte terminator.
-	//
-	// int c1 = 0xff & bytes[bytes.length - 2];
-	// int c2 = 0xff & bytes[bytes.length - 1];
-	// if (c1 == 0 && c2 == 0)
-	// {
-	// // definitely a flavor of UTF-16.
-	// if (length % 2 != 0)
-	// throw new UnicodeException(
-	// "Character data with double-byte terminator has an odd length.");
-	//
-	// boolean mayHaveTerminator = true;
-	// boolean mustHaveTerminator = false;
-	// boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
-	// BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
-	// mayHaveTerminator, mustHaveTerminator);
-	// boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
-	// BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
-	// mayHaveTerminator, mustHaveTerminator);
-	// if ((!possibleBigEndian) && (!possibleLittleEndian))
-	// throw new UnicodeException(
-	// "Invalid character data, possibly UTF-16.");
-	// if (possibleBigEndian && possibleLittleEndian)
-	// return CHAR_ENCODING_CODE_AMBIGUOUS;
-	// if (possibleBigEndian)
-	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
-	// if (possibleLittleEndian)
-	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
-	// }
-	// }
-	//
-	// List possibleEncodings = new ArrayList();
-	// if (length % 2 == 0)
-	// {
-	// boolean mayHaveTerminator = true;
-	// boolean mustHaveTerminator = false;
-	// boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
-	// BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
-	// mayHaveTerminator, mustHaveTerminator);
-	// boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
-	// BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
-	// mayHaveTerminator, mustHaveTerminator);
-	//
-	// if (possibleBigEndian)
-	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
-	// if (possibleLittleEndian)
-	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
-	// }
-	//
-	// }
-
-	public static final boolean isValidISO_8859_1(String s)
-	{
-		try
-		{
-			String roundtrip = new String(s.getBytes("ISO-8859-1"),
-					"ISO-8859-1");
-			return s.equals(roundtrip);
-		} catch (UnsupportedEncodingException e)
-		{
-			// should never be thrown.
-			throw new RuntimeException("Error parsing string.", e);
-		}
-	}
-
-	/*
-	 * Return the index of the first utf-16 terminator (ie. two even-aligned
-	 * nulls). If not found, return -1.
-	 */
-	private static int findFirstDoubleByteTerminator(byte bytes[], int index)
-	{
-		for (int i = index; i < bytes.length - 1; i += 2)
-		{
-			int c1 = 0xff & bytes[index];
-			int c2 = 0xff & bytes[index + 1];
-			if (c1 == 0 && c2 == 0)
-				return i;
-		}
-		return -1;
-	}
-
-	public final int findEndWithTerminator(byte bytes[], int index)
-			throws UnicodeException
-	{
-		return findEnd(bytes, index, true);
-	}
-
-	public final int findEndWithoutTerminator(byte bytes[], int index)
-			throws UnicodeException
-	{
-		return findEnd(bytes, index, false);
-	}
-
-	protected abstract int findEnd(byte bytes[], int index,
-			boolean includeTerminator) throws UnicodeException;
-
-	public static UnicodeUtils getInstance(int charEncodingCode)
-			throws UnicodeException
-	{
-		switch (charEncodingCode)
-		{
-		case CHAR_ENCODING_CODE_ISO_8859_1:
-			return new UnicodeMetricsASCII();
-		case CHAR_ENCODING_CODE_UTF_8:
-			// Debug.debug("CHAR_ENCODING_CODE_UTF_8");
-			return new UnicodeMetricsUTF8();
-		case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
-		case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
-			// Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM");
-			return new UnicodeMetricsUTF16WithBOM();
-		case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
-			return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
-		case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
-			return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
-		default:
-			throw new UnicodeException("Unknown char encoding code: "
-					+ charEncodingCode);
-		}
-	}
-
-	private static class UnicodeMetricsASCII extends UnicodeUtils
-	{
-		public int findEnd(byte bytes[], int index, boolean includeTerminator)
-				throws UnicodeException
-		{
-			for (int i = index; i < bytes.length; i++)
-			{
-				if (bytes[i] == 0)
-					return includeTerminator ? i + 1 : i;
-			}
-			return bytes.length;
-			// throw new UnicodeException("Terminator not found.");
-		}
-	}
-
-	// private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
-	// {
-	// public int findEnd(byte bytes[], int index, boolean includeTerminator)
-	// throws UnicodeException
-	// {
-	// for (int i = index; i < bytes.length; i++)
-	// {
-	// if (bytes[i] == 0)
-	// return includeTerminator ? i + 1 : i;
-	// }
-	// return bytes.length;
-	// // throw new UnicodeException("Terminator not found.");
-	// }
-	// }
-
-	private static class UnicodeMetricsUTF8 extends UnicodeUtils
-	{
-
-		public int findEnd(byte bytes[], int index, boolean includeTerminator)
-				throws UnicodeException
-		{
-			// http://en.wikipedia.org/wiki/UTF-8
-
-			while (true)
-			{
-				if (index == bytes.length)
-					return bytes.length;
-				if (index > bytes.length)
-					throw new UnicodeException("Terminator not found.");
-
-				int c1 = 0xff & bytes[index++];
-				if (c1 == 0)
-					return includeTerminator ? index : index - 1;
-				else if (c1 <= 0x7f)
-					continue;
-				else if (c1 <= 0xDF)
-				{
-					if (index >= bytes.length)
-						throw new UnicodeException("Invalid unicode.");
-
-					int c2 = 0xff & bytes[index++];
-					if (c2 < 0x80 || c2 > 0xBF)
-						throw new UnicodeException("Invalid code point.");
-				} else if (c1 <= 0xEF)
-				{
-					if (index >= bytes.length - 1)
-						throw new UnicodeException("Invalid unicode.");
-
-					int c2 = 0xff & bytes[index++];
-					if (c2 < 0x80 || c2 > 0xBF)
-						throw new UnicodeException("Invalid code point.");
-					int c3 = 0xff & bytes[index++];
-					if (c3 < 0x80 || c3 > 0xBF)
-						throw new UnicodeException("Invalid code point.");
-				} else if (c1 <= 0xF4)
-				{
-					if (index >= bytes.length - 2)
-						throw new UnicodeException("Invalid unicode.");
-
-					int c2 = 0xff & bytes[index++];
-					if (c2 < 0x80 || c2 > 0xBF)
-						throw new UnicodeException("Invalid code point.");
-					int c3 = 0xff & bytes[index++];
-					if (c3 < 0x80 || c3 > 0xBF)
-						throw new UnicodeException("Invalid code point.");
-					int c4 = 0xff & bytes[index++];
-					if (c4 < 0x80 || c4 > 0xBF)
-						throw new UnicodeException("Invalid code point.");
-				} else
-					throw new UnicodeException("Invalid code point.");
-			}
-		}
-	}
-
-	private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
-	{
-		protected static final int BYTE_ORDER_BIG_ENDIAN = 0;
-		protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1;
-		protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;
-
-		public UnicodeMetricsUTF16(int byteOrder)
-		{
-			this.byteOrder = byteOrder;
-		}
-
-		public boolean isValid(byte bytes[], int index,
-				boolean mayHaveTerminator, boolean mustHaveTerminator)
-				throws UnicodeException
-		{
-			// http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
-			while (true)
-			{
-				if (index == bytes.length)
-				{
-					// end of buffer, no terminator found.
-					return !mustHaveTerminator;
-				}
-
-				if (index >= bytes.length - 1)
-				{
-					// end of odd-length buffer, no terminator found.
-					return false;
-				}
-
-				int c1 = 0xff & bytes[index++];
-				int c2 = 0xff & bytes[index++];
-				int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
-
-				if (c1 == 0 && c2 == 0)
-				{
-					// terminator found.
-					return mayHaveTerminator;
-				}
-
-				if (msb1 >= 0xD8)
-				{
-					// Surrogate pair found.
-
-					if (msb1 >= 0xDC)
-					{
-						// invalid first surrogate.
-						return false;
-					}
-
-					if (index >= bytes.length - 1)
-					{
-						// missing second surrogate.
-						return false;
-					}
-
-					// second word.
-					int c3 = 0xff & bytes[index++];
-					int c4 = 0xff & bytes[index++];
-					int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
-					if (msb2 < 0xDC)
-					{
-						// invalid second surrogate.
-						return false;
-					}
-				}
-			}
-		}
-
-		public int findEnd(byte bytes[], int index, boolean includeTerminator)
-				throws UnicodeException
-		{
-			// http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
-			while (true)
-			{
-				if (index == bytes.length)
-					return bytes.length;
-				if (index > bytes.length - 1)
-					throw new UnicodeException("Terminator not found.");
-
-				int c1 = 0xff & bytes[index++];
-				int c2 = 0xff & bytes[index++];
-				int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
-
-				if (c1 == 0 && c2 == 0)
-				{
-					return includeTerminator ? index : index - 2;
-				} else if (msb1 >= 0xD8)
-				{
-					if (index > bytes.length - 1)
-						throw new UnicodeException("Terminator not found.");
-
-					// second word.
-					int c3 = 0xff & bytes[index++];
-					int c4 = 0xff & bytes[index++];
-					int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
-					if (msb2 < 0xDC)
-						throw new UnicodeException("Invalid code point.");
-				}
-			}
-		}
-	}
-
-	private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
-	{
-
-		public UnicodeMetricsUTF16NoBOM(final int byteOrder)
-		{
-			super(byteOrder);
-		}
-
-	}
-
-	private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
-	{
-
-		public UnicodeMetricsUTF16WithBOM()
-		{
-			super(BYTE_ORDER_BIG_ENDIAN);
-		}
-
-		public int findEnd(byte bytes[], int index, boolean includeTerminator)
-				throws UnicodeException
-		{
-			// http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
-			if (index >= bytes.length - 1)
-				throw new UnicodeException("Missing BOM.");
-
-			int c1 = 0xff & bytes[index++];
-			int c2 = 0xff & bytes[index++];
-			if (c1 == 0xFF && c2 == 0xFE)
-				byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
-			else if (c1 == 0xFE && c2 == 0xFF)
-				byteOrder = BYTE_ORDER_BIG_ENDIAN;
-			else
-				throw new UnicodeException("Invalid byte order mark.");
-
-			return super.findEnd(bytes, index, includeTerminator);
-		}
-	}
+    /**
+     * This class should never be instantiated.
+     */
+    private UnicodeUtils()
+    {
+    }
+
+    public static class UnicodeException extends Exception
+    {
+        public UnicodeException(String message)
+        {
+            super(message);
+        }
+    }
+
+    // A default single-byte charset.
+    public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
+    public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
+    public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
+    public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
+    public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
+    public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
+    public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;
+
+    // /*
+    // * Guess the character encoding of arbitrary character data in a data
+    // * buffer.
+    // *
+    // * The data may not run to the end of the buffer; it may be terminated.
+    // This
+    // * makes the problem much harder, since the character data may be followed
+    // * by arbitrary data.
+    // */
+    // public static int guessCharacterEncoding(byte bytes[], int index)
+    // {
+    // int length = bytes.length - index;
+    //
+    // if (length < 1)
+    // return CHAR_ENCODING_CODE_AMBIGUOUS;
+    //
+    // if (length >= 2)
+    // {
+    // // look for BOM.
+    //
+    // int c1 = 0xff & bytes[index];
+    // int c2 = 0xff & bytes[index + 1];
+    // if (c1 == 0xFF && c2 == 0xFE)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
+    // else if (c1 == 0xFE && c2 == 0xFF)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
+    // }
+    //
+    // }
+    //
+    // /*
+    // * Guess the character encoding of arbitrary character data in a data
+    // * buffer.
+    // *
+    // * The data fills the entire buffer. If it is terminated, the terminator
+    // * byte(s) will be the last bytes in the buffer.
+    // *
+    // * This makes the problem a bit easier.
+    // */
+    // public static int guessCharacterEncodingSimple(byte bytes[], int index)
+    // throws UnicodeException
+    // {
+    // int length = bytes.length - index;
+    //
+    // if (length < 1)
+    // return CHAR_ENCODING_CODE_AMBIGUOUS;
+    //
+    // if (length >= 2)
+    // {
+    // // identify or eliminate UTF-16 with a BOM.
+    //
+    // int c1 = 0xff & bytes[index];
+    // int c2 = 0xff & bytes[index + 1];
+    // if (c1 == 0xFF && c2 == 0xFE)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
+    // else if (c1 == 0xFE && c2 == 0xFF)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
+    // }
+    //
+    // if (length >= 2)
+    // {
+    // // look for optional double-byte terminator.
+    //
+    // int c1 = 0xff & bytes[bytes.length - 2];
+    // int c2 = 0xff & bytes[bytes.length - 1];
+    // if (c1 == 0 && c2 == 0)
+    // {
+    // // definitely a flavor of UTF-16.
+    // if (length % 2 != 0)
+    // throw new UnicodeException(
+    // "Character data with double-byte terminator has an odd length.");
+    //
+    // boolean mayHaveTerminator = true;
+    // boolean mustHaveTerminator = false;
+    // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    // if ((!possibleBigEndian) && (!possibleLittleEndian))
+    // throw new UnicodeException(
+    // "Invalid character data, possibly UTF-16.");
+    // if (possibleBigEndian && possibleLittleEndian)
+    // return CHAR_ENCODING_CODE_AMBIGUOUS;
+    // if (possibleBigEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
+    // if (possibleLittleEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
+    // }
+    // }
+    //
+    // List possibleEncodings = new ArrayList();
+    // if (length % 2 == 0)
+    // {
+    // boolean mayHaveTerminator = true;
+    // boolean mustHaveTerminator = false;
+    // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
+    // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
+    // mayHaveTerminator, mustHaveTerminator);
+    //
+    // if (possibleBigEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
+    // if (possibleLittleEndian)
+    // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
+    // }
+    //
+    // }
+
+    public static final boolean isValidISO_8859_1(String s)
+    {
+        try
+        {
+            String roundtrip = new String(s.getBytes("ISO-8859-1"),
+                    "ISO-8859-1");
+            return s.equals(roundtrip);
+        } catch (UnsupportedEncodingException e)
+        {
+            // should never be thrown.
+            throw new RuntimeException("Error parsing string.", e);
+        }
+    }
+
+    /*
+     * Return the index of the first utf-16 terminator (ie. two even-aligned
+     * nulls). If not found, return -1.
+     */
+    private static int findFirstDoubleByteTerminator(byte bytes[], int index)
+    {
+        for (int i = index; i < bytes.length - 1; i += 2)
+        {
+            int c1 = 0xff & bytes[index];
+            int c2 = 0xff & bytes[index + 1];
+            if (c1 == 0 && c2 == 0)
+                return i;
+        }
+        return -1;
+    }
+
+    public final int findEndWithTerminator(byte bytes[], int index)
+            throws UnicodeException
+    {
+        return findEnd(bytes, index, true);
+    }
+
+    public final int findEndWithoutTerminator(byte bytes[], int index)
+            throws UnicodeException
+    {
+        return findEnd(bytes, index, false);
+    }
+
+    protected abstract int findEnd(byte bytes[], int index,
+            boolean includeTerminator) throws UnicodeException;
+
+    public static UnicodeUtils getInstance(int charEncodingCode)
+            throws UnicodeException
+    {
+        switch (charEncodingCode)
+        {
+        case CHAR_ENCODING_CODE_ISO_8859_1:
+            return new UnicodeMetricsASCII();
+        case CHAR_ENCODING_CODE_UTF_8:
+            // Debug.debug("CHAR_ENCODING_CODE_UTF_8");
+            return new UnicodeMetricsUTF8();
+        case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
+        case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
+            // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM");
+            return new UnicodeMetricsUTF16WithBOM();
+        case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
+            return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
+        case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
+            return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
+        default:
+            throw new UnicodeException("Unknown char encoding code: "
+                    + charEncodingCode);
+        }
+    }
+
+    private static class UnicodeMetricsASCII extends UnicodeUtils
+    {
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            for (int i = index; i < bytes.length; i++)
+            {
+                if (bytes[i] == 0)
+                    return includeTerminator ? i + 1 : i;
+            }
+            return bytes.length;
+            // throw new UnicodeException("Terminator not found.");
+        }
+    }
+
+    // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
+    // {
+    // public int findEnd(byte bytes[], int index, boolean includeTerminator)
+    // throws UnicodeException
+    // {
+    // for (int i = index; i < bytes.length; i++)
+    // {
+    // if (bytes[i] == 0)
+    // return includeTerminator ? i + 1 : i;
+    // }
+    // return bytes.length;
+    // // throw new UnicodeException("Terminator not found.");
+    // }
+    // }
+
+    private static class UnicodeMetricsUTF8 extends UnicodeUtils
+    {
+
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-8
+
+            while (true)
+            {
+                if (index == bytes.length)
+                    return bytes.length;
+                if (index > bytes.length)
+                    throw new UnicodeException("Terminator not found.");
+
+                int c1 = 0xff & bytes[index++];
+                if (c1 == 0)
+                    return includeTerminator ? index : index - 1;
+                else if (c1 <= 0x7f)
+                    continue;
+                else if (c1 <= 0xDF)
+                {
+                    if (index >= bytes.length)
+                        throw new UnicodeException("Invalid unicode.");
+
+                    int c2 = 0xff & bytes[index++];
+                    if (c2 < 0x80 || c2 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                } else if (c1 <= 0xEF)
+                {
+                    if (index >= bytes.length - 1)
+                        throw new UnicodeException("Invalid unicode.");
+
+                    int c2 = 0xff & bytes[index++];
+                    if (c2 < 0x80 || c2 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                    int c3 = 0xff & bytes[index++];
+                    if (c3 < 0x80 || c3 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                } else if (c1 <= 0xF4)
+                {
+                    if (index >= bytes.length - 2)
+                        throw new UnicodeException("Invalid unicode.");
+
+                    int c2 = 0xff & bytes[index++];
+                    if (c2 < 0x80 || c2 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                    int c3 = 0xff & bytes[index++];
+                    if (c3 < 0x80 || c3 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                    int c4 = 0xff & bytes[index++];
+                    if (c4 < 0x80 || c4 > 0xBF)
+                        throw new UnicodeException("Invalid code point.");
+                } else
+                    throw new UnicodeException("Invalid code point.");
+            }
+        }
+    }
+
+    private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
+    {
+        protected static final int BYTE_ORDER_BIG_ENDIAN = 0;
+        protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1;
+        protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;
+
+        public UnicodeMetricsUTF16(int byteOrder)
+        {
+            this.byteOrder = byteOrder;
+        }
+
+        public boolean isValid(byte bytes[], int index,
+                boolean mayHaveTerminator, boolean mustHaveTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+            while (true)
+            {
+                if (index == bytes.length)
+                {
+                    // end of buffer, no terminator found.
+                    return !mustHaveTerminator;
+                }
+
+                if (index >= bytes.length - 1)
+                {
+                    // end of odd-length buffer, no terminator found.
+                    return false;
+                }
+
+                int c1 = 0xff & bytes[index++];
+                int c2 = 0xff & bytes[index++];
+                int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
+
+                if (c1 == 0 && c2 == 0)
+                {
+                    // terminator found.
+                    return mayHaveTerminator;
+                }
+
+                if (msb1 >= 0xD8)
+                {
+                    // Surrogate pair found.
+
+                    if (msb1 >= 0xDC)
+                    {
+                        // invalid first surrogate.
+                        return false;
+                    }
+
+                    if (index >= bytes.length - 1)
+                    {
+                        // missing second surrogate.
+                        return false;
+                    }
+
+                    // second word.
+                    int c3 = 0xff & bytes[index++];
+                    int c4 = 0xff & bytes[index++];
+                    int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
+                    if (msb2 < 0xDC)
+                    {
+                        // invalid second surrogate.
+                        return false;
+                    }
+                }
+            }
+        }
+
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+            while (true)
+            {
+                if (index == bytes.length)
+                    return bytes.length;
+                if (index > bytes.length - 1)
+                    throw new UnicodeException("Terminator not found.");
+
+                int c1 = 0xff & bytes[index++];
+                int c2 = 0xff & bytes[index++];
+                int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
+
+                if (c1 == 0 && c2 == 0)
+                {
+                    return includeTerminator ? index : index - 2;
+                } else if (msb1 >= 0xD8)
+                {
+                    if (index > bytes.length - 1)
+                        throw new UnicodeException("Terminator not found.");
+
+                    // second word.
+                    int c3 = 0xff & bytes[index++];
+                    int c4 = 0xff & bytes[index++];
+                    int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
+                    if (msb2 < 0xDC)
+                        throw new UnicodeException("Invalid code point.");
+                }
+            }
+        }
+    }
+
+    private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
+    {
+
+        public UnicodeMetricsUTF16NoBOM(final int byteOrder)
+        {
+            super(byteOrder);
+        }
+
+    }
+
+    private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
+    {
+
+        public UnicodeMetricsUTF16WithBOM()
+        {
+            super(BYTE_ORDER_BIG_ENDIAN);
+        }
+
+        public int findEnd(byte bytes[], int index, boolean includeTerminator)
+                throws UnicodeException
+        {
+            // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+            if (index >= bytes.length - 1)
+                throw new UnicodeException("Missing BOM.");
+
+            int c1 = 0xff & bytes[index++];
+            int c2 = 0xff & bytes[index++];
+            if (c1 == 0xFF && c2 == 0xFE)
+                byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
+            else if (c1 == 0xFE && c2 == 0xFF)
+                byteOrder = BYTE_ORDER_BIG_ENDIAN;
+            else
+                throw new UnicodeException("Invalid byte order mark.");
+
+            return super.findEnd(bytes, index, includeTerminator);
+        }
+    }
 
 }



Mime
View raw message