commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mt...@apache.org
Subject svn commit: r820998 - /commons/sandbox/runtime/trunk/src/main/native/shared/string.c
Date Fri, 02 Oct 2009 12:59:52 GMT
Author: mturk
Date: Fri Oct  2 12:59:51 2009
New Revision: 820998

URL: http://svn.apache.org/viewvc?rev=820998&view=rev
Log:
Add utf8/ucs2 conversion - Needs test cases

Modified:
    commons/sandbox/runtime/trunk/src/main/native/shared/string.c

Modified: commons/sandbox/runtime/trunk/src/main/native/shared/string.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/shared/string.c?rev=820998&r1=820997&r2=820998&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/shared/string.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/shared/string.c Fri Oct  2 12:59:51 2009
@@ -358,6 +358,164 @@
     return ACR_SUCCESS;
 }
 
+/* Compute the number of bytes needed to hold the modified UTF-8
+ * encoding of 'inwords' UCS-2 code units read from 'in', plus one
+ * byte for the terminating '\0'.
+ *
+ * U+0000 is counted as two bytes (0xC0 0x80), code units below 0x80
+ * as one byte, below 0x0800 as two bytes and everything else as
+ * three bytes.
+ */
+static jsize java_ucs2_to_utf8_len(const jchar *in, jsize inwords)
+{
+    jsize need = 1;
+    int ch;
+
+    while (inwords) {
+        ch = (unsigned short)(*in++);
+        if (ch == 0)
+            need += 2;
+        else if (ch < 0x80)
+            need += 1;
+        else if (ch < 0x0800)
+            need += 2;
+        else
+            need += 3;
+        --inwords;
+    }
+    return need;
+}
+
+/* Encode 'inwords' UCS-2 code units from 'in' as modified UTF-8.
+ *
+ * in        source code units
+ * inwords   number of code units to convert
+ * out       destination buffer
+ * outbytes  on entry the size of 'out' in bytes; decremented by the
+ *           number of bytes written, including the terminator
+ *
+ * U+0000 is written as the two byte sequence 0xC0 0x80 so that the
+ * output never contains an embedded '\0'.
+ *
+ * Returns ACR_INCOMPLETE if a multi byte sequence does not fit in
+ * the remaining space. If the space is used up exactly on a
+ * character boundary the loop simply stops and ACR_SUCCESS is
+ * returned; the terminating '\0' is written only when at least one
+ * byte is still available.
+ */
+static int java_ucs2_to_utf8(const jchar *in, jsize inwords,
+                             char *out, jsize *outbytes)
+{
+    int ch;
+
+    while (inwords && *outbytes) {
+        ch = (unsigned short)(*in++);
+        if (ch == 0) {
+            /* Modified UTF-8 encoding of the null character
+             */
+            if (*outbytes < 2)
+                return ACR_INCOMPLETE;
+            *outbytes -= 2;
+            *(out++) = (unsigned char)0xC0;
+            *(out++) = (unsigned char)0x80;
+        }
+        else if (ch < 0x80) {
+            /* Single byte; the loop condition guarantees room
+             */
+            --*outbytes;
+            *(out++) = (unsigned char)ch;
+        }
+        else if (ch < 0x0800) {
+            /* Two byte sequence: 110xxxxx 10xxxxxx
+             */
+            if (*outbytes < 2)
+                return ACR_INCOMPLETE;
+            *outbytes -= 2;
+            *(out++) = (unsigned char)(0xC0 | (ch >> 6));
+            *(out++) = (unsigned char)(0x80 | (ch & 0x3F));
+        }
+        else {
+            /* Three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+             */
+            if (*outbytes < 3)
+                return ACR_INCOMPLETE;
+            *outbytes -= 3;
+            *(out++) = (unsigned char)(0xE0 | (ch >> 12));
+            *(out++) = (unsigned char)(0x80 | ((ch >> 6) & 0x3F));
+            *(out++) = (unsigned char)(0x80 | (ch & 0x3F));
+        }
+        --inwords;
+    }
+    if (*outbytes) {
+        *out = '\0';
+        --*outbytes;
+    }
+    return ACR_SUCCESS;
+}
+
+/* Decode up to 'inbytes' bytes of UTF-8 from 'in' into UCS-2 code
+ * units stored in 'out'.
+ *
+ * outwords  on entry the capacity of 'out' in code units; decremented
+ *           once for every code unit stored
+ *
+ * Only one, two and three byte sequences are accepted. Any other
+ * lead byte, or a trailing byte that is not of the form 10xxxxxx,
+ * yields ACR_EILSEQ. A multi byte sequence that extends past
+ * 'inbytes' yields ACR_INCOMPLETE. Decoding stops with ACR_SUCCESS
+ * when either the input or the output capacity is used up.
+ */
+static int java_utf8_to_ucs2(const char *in, jsize inbytes,
+                             jchar *out, jsize *outwords)
+{
+    while (inbytes && *outwords) {
+        int code = (unsigned char)(*in++);
+        int trail;
+
+        if ((code & 0x80) == 0) {
+            /* 0xxxxxxx - plain 7-bit character, nothing follows */
+            trail = 0;
+        }
+        else if ((code & 0xE0) == 0xC0) {
+            /* 110xxxxx - one trailing byte */
+            trail = 1;
+            code &= 0x1F;
+        }
+        else if ((code & 0xF0) == 0xE0) {
+            /* 1110xxxx - two trailing bytes */
+            trail = 2;
+            code &= 0x0F;
+        }
+        else {
+            return ACR_EILSEQ;
+        }
+        /* The length check applies to multi byte sequences only */
+        if (trail && inbytes < trail + 1)
+            return ACR_INCOMPLETE;
+        inbytes -= trail + 1;
+        for (; trail > 0; trail--) {
+            if ((*in & 0xC0) != 0x80)
+                return ACR_EILSEQ;
+            code = (code << 6) | (unsigned char)(*(in++) & 0x3F);
+        }
+        *(out++) = (jchar)code;
+        --*outwords;
+    }
+    return ACR_SUCCESS;
+}
+
+/* Convert the '\0' terminated UTF-8 string 'str' to a newly
+ * allocated array of UCS-2 code units.
+ *
+ * The terminator is part of the converted input, and the buffer is
+ * sized at one code unit per input byte. Returns NULL if the
+ * allocation fails; if the conversion fails the buffer is released,
+ * the status is passed to ACR_SET_OS_ERROR and NULL is returned.
+ * NOTE(review): presumably the caller releases the result with
+ * x_free - confirm.
+ */
+ACR_DECLARE(jchar *) ACR_Utf8ToUcs2(JNIEnv *_E, const char *str)
+{
+    jsize  nbytes = (jsize)strlen(str) + 1;
+    jsize  nwords = nbytes;
+    jchar *ucs;
+    int    rv;
+
+    ucs = ACR_MALLOC(jchar, nbytes);
+    if (ucs == NULL)
+        return NULL;
+    rv = java_utf8_to_ucs2(str, nbytes, ucs, &nwords);
+    if (rv == 0)
+        return ucs;
+    /* Invalid or truncated UTF-8 input */
+    x_free(ucs);
+    ACR_SET_OS_ERROR(rv);
+    return NULL;
+}
+
+/* Convert 'len' UCS-2 code units from 'str' to a newly allocated
+ * UTF-8 string.
+ *
+ * The buffer size comes from java_ucs2_to_utf8_len(). Returns NULL
+ * if the allocation fails; if the conversion fails the buffer is
+ * released, the status is passed to ACR_SET_OS_ERROR and NULL is
+ * returned.
+ * NOTE(review): presumably the caller releases the result with
+ * x_free - confirm.
+ */
+ACR_DECLARE(char *) ACR_Usc2ToUtf8(JNIEnv *_E, const jchar *str, jsize len)
+{
+    jsize nbytes = java_ucs2_to_utf8_len(str, len);
+    char *utf;
+    int   rv;
+
+    utf = ACR_MALLOC(char, nbytes);
+    if (utf == NULL)
+        return NULL;
+    rv = java_ucs2_to_utf8(str, len, utf, &nbytes);
+    if (rv == 0)
+        return utf;
+    /* Conversion did not complete */
+    x_free(utf);
+    ACR_SET_OS_ERROR(rv);
+    return NULL;
+}
+
 static char *get_string_utf_8(JNIEnv *_E, jstring str, char *b)
 {
     jsize sl, nl;



Mime
View raw message