Return-Path: Delivered-To: apmail-ws-axis-dev-archive@www.apache.org Received: (qmail 36277 invoked from network); 20 Jul 2004 05:17:56 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur-2.apache.org with SMTP; 20 Jul 2004 05:17:56 -0000 Received: (qmail 50875 invoked by uid 500); 20 Jul 2004 05:17:52 -0000 Delivered-To: apmail-ws-axis-dev-archive@ws.apache.org Received: (qmail 50736 invoked by uid 500); 20 Jul 2004 05:17:51 -0000 Mailing-List: contact axis-cvs-help@ws.apache.org; run by ezmlm Precedence: bulk list-help: list-unsubscribe: list-post: Delivered-To: mailing list axis-cvs@ws.apache.org Received: (qmail 50722 invoked by uid 99); 20 Jul 2004 05:17:51 -0000 X-ASF-Spam-Status: No, hits=0.5 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from [209.237.227.194] (HELO minotaur.apache.org) (209.237.227.194) by apache.org (qpsmtpd/0.27.1) with SMTP; Mon, 19 Jul 2004 22:17:49 -0700 Received: (qmail 36199 invoked by uid 1683); 20 Jul 2004 05:17:48 -0000 Date: 20 Jul 2004 05:17:48 -0000 Message-ID: <20040720051748.36198.qmail@minotaur.apache.org> From: damitha@apache.org To: ws-axis-cvs@apache.org Subject: cvs commit: ws-axis/c/src/xml/txpp/lib spp.c spp.h spp_converter.c spp_converter.h spp_tokenizer.c spp_tokenizer.h Makefile.am xmltok.c xmltok.h xmltok_impl.c xmltok_impl.h xmltok_ns.c xpp.c xpp.h X-Virus-Checked: Checked X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N damitha 2004/07/19 22:17:48 Modified: c/src/xml/txpp/lib Makefile.am Added: c/src/xml/txpp/lib spp.c spp.h spp_converter.c spp_converter.h spp_tokenizer.c spp_tokenizer.h Removed: c/src/xml/txpp/lib xmltok.c xmltok.h xmltok_impl.c xmltok_impl.h xmltok_ns.c xpp.c xpp.h Log: Revision Changes Path 1.3 +1 -1 ws-axis/c/src/xml/txpp/lib/Makefile.am Index: Makefile.am =================================================================== RCS file: /home/cvs/ws-axis/c/src/xml/txpp/lib/Makefile.am,v retrieving revision 1.2 
retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Makefile.am 13 Jul 2004 07:38:54 -0000 1.2 +++ Makefile.am 20 Jul 2004 05:17:48 -0000 1.3 @@ -1,5 +1,5 @@ lib_LTLIBRARIES = libtxpp.la AM_CPPFLAGS = -Wall -g -DHAVE_XPP_CONFIG_H -libtxpp_la_SOURCES = xpp.c xmltok.c +libtxpp_la_SOURCES = spp.c spp_converter.c libtxpp_la_LIBADD = INCLUDES = -I./ -I../ 1.1 ws-axis/c/src/xml/txpp/lib/spp.c Index: spp.c =================================================================== /* * Copyright 2003-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "internal.h" #include "spp.h" #include "spp_converter.h" #ifdef HAVE_XPP_CONFIG_H #include "../xpp_config.h" #endif #include "tag.h" #define MALLOC(s) (ct->m_mem.mallocFcn((s))) #define MEMMOVE(p1, p2, s) (ct->m_mem.memMoveFcn((p1), (p2), (s))) #define REALLOC(p,s) (ct->m_mem.reallocFcn((p),(s))) #define FREE(p) (ct->m_mem.freeFcn((p))) #define MIN_BUFF_SZ 128 /* 0x10000 */ #define INIT_BUFFER_SIZE 128 /* keep INIT_BUFFER_SIZE <= MIN_BUFF_SZ */ #define protocolEncodingName (ct->m_protocolEncodingName) #define initEncoding (ct->m_initEncoding) #define encoding (ct->m_encoding) #define ns (ct->m_ns) #define tokState (ct->m_tokState) #define state (ct->m_state) #define numOfChars (ct->m_numOfChars) #define dataCounter (ct->m_dataCounter) #define namespaceSeparator (ct->m_namespaceSeparator) int isDone = 0; enum { PROLOG, CONTENT }; /** Struct which represents the parser object. 
All the member variables of this parser * object are prefixed with m_ to represent them as parser memeber variables through * out the code*/ typedef struct SPPContext { char *m_buff;/* Tokenized ptr data buffer*/ char *m_utf8Buff;/* Utf8 tokenized ptr data buffer*/ int m_buffSize;/* Tokenized ptr data buffer size*/ int m_utf8BuffSize;/* Utf8 8 tokenized ptr data buffer size*/ char *m_prevTokPoint;/* This points to the previous tokenizing point of the buffer. This means that when the buffer to be tokenized is passed to the xmltok_impl.c's tokenizeContent method this pointer keeps pointing to the end of data so far tokenized. Only m_currentTokPoint is increased inside the tokenizeContent method*/ char *m_currentTokPoint;/* This points to the current tokenizing point of the buffer*/ int m_numOfChars;/* Number of characters to be parsed in the buffer*/ int m_state; int m_tokState;/* Tokenizing state. Holds prolog or content*/ TokDataStruct m_data;/* Tokenized ptr data structure*/ int m_dataCounter; const XML_Char *m_protocolEncodingName;/* Encoding name*/ INIT_ENCODING m_initEncoding;/* Encoding structure*/ const ENCODING *m_encoding;/* Encoding structure*/ XML_Bool m_ns; const SppMemoryHandlingSuite m_mem;/* Memory handling suite*/ XML_Char m_namespaceSeparator; int (*getBlock)(char *buff, int buffSize, int *numchars); } SPPContext; static enum SPP_Error processXmlDecl(SPPParser* ct, int isGeneralTextEntity, const char *s, const char *currentTokPoint) { /* const XML_Char *encodingName = NULL; */ /* const XML_Char *storedEncName = NULL; */ const char *version = NULL; const char *versionend; /* const XML_Char *storedversion = NULL; */ int standalone = -1; if (!(XmlParseXmlDecl)(isGeneralTextEntity, /*Default encoding*/ encoding, s, currentTokPoint, 0, &version, &versionend, &protocolEncodingName, /*Encoding is taken from the xml file declaration*/ &encoding, &standalone)) return SPP_ERROR_SYNTAX; else { /* printf("version:%s\n", version); */ /* printf("versionend:%s\n", 
versionend); */ *(protocolEncodingName + (ct->m_currentTokPoint - protocolEncodingName) - 3) = '\0'; /*printf("encodingName:%s\n", protocolEncodingName);*/ /* if(standalone) */ /* printf("standalone:%s\n", standalone); */ initializeEncoding(ct); } } int ret_status; enum SPP_Error parseProlog(SPPParser* ct) { ct->m_data.numOfPtrs = 0; ct->m_data.numOfPtrsUtf8 = 0; ct->m_prevTokPoint = ct->m_currentTokPoint; do { ret_status = 0; if(PROLOG == tokState) { initializeEncoding(ct); /* printf("numOfChars:%d\n", numOfChars); */ /* XmlPrologTok is defined in spp_converter.h */ ret_status = XmlPrologTok(&state, &ct->m_data, encoding, &numOfChars, ct->m_prevTokPoint, &ct->m_currentTokPoint); if(SPP_ERROR_NONE == ret_status) { /* printf("ct->m_prevTokPoint:%s\n", ct->m_prevTokPoint); */ /* printf("ct->m_currentTokPoint:%s\n", ct->m_currentTokPoint); */ processXmlDecl(ct, 0, ct->m_prevTokPoint, ct->m_currentTokPoint); ct->m_prevTokPoint = ct->m_currentTokPoint; } } return SPP_ERROR_NONE; } while (loadBuffer(ct)); return SPP_ERROR_PARSE_FAILED; } enum SPP_Error parseContent(SPPParser* ct) { ct->m_data.numOfPtrs = 0; ct->m_data.numOfPtrsUtf8 = 0; /* Initialially m_preveTokPoint points to the currentTokPoint*/ ct->m_prevTokPoint = ct->m_currentTokPoint; do { ret_status = 0; if(PROLOG == tokState) { initializeEncoding(ct); /* printf("numOfChars:%d\n", numOfChars); */ /* XmlPrologTok is defined in spp_converter.h. While method is executed * prevTokPoint is not changed. 
currentTokPoint moves forward*/ ret_status = XmlPrologTok(&state, &ct->m_data, encoding, &numOfChars, ct->m_prevTokPoint, &ct->m_currentTokPoint); if(SPP_ERROR_NONE == ret_status) { /* printf("ct->m_prevTokPoint:%s\n", ct->m_prevTokPoint); */ /* printf("ct->m_currentTokPoint:%s\n", ct->m_currentTokPoint); */ processXmlDecl(ct, 0, ct->m_prevTokPoint, ct->m_currentTokPoint); ct->m_prevTokPoint = ct->m_currentTokPoint; } } if(SPP_ERROR_NONE == ret_status) { tokState = CONTENT; /* printf("numOfChars:%d\n", numOfChars); */ /* XmlContentTok is defined in spp_converter.h.*/ /* XmlContentTok is defined in spp_converter.h. While method is executed * prevTokPoint is not changed. currentTokPoint moves forward * until a valid tag element is completed*/ ret_status = XmlContentTok(&state, &ct->m_data, encoding, &numOfChars, ct->m_prevTokPoint, &ct->m_currentTokPoint); /* printf("tempStatus:%d\n", tempStatus); */ /* printf("ct->m_prevTokPoint:%s\n", ct->m_prevTokPoint); */ /* printf("ct->m_currentTokPoint:%s\n", ct->m_currentTokPoint); */ if(SPP_ERROR_NONE == ret_status) { return SPP_ERROR_NONE; } } } while (loadBuffer(ct)); return SPP_ERROR_PARSE_FAILED; } int loadBuffer(SPPParser *ct) { if(isDone) return SPP_ERROR_NONE; /* Holds the size from the parser buffer beginning to the * end of used data. Simply said, this is the already * used data to make a tag, remaining in the buffer*/ int usedData; /* Take the size of data that is remained from the previous parse * iteration failing to complete a tag element */ int unusedData = (int)(ct->m_currentTokPoint - ct->m_prevTokPoint); /* We can get rid of used data and use used data size and vacant size * to fill the buffer again*/ int toBeFilledSize = ct->m_buffSize - unusedData; /* We define MIN_BUFFER_SZ as the minimum value the toBeFilledSize * could assume. 
* If the toBeFilledSize is less than the MIN_BUFFER_SZ, * buffer size is doubled for performance reasons */ if (toBeFilledSize < MIN_BUFF_SZ) { int ii; ct->m_buffSize *= 2;/* Double the parser buffer size*/ /* Now since the buffer size is increased the fill * size is also increased*/ toBeFilledSize = ct->m_buffSize - unusedData; /* Move the used data to the beginning of the parser * buffer. Used data will be lost */ MEMMOVE(ct->m_buff, ct->m_prevTokPoint, unusedData); ct->m_buff = REALLOC(ct->m_buff, ct->m_buffSize); usedData = (int)(ct->m_prevTokPoint - ct->m_buff); /*MEMMOVE(ct->m_buff, ct->m_prevTokPoint, unusedData);*/ ct->m_prevTokPoint = ct->m_buff; ct->m_currentTokPoint = ct->m_prevTokPoint + unusedData; ii = 0; /* Move the already tokenized data pointers */ while (ii < ct->m_data.numOfPtrs) ct->m_data.ptrBuff[ii++] -= usedData; } else /* Don't double the buffer size. But get rid of the used data*/ { int ii; usedData = (int)(ct->m_prevTokPoint - ct->m_buff); MEMMOVE(ct->m_buff, ct->m_prevTokPoint, unusedData); ct->m_prevTokPoint = ct->m_buff; ct->m_currentTokPoint = ct->m_prevTokPoint + unusedData; ii = 0; while (ii < ct->m_data.numOfPtrs) ct->m_data.ptrBuff[ii++] -= usedData; } if(!ct->getBlock(ct->m_currentTokPoint, toBeFilledSize, &numOfChars)) { return SPP_ERROR_READ_BLOCK; } else return SPP_ERROR_NONE; } int getBlock(char *buff, int toBeFilledSize, int* numchars) { int len; int done; *numchars = 0; len = fread(buff, 1, toBeFilledSize, stdin); /*printf("len:%d\n", len);*/ *numchars += len; if (ferror(stdin)) { fprintf(stderr, "Read error\n"); exit(-1); } done = feof(stdin); if(done) isDone = 1; return SPP_ERROR_NONE; } SPPParser* parserCreate(const XML_Char *encodingName) { return parserCreate_mh(encodingName, NULL, NULL); } SPPParser* parserCreate_mh(const XML_Char *encodingName, const SppMemoryHandlingSuite *memsuite, XML_Char nsSep) { return parserCreate_in(encodingName, memsuite, nsSep); } /** Create parser*/ static SPPParser* parserCreate_in(const 
XML_Char *encodingName, const SppMemoryHandlingSuite *memsuite, XML_Char nsSep) { SPPParser* ct; /* If memory handling functions are externally provided*/ if (memsuite) { SppMemoryHandlingSuite *mtemp; ct = (SPPParser*) memsuite->mallocFcn(sizeof(struct SPPContext)); if (ct != NULL) { mtemp = (SppMemoryHandlingSuite *)&(ct->m_mem); mtemp->mallocFcn = memsuite->mallocFcn; mtemp->memMoveFcn = memsuite->memMoveFcn; mtemp->reallocFcn = memsuite->reallocFcn; mtemp->freeFcn = memsuite->freeFcn; } } else /* Use system memory handling functions*/ { SppMemoryHandlingSuite *mtemp; ct = (SPPParser*) malloc(sizeof(struct SPPContext)); if (ct != NULL) { mtemp = (SppMemoryHandlingSuite *)&(ct->m_mem); mtemp->mallocFcn = malloc; mtemp->memMoveFcn = memmove; mtemp->reallocFcn = realloc; mtemp->freeFcn = free; } } ct->m_buffSize = INIT_BUFFER_SIZE; ct->m_utf8BuffSize = ct->m_buffSize; char* buff = (char*) malloc(ct->m_buffSize * sizeof(char)); char* utf8Buff = (char*) malloc(ct->m_utf8BuffSize * sizeof(char)); if(buff == NULL || utf8Buff == NULL) return NULL; ct->m_buff = buff; ct->m_utf8Buff = utf8Buff; namespaceSeparator = '!'; ns = XML_FALSE; if(SPP_ERROR_NONE == parserInit(ct, encodingName)) return ct; else return NULL; } /** Initialize parser*/ static int parserInit(SPPParser* ct, const XML_Char *encodingName) { if(ct) { numOfChars = 0; ct->m_currentTokPoint = ct->m_buff; ct->m_prevTokPoint = ct->m_buff; state = S_0; tokState = PROLOG; ct->getBlock = getBlock; ct->m_data.ptrBuff = NULL; ct->m_data.utf8PtrBuff = NULL; ct->m_data.ptrBuffSize = 8; dataCounter = 0; protocolEncodingName = encodingName; return SPP_ERROR_NONE; } else return SPP_ERROR_PARSER_INIT_FAILED; } /** Initialize encoding*/ static int initializeEncoding(SPPParser* ct) { const char *s; #ifdef XML_UNICODE char encodingBuf[128]; if (!protocolEncodingName) s = NULL; else { int i; for (i = 0; protocolEncodingName[i]; i++) { if (i == sizeof(encodingBuf) - 1 || (protocolEncodingName[i] & ~0x7f) != 0) { encodingBuf[0] 
= '\0'; break; } encodingBuf[i] = (char)protocolEncodingName[i]; } encodingBuf[i] = '\0'; s = encodingBuf; } #else /* printf("protocolEncodingName:%s\n", protocolEncodingName); */ s = protocolEncodingName; #endif if (XmlInitEncoding(&initEncoding, &encoding, s)) return SPP_ERROR_NONE; /* return handleUnknownEncoding(parser, protocolEncodingName); */ } void* parserFree(SPPParser* ct) { free(ct); } TokDataStruct* next(SPPParser* ct) { dataCounter = 0; if(SPP_ERROR_NONE == parseContent(ct)) { /*processData(ct, encoding);*/ return &ct->m_data; } else return NULL; } static int addUtf8Ptr(char *ptr, TokDataStruct *data) { if (data->numOfPtrsUtf8 == data->ptrBuffSize || !data->utf8PtrBuff) { int sz = data->ptrBuffSize << 1; char **ptrBuff = (char **)malloc(sz << 2); if (!ptrBuff) return SPP_ERROR_NO_MEMORY; if (data->utf8PtrBuff) { memmove(ptrBuff, data->utf8PtrBuff, data->numOfPtrsUtf8 << 2); free(data->utf8PtrBuff); } data->utf8PtrBuff = ptrBuff; data->ptrBuffSize = sz; } data->utf8PtrBuff[data->numOfPtrsUtf8++] = ptr; return SPP_ERROR_NONE; } static void processData(SPPParser* ct) { int intCount = 0; char holder; int bufSize; int totLen = 0; int wordLen = 0; int tempDiff; XML_Char* toPtr = (XML_Char *) ct->m_utf8Buff; XML_Char* startPtr = (XML_Char *) ct->m_utf8Buff; XML_Char* endPtr = (XML_Char *) ct->m_utf8Buff; while(intCount < ct->m_data.numOfPtrs) { const char *rawNameEnd = &ct->m_data.ptrBuff[intCount + 1][1]; const char *fromPtr = ct->m_data.ptrBuff[intCount]; tempDiff = ct->m_data.ptrBuff[intCount+1] - ct->m_data.ptrBuff[intCount]; if(tempDiff <= 1) return; startPtr += wordLen; /* printf("fromPtr:\n\n%s\n\n", fromPtr); */ /* printf("rawNameEnd:\n\n%s\n\n", rawNameEnd); */ if(ct->m_utf8BuffSize < ct->m_buffSize) { ct->m_utf8BuffSize = 2 * ct->m_buffSize; /* printf("m_utf8BuffSize:%d\n", ct->m_utf8BuffSize); */ char *temp = (char *)REALLOC(ct->m_utf8Buff, ct->m_utf8BuffSize); if (temp == NULL) return SPP_ERROR_NO_MEMORY; ct->m_utf8Buff = temp; } 
SppUtf8Convert(encoding, &fromPtr, rawNameEnd, (ICHAR **)&toPtr, (ICHAR *)ct->m_utf8Buff + ct->m_utf8BuffSize); /* printf("startPtr:\n%s\n", startPtr); */ /* printf("toPtr:\n%s\n", toPtr); */ wordLen = toPtr - startPtr; /* printf("wordLen:%d\n", wordLen); */ totLen += wordLen; endPtr = toPtr - 1; addUtf8Ptr(startPtr, &ct->m_data); addUtf8Ptr(endPtr, &ct->m_data); intCount += 2; } return; } int getNextElementAsInt(SPPParser* ct, int* parseError) { int intTemp; char* temp; dataCounter = 0; temp = ct->m_data.ptrBuff[dataCounter + 1][1]; ct->m_data.ptrBuff[dataCounter + 1][1] = XML_T('\0'); /* printf("ct->m_data.ptrBuff[0]:%s\n", ct->m_data.ptrBuff[0]); */ sscanf(ct->m_data.ptrBuff[dataCounter], "%d", &intTemp); ct->m_data.ptrBuff[dataCounter + 1][1] = temp; *parseError = SPP_ERROR_NONE; return intTemp; } /* * Prier to call this method state should be START_TAG */ int getNextAttributeAsInt(SPPParser* ct, int* parseError) { int intTemp; char* temp; TokDataStruct* data; if (2 >= ct->m_data.numOfPtrsUtf8) { *parseError = SPP_ERROR_UNEXPECTED_TOKEN_CONTENT; return 0; } dataCounter += 2; temp = ct->m_data.ptrBuff[dataCounter + 1][1]; ct->m_data.ptrBuff[dataCounter+1][1] = '\0'; /* printf("ct->m_data.ptrBuff[dataCounter]:%s\n", * ct->m_data.ptrBuff[dataCounter]); */ sscanf(ct->m_data.ptrBuff[dataCounter], "%d", &intTemp); ct->m_data.ptrBuff[dataCounter+1][1] = temp; *parseError = SPP_ERROR_NONE; return intTemp; } 1.1 ws-axis/c/src/xml/txpp/lib/spp.h Index: spp.h =================================================================== /* * Copyright 2003-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #ifndef XPP_H_OF_AXIS_INCLUDED #define XPP_H_OF_AXIS_INCLUDED #ifdef XML_UNICODE /* Information is UTF-16 encoded. */ #ifdef XML_UNICODE_WCHAR_T typedef wchar_t XML_Char; typedef wchar_t XML_LChar; #else typedef unsigned short XML_Char; typedef char XML_LChar; #endif /* XML_UNICODE_WCHAR_T */ #else /* Information is UTF-8 encoded. */ typedef char XML_Char; typedef char XML_LChar; #endif /* XML_UNICODE */ typedef unsigned char XML_Bool; #define XML_TRUE ((XML_Bool) 1) #define XML_FALSE ((XML_Bool) 0) #ifdef XML_UNICODE #ifdef XML_UNICODE_WCHAR_T #define XML_T(x) (const wchar_t)x #define XML_L(x) L ## x #else #define XML_T(x) (const unsigned short)x #define XML_L(x) x #endif #else #define XML_T(x) x #define XML_L(x) x #endif #ifdef XML_UNICODE #define XML_ENCODE_MAX XML_UTF16_ENCODE_MAX #define XmlConvert XmlUtf16Convert #define XmlGetInternalEncoding XmlGetUtf16InternalEncoding #define XmlEncode XmlUtf16Encode #define MUST_CONVERT(enc, s) (!(enc)->isUtf16 || (((unsigned long)s) & 1)) typedef unsigned short ICHAR; #else #define XML_ENCODE_MAX XML_UTF8_ENCODE_MAX #define XmlConvert XmlUtf8Convert #define XmlGetInternalEncoding XmlGetUtf8InternalEncoding #define XmlEncode XmlUtf8Encode #define MUST_CONVERT(enc, s) (!(enc)->isUtf8) typedef char ICHAR; #endif /** Tokenizing states*/ enum { S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, S_8, S_9, S_10, S_11, S_12, S_13, S_14 }; /** Special characters*/ enum { START_TAG, END_TAG, EMPTY_ELEMENT_TAG, PCDATA }; /** Error codes*/ enum SPP_Error { SPP_ERROR_NONE, SPP_ERROR_NO_MEMORY, SPP_ERROR_SYNTAX, SPP_ERROR_NO_ELEMENTS, 
SPP_ERROR_INVALID_TOKEN, SPP_ERROR_UNCLOSED_TOKEN, SPP_ERROR_TAG_MISMATCH, SPP_ERROR_DUPLICATE_ATTRIBUTE, SPP_ERROR_UNKNOWN_ENCODING, SPP_ERROR_INCORRECT_ENCODING, SPP_ERROR_NOT_STANDALONE, SPP_ERROR_UNEXPECTED_STATE, SPP_ERROR_TOKENIZER_FAILED, SPP_ERROR_UNEXPECTED_TOKEN_CONTENT, SPP_ERROR_PARSE_FAILED, SPP_ERROR_READ_BLOCK, SPP_ERROR_PARSER_INIT_FAILED, SPP_ERROR_UNKNOWN }; /** Memeory handling structure*/ typedef struct mm { void *( *mallocFcn)(size_t size); void *( *memMoveFcn)(void *ptrto, void *ptrfrom, size_t size); void *( *reallocFcn)(void *ptr, size_t size); void ( *freeFcn)(void *ptr); } SppMemoryHandlingSuite; /** Struct to hold ptrs to tokenized data*/ typedef struct data { int type; int ptrBuffSize; int numOfPtrs; int numOfPtrsUtf8; char **ptrBuff; char **utf8PtrBuff; } TokDataStruct; /* char *tn[] = {"START_TAG", "END_TAG", "EMPTY_ELEMENT_TAG", "PCDATA"}; */ struct SPPContext; /** SPP parser structure*/ typedef struct SPPContext SPPParser; /** Parser create with encoding*/ SPPParser* parserCreate(const XML_Char *encodingName); /** Parser create with encoding, namspace*/ SPPParser* parserCreate_ns(const XML_Char *encodingName, XML_Char nsSep); /** Parser create with encoding, external memory handling suite and namespace*/ SPPParser* parserCreate_mh(const XML_Char *encodingName, const SppMemoryHandlingSuite *memsuite, XML_Char nsSep); enum SPP_Error parseProlog(SPPParser* ct); void* parserFree(SPPParser* ct); /** Parse the next element tag*/ TokDataStruct* next(SPPParser* ct); /** Get next element as integer*/ int getNextElementAsInt(SPPParser* ct, int* parseError); /** Get next attribute as integer*/ int getNextAttributeAsInt(SPPParser* ct, int* parseError); #endif 1.1 ws-axis/c/src/xml/txpp/lib/spp_converter.c Index: spp_converter.c =================================================================== /* * Copyright 2003-2004 The Apache Software Foundation. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifdef COMPILED_FROM_DSP #include "winconfig.h" #elif defined(MACOS_CLASSIC) #include "macconfig.h" #else #ifdef HAVE_XPP_CONFIG_H #include "../xpp_config.h" #endif #endif /* ndef COMPILED_FROM_DSP */ #include "internal.h" #include "spp_converter.h" #include "nametab.h" #ifdef XML_DTD #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) #else #define IGNORE_SECTION_TOK_VTABLE /* as nothing */ #endif /** * tokenizeProlog and tokenizeContent are defined in spp_tokenizer.c * tokenizeProlog tokenize the xml declaration * tokenizeContent tokenize the xml content */ #define VTABLE1 \ { PREFIX(tokenizeProlog), PREFIX(tokenizeContent)}, \ PREFIX(nameMatchesAscii) #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) #define UCS2_GET_NAMING(pages, hi, lo) \ (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) /* A 2 byte UTF-8 representation splits the characters 11 bits between * the bottom 5 and 6 bits of the bytes. We need 8 bits to index into * pages, 3 bits to add to that index and 5 bits to generate the mask. */ #define UTF8_GET_NAMING2(pages, byte) \ (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ + ((((byte)[0]) & 3) << 1) \ + ((((byte)[1]) >> 5) & 1)] \ & (1 << (((byte)[1]) & 0x1F))) /* A 3 byte UTF-8 representation splits the characters 16 bits between * the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index * into pages, 3 bits to add to that index and 5 bits to generate the * mask. 
*/ #define UTF8_GET_NAMING3(pages, byte) \ (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ + ((((byte)[1]) >> 2) & 0xF)] \ << 3) \ + ((((byte)[1]) & 3) << 1) \ + ((((byte)[2]) >> 5) & 1)] \ & (1 << (((byte)[2]) & 0x1F))) #define UTF8_GET_NAMING(pages, p, n) \ ((n) == 2 \ ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ : ((n) == 3 \ ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ : 0)) /* Detection of invalid UTF-8 sequences is based on Table 3.1B * of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ * with the additional restriction of not allowing the Unicode * code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). * Implementation details: * (A & 0x80) == 0 means A < 0x80 * and * (A & 0xC0) == 0xC0 means A > 0xBF */ #define UTF8_INVALID2(p) \ ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) #define UTF8_INVALID3(p) \ (((p)[2] & 0x80) == 0 \ || \ ((*p) == 0xEF && (p)[1] == 0xBF \ ? \ (p)[2] > 0xBD \ : \ ((p)[2] & 0xC0) == 0xC0) \ || \ ((*p) == 0xE0 \ ? \ (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ : \ ((p)[1] & 0x80) == 0 \ || \ ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) #define UTF8_INVALID4(p) \ (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ || \ ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ || \ ((*p) == 0xF0 \ ? \ (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ : \ ((p)[1] & 0x80) == 0 \ || \ ((*p) == 0xF4 ? 
(p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) static int PTRFASTCALL isNever(const ENCODING *enc, const char *p) { return 0; } static int PTRFASTCALL utf8_isName2(const ENCODING *enc, const char *p) { return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); } static int PTRFASTCALL utf8_isName3(const ENCODING *enc, const char *p) { return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); } #define utf8_isName4 isNever static int PTRFASTCALL utf8_isNmstrt2(const ENCODING *enc, const char *p) { return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); } static int PTRFASTCALL utf8_isNmstrt3(const ENCODING *enc, const char *p) { return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); } #define utf8_isNmstrt4 isNever static int PTRFASTCALL utf8_isInvalid2(const ENCODING *enc, const char *p) { return UTF8_INVALID2((const unsigned char *)p); } static int PTRFASTCALL utf8_isInvalid3(const ENCODING *enc, const char *p) { return UTF8_INVALID3((const unsigned char *)p); } static int PTRFASTCALL utf8_isInvalid4(const ENCODING *enc, const char *p) { return UTF8_INVALID4((const unsigned char *)p); } struct normal_encoding { ENCODING enc; unsigned char type[256]; #ifdef XML_MIN_SIZE int (PTRFASTCALL *byteType)(const ENCODING *, const char *); int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); int (PTRCALL *charMatches)(const ENCODING *, const char *, int); #endif /* XML_MIN_SIZE */ int (PTRFASTCALL *isName2)(const ENCODING *, const char *); int (PTRFASTCALL *isName3)(const ENCODING *, const char *); int (PTRFASTCALL *isName4)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); int (PTRFASTCALL 
*isInvalid3)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); }; #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) #ifdef XML_MIN_SIZE #define STANDARD_VTABLE(E) \ E ## byteType, \ E ## isNameMin, \ E ## isNmstrtMin, \ E ## byteToAscii, \ E ## charMatches, #else #define STANDARD_VTABLE(E) /* as nothing */ #endif #define NORMAL_VTABLE(E) \ E ## isName2, \ E ## isName3, \ E ## isName4, \ E ## isNmstrt2, \ E ## isNmstrt3, \ E ## isNmstrt4, \ E ## isInvalid2, \ E ## isInvalid3, \ E ## isInvalid4 static int FASTCALL checkCharRefNumber(int); #include "spp_tokenizer.h" #include "ascii.h" #ifdef XML_MIN_SIZE #define sb_isNameMin isNever #define sb_isNmstrtMin isNever #endif #ifdef XML_MIN_SIZE #define MINBPC(enc) ((enc)->minBytesPerChar) #else /* minimum bytes per character */ #define MINBPC(enc) 1 #endif #define SB_BYTE_TYPE(enc, p) \ (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) #ifdef XML_MIN_SIZE static int PTRFASTCALL sb_byteType(const ENCODING *enc, const char *p) { return SB_BYTE_TYPE(enc, p); } #define BYTE_TYPE(enc, p) \ (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) #else #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) #endif #ifdef XML_MIN_SIZE #define BYTE_TO_ASCII(enc, p) \ (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) static int PTRFASTCALL sb_byteToAscii(const ENCODING *enc, const char *p) { return *p; } #else #define BYTE_TO_ASCII(enc, p) (*(p)) #endif #define IS_NAME_CHAR(enc, p, n) \ (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) #define IS_NMSTRT_CHAR(enc, p, n) \ (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) #define IS_INVALID_CHAR(enc, p, n) \ (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) #ifdef XML_MIN_SIZE #define IS_NAME_CHAR_MINBPC(enc, p) \ (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) #else #define IS_NAME_CHAR_MINBPC(enc, p) (0) #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 
#endif #ifdef XML_MIN_SIZE #define CHAR_MATCHES(enc, p, c) \ (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) static int PTRCALL sb_charMatches(const ENCODING *enc, const char *p, int c) { return *p == c; } #else /* c is an ASCII character */ #define CHAR_MATCHES(enc, p, c) (*(p) == c) #endif #define PREFIX(ident) normal_ ## ident #include "spp_tokenizer.c" #undef MINBPC #undef BYTE_TYPE #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR #undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR #undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ UTF8_cval1 = 0x00, UTF8_cval2 = 0xc0, UTF8_cval3 = 0xe0, UTF8_cval4 = 0xf0 }; /** Converts from utf8 to utf8. * @param encoding type * @param * @param * @param * @param */ static void PTRCALL utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim) { /* printf("came\n"); */ char *to; const char *from; if (fromLim - *fromP > toLim - *toP) { /* Avoid copying partial characters. 
*/ for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) break; } for (to = *toP, from = *fromP; from != fromLim; from++, to++) *to = *from; *fromP = from; *toP = to; /* printf("toP:%s\n", *toP); */ } static void PTRCALL utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { unsigned short *to = *toP; const char *from = *fromP; while (from != fromLim && to != toLim) { switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { case BT_LEAD2: *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); from += 2; break; case BT_LEAD3: *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); from += 3; break; case BT_LEAD4: { unsigned long n; if (to + 1 == toLim) goto after; n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); n -= 0x10000; to[0] = (unsigned short)((n >> 10) | 0xD800); to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); to += 2; from += 4; } break; default: *to++ = *from++; break; } } after: *fromP = from; *toP = to; } static const struct normal_encoding utf8_encoding = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) }; static const struct normal_encoding internal_utf8_encoding = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #define BT_COLON BT_NMSTRT #include "iasciitab.h" #undef BT_COLON #include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) }; static void PTRCALL latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim) { for (;;) { unsigned char c; if (*fromP == fromLim) break; c = (unsigned char)**fromP; if (c & 0x80) { if (toLim - *toP < 2) break; *(*toP)++ = (char)((c >> 6) | UTF8_cval2); *(*toP)++ = 
(char)((c & 0x3f) | 0x80); (*fromP)++; } else { if (*toP == toLim) break; *(*toP)++ = *(*fromP)++; } } } static void PTRCALL latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { while (*fromP != fromLim && *toP != toLim) *(*toP)++ = (unsigned char)*(*fromP)++; } static const struct normal_encoding latin1_encoding = { { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "latin1tab.h" }, STANDARD_VTABLE(sb_) }; static void PTRCALL ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim) { while (*fromP != fromLim && *toP != toLim) *(*toP)++ = *(*fromP)++; } static const struct normal_encoding ascii_encoding = { { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON /* BT_NONXML == 0 */ }, STANDARD_VTABLE(sb_) }; static int PTRFASTCALL unicode_byte_type(char hi, char lo) { switch ((unsigned char)hi) { case 0xD8: case 0xD9: case 0xDA: case 0xDB: return BT_LEAD4; case 0xDC: case 0xDD: case 0xDE: case 0xDF: return BT_TRAIL; case 0xFF: switch ((unsigned char)lo) { case 0xFF: case 0xFE: return BT_NONXML; } break; } return BT_NONASCII; } #define DEFINE_UTF16_TO_UTF8(E) \ static void PTRCALL \ E ## toUtf8(const ENCODING *enc, \ const char **fromP, const char *fromLim, \ char **toP, const char *toLim) \ { \ const char *from; \ for (from = *fromP; from != fromLim; from += 2) { \ int plane; \ unsigned char lo2; \ unsigned char lo = GET_LO(from); \ unsigned char hi = GET_HI(from); \ switch (hi) { \ case 0: \ if (lo < 0x80) { \ if (*toP == toLim) { \ *fromP = from; \ return; \ } \ *(*toP)++ = lo; \ break; \ } \ /* fall through */ \ case 0x1: case 0x2: case 0x3: \ case 0x4: case 0x5: case 0x6: case 0x7: \ if (toLim - *toP < 2) { \ *fromP = from; \ return; \ } \ *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ 
*(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ default: \ if (toLim - *toP < 3)\ { \ *fromP = from; \ return; \ } \ /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ if (toLim - *toP < 4)\ { \ *fromP = from; \ return; \ } \ plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ from += 2; \ lo2 = GET_LO(from); \ *(*toP)++ = (((lo & 0x3) << 4) \ | ((GET_HI(from) & 0x3) << 2) \ | (lo2 >> 6) \ | 0x80); \ *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ break; \ } \ } \ *fromP = from; \ } #define DEFINE_UTF16_TO_UTF16(E) \ static void PTRCALL \ E ## toUtf16(const ENCODING *enc, \ const char **fromP, const char *fromLim, \ unsigned short **toP, const unsigned short *toLim) \ { \ /* Avoid copying first half only of surrogate */ \ if (fromLim - *fromP > ((toLim - *toP) << 1) \ && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ fromLim -= 2; \ for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ } #define SET2(ptr, ch) \ (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) #define GET_LO(ptr) ((unsigned char)(ptr)[0]) #define GET_HI(ptr) ((unsigned char)(ptr)[1]) DEFINE_UTF16_TO_UTF8(little2_) DEFINE_UTF16_TO_UTF16(little2_) #undef SET2 #undef GET_LO #undef GET_HI #define SET2(ptr, ch) \ (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) #define GET_LO(ptr) ((unsigned char)(ptr)[1]) #define GET_HI(ptr) ((unsigned char)(ptr)[0]) DEFINE_UTF16_TO_UTF8(big2_) DEFINE_UTF16_TO_UTF16(big2_) #undef SET2 #undef GET_LO #undef GET_HI #define LITTLE2_BYTE_TYPE(enc, p) \ ((p)[1] == 0 \ ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ : unicode_byte_type((p)[1], (p)[0])) #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? 
(p)[0] : -1) #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) #ifdef XML_MIN_SIZE static int PTRFASTCALL little2_byteType(const ENCODING *enc, const char *p) { return LITTLE2_BYTE_TYPE(enc, p); } static int PTRFASTCALL little2_byteToAscii(const ENCODING *enc, const char *p) { return LITTLE2_BYTE_TO_ASCII(enc, p); } static int PTRCALL little2_charMatches(const ENCODING *enc, const char *p, int c) { return LITTLE2_CHAR_MATCHES(enc, p, c); } static int PTRFASTCALL little2_isNameMin(const ENCODING *enc, const char *p) { return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); } static int PTRFASTCALL little2_isNmstrtMin(const ENCODING *enc, const char *p) { return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); } #undef VTABLE #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 #else /* not XML_MIN_SIZE */ #undef PREFIX #define PREFIX(ident) little2_ ## ident #define MINBPC(enc) 2 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. 
*/ #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) #define IS_NAME_CHAR(enc, p, n) 0 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) #define IS_NMSTRT_CHAR(enc, p, n) (0) #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) #include "spp_tokenizer.c" #undef MINBPC #undef BYTE_TYPE #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR #undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR #undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR #endif /* not XML_MIN_SIZE */ #ifdef XML_NS static const struct normal_encoding little2_encoding_ns = { { VTABLE, 2, 0, #if BYTEORDER == 1234 1 #else 0 #endif }, { #include "asciitab.h" #include "latin1tab.h" }, STANDARD_VTABLE(little2_) }; #endif static const struct normal_encoding little2_encoding = { { VTABLE, 2, 0, #if BYTEORDER == 1234 1 #else 0 #endif }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "latin1tab.h" }, STANDARD_VTABLE(little2_) }; #if BYTEORDER != 4321 #ifdef XML_NS static const struct normal_encoding internal_little2_encoding_ns = { { VTABLE, 2, 0, 1 }, { #include "iasciitab.h" #include "latin1tab.h" }, STANDARD_VTABLE(little2_) }; #endif static const struct normal_encoding internal_little2_encoding = { { VTABLE, 2, 0, 1 }, { #define BT_COLON BT_NMSTRT #include "iasciitab.h" #undef BT_COLON #include "latin1tab.h" }, STANDARD_VTABLE(little2_) }; #endif #define BIG2_BYTE_TYPE(enc, p) \ ((p)[0] == 0 \ ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ : unicode_byte_type((p)[0], (p)[1])) #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? 
(p)[1] : -1) #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) #ifdef XML_MIN_SIZE static int PTRFASTCALL big2_byteType(const ENCODING *enc, const char *p) { return BIG2_BYTE_TYPE(enc, p); } static int PTRFASTCALL big2_byteToAscii(const ENCODING *enc, const char *p) { return BIG2_BYTE_TO_ASCII(enc, p); } static int PTRCALL big2_charMatches(const ENCODING *enc, const char *p, int c) { return BIG2_CHAR_MATCHES(enc, p, c); } static int PTRFASTCALL big2_isNameMin(const ENCODING *enc, const char *p) { return BIG2_IS_NAME_CHAR_MINBPC(enc, p); } static int PTRFASTCALL big2_isNmstrtMin(const ENCODING *enc, const char *p) { return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); } #undef VTABLE #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 #else /* not XML_MIN_SIZE */ #undef PREFIX #define PREFIX(ident) big2_ ## ident #define MINBPC(enc) 2 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. 
*/ #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) #define IS_NAME_CHAR(enc, p, n) 0 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) #define IS_NMSTRT_CHAR(enc, p, n) (0) #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) #include "spp_tokenizer.c" #undef MINBPC #undef BYTE_TYPE #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR #undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR #undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR #endif /* not XML_MIN_SIZE */ #ifdef XML_NS static const struct normal_encoding big2_encoding_ns = { { VTABLE, 2, 0, #if BYTEORDER == 4321 1 #else 0 #endif }, { #include "asciitab.h" #include "latin1tab.h" }, STANDARD_VTABLE(big2_) }; #endif static const struct normal_encoding big2_encoding = { { VTABLE, 2, 0, #if BYTEORDER == 4321 1 #else 0 #endif }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "latin1tab.h" }, STANDARD_VTABLE(big2_) }; #if BYTEORDER != 1234 #ifdef XML_NS static const struct normal_encoding internal_big2_encoding_ns = { { VTABLE, 2, 0, 1 }, { #include "iasciitab.h" #include "latin1tab.h" }, STANDARD_VTABLE(big2_) }; #endif static const struct normal_encoding internal_big2_encoding = { { VTABLE, 2, 0, 1 }, { #define BT_COLON BT_NMSTRT #include "iasciitab.h" #undef BT_COLON #include "latin1tab.h" }, STANDARD_VTABLE(big2_) }; #endif #undef PREFIX static int FASTCALL streqci(const char *s1, const char *s2) { for (;;) { char c1 = *s1++; char c2 = *s2++; if (ASCII_a <= c1 && c1 <= ASCII_z) c1 += ASCII_A - ASCII_a; if (ASCII_a <= c2 && c2 <= ASCII_z) c2 += ASCII_A - ASCII_a; if (c1 != c2) return 0; if (!c1) break; } return 1; } /*static void PTRCALL initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end, POSITION *pos) { normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); }*/ static int toAscii(const 
ENCODING *enc, const char *ptr, const char *end)
{
  /* Ask the encoding's UTF-8 converter for at most one output byte.  If
   * nothing was written (p still equals buf) report -1; otherwise return
   * the single byte that was produced. */
  char buf[1];
  char *p = buf;
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  if (p == buf)
    return -1;
  else
    return buf[0];
}

/* Return 1 when c is one of the code points 0x20, 0xD, 0xA or 0x9
 * (space, CR, LF, tab) and 0 for anything else. */
static int FASTCALL
isSpace(int c)
{
  switch (c) {
  case 0x20:
  case 0xD:
  case 0xA:
  case 0x9:
    return 1;
  }
  return 0;
}

/* Return 1 if there's just optional white space or there's an S
 * followed by name=val.
 *
 * Outputs on success with an attribute present:
 *   *namePtr / *nameEndPtr  delimit the attribute name,
 *   *valPtr                 points just after the opening quote,
 *   *nextTokPtr             points just after the closing quote.
 * When ptr reaches end before any name is seen, *namePtr is set to NULL
 * and 1 is returned (the other outputs are left untouched).
 * On failure 0 is returned and *nextTokPtr points at the offending
 * position.
 */
static int
parsePseudoAttribute(const ENCODING *enc,
                     const char *ptr,
                     const char *end,
                     const char **namePtr,
                     const char **nameEndPtr,
                     const char **valPtr,
                     const char **nextTokPtr)
{
  int c;
  char open;
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* An attribute must be separated from what precedes it by white space. */
  if (!isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' (optionally preceded by white space). */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      *nameEndPtr = ptr;
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* An empty name ("=" immediately after the white space) is rejected. */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space that follows it. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Remember which quote opened the value; the same one must close it. */
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* The value may contain only ASCII letters, digits, '.', '-' and '_'.
   * A failed conversion (c == -1) also fails this test. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (!(ASCII_a <= c && c <= ASCII_z)
        && !(ASCII_A <= c && c <= ASCII_Z)
        && !(ASCII_0 <= c && c <= ASCII_9)
        && c != ASCII_PERIOD
        && c != ASCII_MINUS
        && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}

/* NUL-terminated ASCII spellings of the pseudo-attribute names and
 * values that doParseXmlDecl compares against. */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s, '\0'
};

static const char KW_no[] = {
  ASCII_n, ASCII_o, '\0'
};

/* Parse the pseudo-attributes of an XML or text declaration in the order
 * version, encoding, standalone.
 *
 * encodingFinder       called with (enc, value start, value end) for the
 *                      encoding attribute; its result goes to *encoding.
 * isGeneralTextEntity  non-zero relaxes the "version is required" rule,
 *                      makes the encoding attribute mandatory and
 *                      forbids standalone.
 * versionPtr, versionEndPtr, encodingName, encoding, standalone
 *                      optional outputs; each is written only when the
 *                      pointer is non-NULL and the attribute was seen.
 *
 * Returns 1 on success.  On failure returns 0 and stores the offending
 * position in *badPtr.
 */
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                 const char *,
                                                 const char *),
               int isGeneralTextEntity,
               const ENCODING *enc,
               const char *ptr,
               const char *end,
               const char **badPtr,
               const char **versionPtr,
               const char **versionEndPtr,
               const char **encodingName,
               const ENCODING **encoding,
               int *standalone)
{
  /* enc = &utf8_encoding_ns;//this is temporarily hard coded by damitha */
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* printf("minBytesPerChar:%d\n", enc->minBytesPerChar); */
  /* Narrow the range by 5 characters at the front and 2 at the back;
   * presumably this drops the declaration's opening and closing
   * delimiters -- confirm against the caller. */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a general text entity may omit the version attribute. */
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name has to start with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so back up one character to
     * obtain the end of the value. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  /* Anything still left must be "standalone", and never in a text decl. */
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the last attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}

/* Validate a numeric character reference value.
 * Returns -1 when the high byte is 0xD8..0xDF, when the value is below
 * 0x100 and latin1_encoding classifies it as BT_NONXML, or when it is
 * 0xFFFE / 0xFFFF.  Otherwise returns result unchanged. */
static int FASTCALL
checkCharRefNumber(int result)
{
  switch (result >> 8) {
  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
    return -1;
  case 0:
    if (latin1_encoding.type[result] == BT_NONXML)
      return -1;
    break;
  case 0xFF:
    if (result == 0xFFFE || result == 0xFFFF)
      return -1;
    break;
  }
  return result;
}

/* Encode code point c as UTF-8 into buf.
 * Returns the number of bytes written (1 to 4), or 0 when c is negative
 * or not below 0x110000, in which case buf is not written. */
int FASTCALL
XmlUtf8Encode(int c, char *buf)
{
  enum {
    /* minN is minimum legal resulting value for N byte sequence */
    min2 = 0x80,
    min3 = 0x800,
    min4 = 0x10000
  };

  if (c < 0)
    return 0;
  if (c < min2) {
    buf[0] = (char)(c | UTF8_cval1);
    return 1;
  }
  if (c < min3) {
    buf[0] = (char)((c >> 6) | UTF8_cval2);
    buf[1] = (char)((c & 0x3f) | 0x80);
    return 2;
  }
  if (c < min4) {
    buf[0] = (char)((c >> 12) | UTF8_cval3);
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
    buf[2] = (char)((c & 0x3f) | 0x80);
    return 3;
  }
  if (c < 0x110000) {
    buf[0] = (char)((c >> 18) | UTF8_cval4);
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
    buf[3] = (char)((c & 0x3f) | 0x80);
    return 4;
  }
  return 0;
}

/* Encode code point charNum as UTF-16 code units into buf.
 * Returns 1 for values below 0x10000, 2 (a 0xD800-based / 0xDC00-based
 * pair) for values below 0x110000, and 0 for anything else, in which
 * case buf is not written. */
int FASTCALL
XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }
  if (charNum < 0x110000) {
    charNum -= 0x10000;
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
    return 2;
  }
  return 0;
}

/* Encoding object for a caller-described ("unknown") encoding. */
struct unknown_encoding {
  struct normal_encoding normal;  /* must stay first: the object is also
                                     accessed as a normal_encoding */
  int (*convert)(void *userData, const char *p);  /* decodes a multi-byte
                                                     sequence starting at p */
  void *userData;                 /* passed through to convert */
  unsigned short utf16[256];      /* per-byte UTF-16 unit; 0 means "call
                                     convert" */
  char utf8[256][4];              /* per-byte: [0] is the byte count,
                                     [1..] the UTF-8 bytes; count 0 means
                                     "call convert" */
};

#define
AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) int XmlSizeOfUnknownEncoding(void) { return sizeof(struct unknown_encoding); } static int PTRFASTCALL unknown_isName(const ENCODING *enc, const char *p) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); int c = uenc->convert(uenc->userData, p); if (c & ~0xFFFF) return 0; return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); } static int PTRFASTCALL unknown_isNmstrt(const ENCODING *enc, const char *p) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); int c = uenc->convert(uenc->userData, p); if (c & ~0xFFFF) return 0; return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); } static int PTRFASTCALL unknown_isInvalid(const ENCODING *enc, const char *p) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); int c = uenc->convert(uenc->userData, p); return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; } static void PTRCALL unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); char buf[XML_UTF8_ENCODE_MAX]; for (;;) { const char *utf8; int n; if (*fromP == fromLim) break; utf8 = uenc->utf8[(unsigned char)**fromP]; n = *utf8++; if (n == 0) { int c = uenc->convert(uenc->userData, *fromP); n = XmlUtf8Encode(c, buf); if (n > toLim - *toP) break; utf8 = buf; *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] - (BT_LEAD2 - 2)); } else { if (n > toLim - *toP) break; (*fromP)++; } do { *(*toP)++ = *utf8++; } while (--n != 0); } } static void PTRCALL unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); while (*fromP != fromLim && *toP != toLim) { unsigned short c = uenc->utf16[(unsigned char)**fromP]; if (c == 0) { c = (unsigned short) uenc->convert(uenc->userData, *fromP); *fromP += 
(AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] - (BT_LEAD2 - 2)); } else (*fromP)++; *(*toP)++ = c; } } ENCODING * XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, void *userData) { int i; struct unknown_encoding *e = (struct unknown_encoding *)mem; for (i = 0; i < (int)sizeof(struct normal_encoding); i++) ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; for (i = 0; i < 128; i++) if (latin1_encoding.type[i] != BT_OTHER && latin1_encoding.type[i] != BT_NONXML && table[i] != i) return 0; for (i = 0; i < 256; i++) { int c = table[i]; if (c == -1) { e->normal.type[i] = BT_MALFORM; /* This shouldn't really get used. */ e->utf16[i] = 0xFFFF; e->utf8[i][0] = 1; e->utf8[i][1] = 0; } else if (c < 0) { if (c < -4) return 0; e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); e->utf8[i][0] = 0; e->utf16[i] = 0; } else if (c < 0x80) { if (latin1_encoding.type[c] != BT_OTHER && latin1_encoding.type[c] != BT_NONXML && c != i) return 0; e->normal.type[i] = latin1_encoding.type[c]; e->utf8[i][0] = 1; e->utf8[i][1] = (char)c; e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); } else if (checkCharRefNumber(c) < 0) { e->normal.type[i] = BT_NONXML; /* This shouldn't really get used. 
*/ e->utf16[i] = 0xFFFF; e->utf8[i][0] = 1; e->utf8[i][1] = 0; } else { if (c > 0xFFFF) return 0; if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) e->normal.type[i] = BT_NMSTRT; else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) e->normal.type[i] = BT_NAME; else e->normal.type[i] = BT_OTHER; e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); e->utf16[i] = (unsigned short)c; } } e->userData = userData; e->convert = convert; if (convert) { e->normal.isName2 = unknown_isName; e->normal.isName3 = unknown_isName; e->normal.isName4 = unknown_isName; e->normal.isNmstrt2 = unknown_isNmstrt; e->normal.isNmstrt3 = unknown_isNmstrt; e->normal.isNmstrt4 = unknown_isNmstrt; e->normal.isInvalid2 = unknown_isInvalid; e->normal.isInvalid3 = unknown_isInvalid; e->normal.isInvalid4 = unknown_isInvalid; } e->normal.enc.utf8Convert = unknown_toUtf8; e->normal.enc.utf16Convert = unknown_toUtf16; return &(e->normal.enc); } /* If this enumeration is changed, getEncodingIndex and encodings * must also be changed. 
*/ enum { UNKNOWN_ENC = -1, ISO_8859_1_ENC = 0, US_ASCII_ENC, UTF_8_ENC, UTF_16_ENC, UTF_16BE_ENC, UTF_16LE_ENC, /* must match encodingNames up to here */ NO_ENC }; static const char KW_ISO_8859_1[] = { ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0' }; static const char KW_US_ASCII[] = { ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0' }; static const char KW_UTF_8[] = { ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' }; static const char KW_UTF_16[] = { ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' }; static const char KW_UTF_16BE[] = { ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0' }; static const char KW_UTF_16LE[] = { ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0' }; static int FASTCALL getEncodingIndex(const char *name) { static const char *encodingNames[] = { KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE, }; int i; if (name == NULL) return NO_ENC; for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) if (streqci(name, encodingNames[i])) return i; return UNKNOWN_ENC; } /* For binary compatibility, we store the index of the encoding * specified at initialization in the isUtf16 member. */ #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) /* This is what detects the encoding. encodingTable maps from * encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of * the external (protocol) specified encoding; state is * XML_CONTENT_STATE if we're parsing an external text entity, and * XML_PROLOG_STATE otherwise. 
*/ static int initScan(int* parserState, TokDataStruct* data, const ENCODING **encodingTable, const INIT_ENCODING *enc, int state, int *numOfChars, char* end, const char **nextTokPtr) { char *ptr = *nextTokPtr; const ENCODING **encPtr; /* printf("in xmltok.c\n"); */ /* * start temp code */ encPtr = enc->encPtr; *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; return XmlTok(parserState, data,*encPtr, state, numOfChars, end, nextTokPtr); /* end temp code */ if (ptr == end) return XML_TOK_NONE; encPtr = enc->encPtr; if (ptr + 1 == end) { /* only a single byte available for auto-detection */ #ifndef XML_DTD /* FIXME */ /* a well-formed document entity must have more than one byte */ if (state != XML_CONTENT_STATE) return XML_TOK_PARTIAL; #endif /* so we're parsing an external text entity... */ /* if UTF-16 was externally specified, then we need at least 2 bytes */ switch (INIT_ENC_INDEX(enc)) { case UTF_16_ENC: case UTF_16LE_ENC: case UTF_16BE_ENC: return XML_TOK_PARTIAL; } switch ((unsigned char)*ptr) { case 0xFE: case 0xFF: case 0xEF: /* possibly first byte of UTF-8 BOM */ if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) break; /* fall through */ case 0x00: case 0x3C: return XML_TOK_PARTIAL; } } else { switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { case 0xFEFF: if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) break; *nextTokPtr = ptr + 2; *encPtr = encodingTable[UTF_16BE_ENC]; return XML_TOK_BOM; /* 00 3C is handled in the default case */ case 0x3C00: if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC || INIT_ENC_INDEX(enc) == UTF_16_ENC) && state == XML_CONTENT_STATE) break; *encPtr = encodingTable[UTF_16LE_ENC]; return XmlTok(parserState, data,*encPtr, state, numOfChars, end, nextTokPtr); case 0xFFFE: if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) break; *nextTokPtr = ptr + 2; *encPtr = encodingTable[UTF_16LE_ENC]; return XML_TOK_BOM; case 0xEFBB: /* Maybe a UTF-8 BOM (EF BB BF) */ /* If 
there's an explicitly specified (external) encoding * of ISO-8859-1 or some flavour of UTF-16 * and this is an external text entity, * don't look for the BOM, * because it might be a legal data. */ if (state == XML_CONTENT_STATE) { int e = INIT_ENC_INDEX(enc); if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC) break; } if (ptr + 2 == end) return XML_TOK_PARTIAL; if ((unsigned char)ptr[2] == 0xBF) { *nextTokPtr = ptr + 3; *encPtr = encodingTable[UTF_8_ENC]; return XML_TOK_BOM; } break; default: if (ptr[0] == '\0') { /* 0 isn't a legal data character. Furthermore a document * entity can only start with ASCII characters. So the only * way this can fail to be big-endian UTF-16 if it it's an * external parsed general entity that's labelled as * UTF-16LE. */ if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) break; *encPtr = encodingTable[UTF_16BE_ENC]; return XmlTok(parserState, data, *encPtr, state, numOfChars, end, nextTokPtr); } else if (ptr[1] == '\0') { /* We could recover here in the case: * - parsing an external entity * - second byte is 0 * - no externally specified encoding * - no encoding declaration * by assuming UTF-16LE. But we don't, because this would mean when * presented just with a single byte, we couldn't reliably determine * whether we needed further bytes. 
*/ if (state == XML_CONTENT_STATE) break; *encPtr = encodingTable[UTF_16LE_ENC]; return XmlTok(parserState, data,*encPtr, state, numOfChars, end, nextTokPtr); } break; } } *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; return XmlTok(parserState, data,*encPtr, state, numOfChars, end, nextTokPtr); } #define NS(x) x #define ns(x) x #undef NS #undef ns #ifdef XML_NS #define NS(x) x ## NS #define ns(x) x ## _ns #undef NS #undef ns ENCODING * XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert, void *userData) { ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); if (enc) ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; return enc; } #endif /* XML_NS */ const ENCODING * XmlGetUtf8InternalEncoding(void) { return &(internal_utf8_encoding).enc; } const ENCODING * XmlGetUtf16InternalEncoding(void) { #if BYTEORDER == 1234 return &(internal_little2_encoding).enc; #elif BYTEORDER == 4321 return &(internal_big2_encoding).enc; #else const short n = 1; return (*(const char *)&n ? 
&(internal_little2_encoding).enc : &(internal_big2_encoding).enc); #endif } static const ENCODING *encodings[] = { &(latin1_encoding).enc, &(ascii_encoding).enc, &(utf8_encoding).enc, &(big2_encoding).enc, &(big2_encoding).enc, &(little2_encoding).enc, &(utf8_encoding).enc /* NO_ENC */ }; static int PTRCALL initScanProlog(int* parserState, TokDataStruct* data,const ENCODING *enc, int* numOfChars, char *end, const char **nextTokPtr) { /* printf("XML_PROLOG_STATE:%d\n", XML_PROLOG_STATE); */ return initScan(parserState, data, encodings,(const INIT_ENCODING *)enc, XML_PROLOG_STATE, numOfChars, end, nextTokPtr); } static int PTRCALL initScanContent(int* parserState, TokDataStruct* data,const ENCODING *enc, int* numOfChars, char *end, const char **nextTokPtr) { /* printf("XML_CONTENT_STATE:%d\n", XML_PROLOG_STATE); */ return initScan(parserState, data, encodings,(const INIT_ENCODING *)enc, XML_CONTENT_STATE, numOfChars, end, nextTokPtr); } int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name) { int i = getEncodingIndex(name); if (i == UNKNOWN_ENC) return 0; SET_INIT_ENC_INDEX(p, i); p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog; p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent; p->encPtr = encPtr; *encPtr = &(p->initEnc); /*printf("encoding:%d\n", i);*/ /** This is where the encoding table(defined in xmltol.c * eg: * #ifdef XML_NS * static const struct normal_encoding utf8_encoding_ns = * { * { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, * { * #include "asciitab.h" * #include "utf8tab.h" * }, * STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) * }; * #endif) * * is assigned to the encoding pointer. 
* */ *encPtr = encodings[i]; return 1; } static const ENCODING * findEncoding(const ENCODING *enc, const char *ptr, const char *end) { #define ENCODING_MAX 128 char buf[ENCODING_MAX]; char *p = buf; int i; XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1); if (ptr != end) return 0; *p = 0; if (streqci(buf, KW_UTF_16) && enc->minBytesPerChar == 2) return enc; i = getEncodingIndex(buf); if (i == UNKNOWN_ENC) return 0; return encodings[i]; } int XmlParseXmlDecl(int isGeneralTextEntity, const ENCODING *enc, const char *ptr, const char *end, const char **badPtr, const char **versionPtr, const char **versionEndPtr, const char **encodingName, const ENCODING **encoding, int *standalone) { return doParseXmlDecl(findEncoding, isGeneralTextEntity, enc, ptr, end, badPtr, versionPtr, versionEndPtr, encodingName, encoding, standalone); } int SppUtf8Convert(const ENCODING *enc, const char **fromPtr, const char *rawNameEnd, const char **toPtr, const char *bufEnd) { /* printf("fromPtr:%s\n", *fromPtr); */ /* printf("rawNameEnd:%s\n", rawNameEnd); */ /*This method is defined in spp_converter.h*/ XmlUtf8Convert(enc, fromPtr, rawNameEnd, toPtr, bufEnd); } 1.1 ws-axis/c/src/xml/txpp/lib/spp_converter.h Index: spp_converter.h =================================================================== /* * Copyright 2003-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include "spp.h" #ifndef XmlTok_OF_AXIS_INCLUDED #define XmlTok_OF_AXIS_INCLUDED 1 #ifdef __cplusplus extern "C" { #endif /* The following token may be returned by XmlContentTok */ #define XML_TOK_TRAILING_RSQB -5 /* ] or ]] at the end of the scan; might be * start of illegal ]]> sequence */ /* The following tokens may be returned by both XmlPrologTok and * XmlContentTok. */ #define XML_TOK_NONE -4 /* The string to be scanned is empty */ #define XML_TOK_TRAILING_CR -3 /* A CR at the end of the scan; might be part of CRLF sequence */ #define XML_TOK_PARTIAL_CHAR -2 /* only part of a multibyte sequence */ #define XML_TOK_PARTIAL -1 /* only part of a token */ #define XML_TOK_INVALID 0 /* The following tokens are returned by XmlContentTok; some are also * returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok. */ #define XML_TOK_START_TAG_WITH_ATTS 1 #define XML_TOK_START_TAG_NO_ATTS 2 #define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag */ #define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4 #define XML_TOK_END_TAG 5 #define XML_TOK_DATA_CHARS 6 #define XML_TOK_DATA_NEWLINE 7 #define XML_TOK_CDATA_SECT_OPEN 8 #define XML_TOK_ENTITY_REF 9 #define XML_TOK_CHAR_REF 10 /* numeric character reference */ /* The following tokens may be returned by both XmlPrologTok and * XmlContentTok. 
*/ #define XML_TOK_PI 11 /* processing instruction */ #define XML_TOK_XML_DECL 12 /* XML decl or text decl */ #define XML_TOK_COMMENT 13 #define XML_TOK_BOM 14 /* Byte order mark */ /* The following tokens are returned only by XmlPrologTok */ #define XML_TOK_PROLOG_S 15 #define XML_TOK_DECL_OPEN 16 /* */ #define XML_TOK_NAME 18 #define XML_TOK_NMTOKEN 19 #define XML_TOK_POUND_NAME 20 /* #name */ #define XML_TOK_OR 21 /* | */ #define XML_TOK_PERCENT 22 #define XML_TOK_OPEN_PAREN 23 #define XML_TOK_CLOSE_PAREN 24 #define XML_TOK_OPEN_BRACKET 25 #define XML_TOK_CLOSE_BRACKET 26 #define XML_TOK_LITERAL 27 #define XML_TOK_PARAM_ENTITY_REF 28 #define XML_TOK_INSTANCE_START 29 /* The following occur only in element type declarations */ #define XML_TOK_NAME_QUESTION 30 /* name? */ #define XML_TOK_NAME_ASTERISK 31 /* name* */ #define XML_TOK_NAME_PLUS 32 /* name+ */ #define XML_TOK_COND_SECT_OPEN 33 /* */ #define XML_TOK_CLOSE_PAREN_QUESTION 35 /* )? */ #define XML_TOK_CLOSE_PAREN_ASTERISK 36 /* )* */ #define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */ #define XML_TOK_COMMA 38 /* The following token is returned only by XmlAttributeValueTok */ #define XML_TOK_ATTRIBUTE_VALUE_S 39 /* The following token is returned only by XmlCdataSectionTok */ #define XML_TOK_CDATA_SECT_CLOSE 40 /* With namespace processing this is returned by XmlPrologTok for a * name with a colon. */ #define XML_TOK_PREFIXED_NAME 41 #define XML_N_STATES 3 #define XML_PROLOG_STATE 0 #define XML_CONTENT_STATE 1 #define XML_CDATA_SECTION_STATE 2 #define XML_N_LITERAL_TYPES 2 #define XML_ATTRIBUTE_VALUE_LITERAL 0 #define XML_ENTITY_VALUE_LITERAL 1 /* The size of the buffer passed to XmlUtf8Encode must be at least this. */ #define XML_UTF8_ENCODE_MAX 4 /* The size of the buffer passed to XmlUtf16Encode must be at least this. 
*/ #define XML_UTF16_ENCODE_MAX 2 typedef struct position { /* first line and first column are 0 not 1 */ unsigned long lineNumber; unsigned long columnNumber; } POSITION; typedef struct { const char *name; const char *valuePtr; const char *valueEnd; char normalized; } ATTRIBUTE; struct encoding; typedef struct encoding ENCODING; /* typedef int (PTRCALL *SCANNER)(int *, TokDataStruct *, const ENCODING *, * const char *, * const char *, * const char **); */ typedef int (PTRCALL *SCANNER)(int *, TokDataStruct *, const ENCODING *, int *, char*, const char **); /** * This struct is the type which represents encoding. This is struct is assigned * values in xmltok.c. * @see for examplestatic const struct normal_encoding internal_utf8_encoding_ns * in xmltok.c */ struct encoding { SCANNER scanners[XML_N_STATES]; int (PTRCALL *nameMatchesAscii)(const ENCODING *, const char *, const char *, const char *); /* Converts to utf8 which is the parse output encoding which * is decided in * xpp_context_t* ct = (xpp_context_t*) parser_create("UTF-8"); */ void (PTRCALL *utf8Convert)(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim); void (PTRCALL *utf16Convert)(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim); int minBytesPerChar; char isUtf8; char isUtf16; }; /* Scan the string starting at ptr until the end of the next complete * token, but do not scan past eptr. Return an integer giving the * type of token. * * Return XML_TOK_NONE when ptr == eptr; nextTokPtr will not be set. * * Return XML_TOK_PARTIAL when the string does not contain a complete * token; nextTokPtr will not be set. * * Return XML_TOK_INVALID when the string does not start a valid * token; nextTokPtr will be set to point to the character which made * the token invalid. * * Otherwise the string starts with a valid token; nextTokPtr will be * set to point to the character following the end of that token. 
* * Each data character counts as a single token, but adjacent data * characters may be returned together. Similarly for characters in * the prolog outside literals, comments and processing instructions. */ #define XmlTok(parserState, data, enc, state, numOfChars, end, ptr) \ (((enc)->scanners[state])(parserState, data, enc, numOfChars, end, ptr)) #define XmlPrologTok(parserState, data, enc, numOfChars, end, ptr) \ XmlTok(parserState, data, enc, XML_PROLOG_STATE, numOfChars, end, ptr) #define XmlContentTok(parserState, data, enc, numOfChars, end, ptr) \ XmlTok(parserState, data,enc, XML_CONTENT_STATE, numOfChars, end, ptr) #define XmlNameMatchesAscii(enc, ptr1, end1, ptr2) \ (((enc)->nameMatchesAscii)(enc, ptr1, end1, ptr2)) #define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim) \ (((enc)->utf8Convert)(enc, fromP, fromLim, toP, toLim)) #define XmlUtf16Convert(enc, fromP, fromLim, toP, toLim) \ (((enc)->utf16Convert)(enc, fromP, fromLim, toP, toLim)) typedef struct { ENCODING initEnc; const ENCODING **encPtr; } INIT_ENCODING; int XmlParseXmlDecl(int isGeneralTextEntity, const ENCODING *enc, const char *ptr, const char *end, const char **badPtr, const char **versionPtr, const char **versionEndPtr, const char **encodingNamePtr, const ENCODING **namedEncodingPtr, int *standalonePtr); int SppUtf8Convert(const ENCODING *, const char **, const char *, const char **, const char *); int XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name); const ENCODING *XmlGetUtf8InternalEncoding(void); const ENCODING *XmlGetUtf16InternalEncoding(void); int FASTCALL XmlUtf8Encode(int charNumber, char *buf); int FASTCALL XmlUtf16Encode(int charNumber, unsigned short *buf); int XmlSizeOfUnknownEncoding(void); typedef int (*CONVERTER)(void *userData, const char *p); ENCODING * XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, void *userData); int XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name); const ENCODING 
*XmlGetUtf8InternalEncodingNS(void); const ENCODING *XmlGetUtf16InternalEncodingNS(void); ENCODING * XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert, void *userData); #ifdef __cplusplus } #endif #endif /* not XmlTok_OF_AXIS_INCLUDED */ 1.1 ws-axis/c/src/xml/txpp/lib/spp_tokenizer.c Index: spp_tokenizer.c =================================================================== /* * Copyright 2003-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "spp.h" #ifndef PREFIX #define PREFIX(ident) ident #endif static int PTRCALL PREFIX(add_ptr)(char *ptr, TokDataStruct *data) { if (data->numOfPtrs == data->ptrBuffSize || !data->ptrBuff) { int sz = data->ptrBuffSize << 1; char **ptrBuff = (char **)malloc(sz << 2); if (!ptrBuff) return SPP_ERROR_NO_MEMORY; if (data->ptrBuff) { memmove(ptrBuff, data->ptrBuff, data->numOfPtrs << 2); free(data->ptrBuff); } data->ptrBuff = ptrBuff; data->ptrBuffSize = sz; /* printf("sz:%d\n", sz); */ } data->ptrBuff[data->numOfPtrs++] = ptr; return SPP_ERROR_NONE; } static int PTRCALL PREFIX(is_letter)(int byteType) { if(BT_NMSTRT == byteType || BT_HEX == byteType) return -1; else return 0; } static int PTRCALL PREFIX(is_name_start_char)(int byteType) { if(PREFIX(is_letter)(byteType) || BT_COLON == byteType) return -1; else return 0; } static int PTRCALL PREFIX(is_name_char)(int byteType) { if(PREFIX(is_letter)(byteType) || BT_DIGIT == byteType || BT_COLON == byteType || BT_MINUS == byteType || BT_NAME 
== byteType) return -1; else return 0; } static int PTRCALL PREFIX(doHomeWork)(TokDataStruct* data, int* parserState, char** ptr, int* numOfChars) { numOfChars -= MINBPC(enc); *ptr += MINBPC(enc); data->numOfPtrs = 0; *parserState = S_0; return SPP_ERROR_NONE; } static int PTRCALL PREFIX(tokenizeProlog)(int* parserState, TokDataStruct* data, const ENCODING *enc, int *numOfChars, char* end, const char **ptr) { /* printf("*numOfChars:%d\n", *numOfChars); */ int byteType; while(*numOfChars >= MINBPC(enc)) { byteType = BYTE_TYPE(enc, *ptr); /* printf("*ptr:%s\n", *ptr); */ /* printf("byteType:%d\n", byteType); */ /* printf("*parserState:%d\n", *parserState); */ switch (*parserState) { case S_0: if(BT_LT == byteType) { *parserState = S_13; } else data->type = PCDATA; break; case S_1: if(BT_QUEST == byteType) { data->type = END_TAG; *parserState = S_10; } else if (PREFIX(is_name_start_char)(byteType)) { data->type = START_TAG; *parserState = S_2; } else *parserState = S_14; break; case S_2: if (BT_GT == byteType) { *parserState = S_0; } else if (BT_QUEST == byteType) { *parserState = S_9; } else if (BT_S == byteType) { *parserState = S_3; } else if (!PREFIX(is_name_char)(byteType)) *parserState = S_14; break; case S_3: if (BT_GT == byteType) { PREFIX(doHomeWork)(data, parserState, ptr, numOfChars); return SPP_ERROR_NONE; } else if (BT_QUEST == byteType) *parserState = S_9; else if (PREFIX(is_name_start_char)(byteType)) { *parserState = S_4; } else if (BT_S != byteType) *parserState = S_14; break; case S_4: if (BT_EQUALS == byteType) { *parserState = S_6; } else if (BT_S == byteType) { *parserState = S_5; } else if (!PREFIX(is_name_char)(byteType)) *parserState = S_14; break; case S_5: if (BT_EQUALS == byteType) *parserState = S_6; else if (BT_S != byteType) *parserState = S_14; break; case S_6: if (BT_APOS == byteType) { *parserState = S_8; } else if (BT_QUOT == byteType) { *parserState = S_7; } else if (BT_S != byteType) *parserState = S_14; break; case S_7: if (BT_QUOT 
== byteType) { *parserState = S_3; } else if(BT_LT == byteType || BT_AMP == byteType) { *parserState = S_14; /* some problem exists. fix! */ } break; case S_8: if (BT_APOS == byteType) { *parserState = S_3; } else if (BT_LT == byteType || BT_AMP == byteType) *parserState = S_14; break; case S_9: data->type = EMPTY_ELEMENT_TAG; if (BT_GT == byteType) { PREFIX(doHomeWork)(data, parserState, ptr, numOfChars); return SPP_ERROR_NONE; } else *parserState = S_14; break; case S_10: if (PREFIX(is_name_start_char)(byteType)) { *parserState = S_11; } else *parserState = S_14; break; case S_11: if (BT_GT == byteType) { *parserState = S_0; } else if (BT_S == byteType) { *parserState = S_12; } else if (!PREFIX(is_name_char)(byteType)) *parserState = S_14; break; case S_12: if (BT_GT == byteType) { *parserState = S_0; } else if (BT_S != byteType) *parserState = S_14; break; case S_13: if(BT_QUEST == byteType) *parserState = S_1; else *parserState = S_14; break; case S_14: printf("some problem exists. fix it!\n"); default: return -(*parserState); } *numOfChars -= MINBPC(enc); *ptr += MINBPC(enc); } return SPP_ERROR_TOKENIZER_FAILED; } static int PTRCALL PREFIX(tokenizeContent)(int* parserState, TokDataStruct* data, const ENCODING *enc, int *numOfChars, char* end, const char **ptr) { /* printf("*numOfChars:%d\n", *numOfChars); */ int byteType; while(*numOfChars >= MINBPC(enc)) { byteType = BYTE_TYPE(enc, *ptr); /*printf("*ptr:%s\n", *ptr);*/ /* printf("byteType:%d\n", byteType); */ /* printf("*parserState:%d\n", *parserState); */ switch (*parserState) { case S_0: /* if ('<' == ch) { */ if(BT_LT == byteType) { if (data->numOfPtrs) { PREFIX(add_ptr)(*ptr - 1, data); return SPP_ERROR_NONE; } *parserState = S_1; } else if (!data->numOfPtrs) { data->type = PCDATA; PREFIX(add_ptr)(*ptr, data); } break; case S_1: /* if ('/' == ch) { */ if(BT_SOL == byteType) { data->type = END_TAG; *parserState = S_10; } else if (PREFIX(is_name_start_char)(byteType)) { data->type = START_TAG; 
PREFIX(add_ptr)(*ptr, data); *parserState = S_2; } else *parserState = S_13; break; case S_2: if (BT_GT == byteType) { if (data->numOfPtrs) { PREFIX(add_ptr)(*ptr - 1, data); return SPP_ERROR_NONE; } *parserState = S_0; } else if (BT_SOL == byteType) { PREFIX(add_ptr)(*ptr - 1, data); *parserState = S_9; } /* else if (is_white_space(ch)) { */ else if (BT_S == byteType) { PREFIX(add_ptr)(*ptr - 1, data); *parserState = S_3; } /* else if (!is_name_char(ch)) */ else if (!PREFIX(is_name_char)(byteType)) *parserState = S_13; break; case S_3: if (BT_GT == byteType) { if (data->numOfPtrs) return SPP_ERROR_NONE; *parserState = S_0; } /* else if ('/' == ch) */ else if (BT_SOL == byteType) *parserState = S_9; else if (PREFIX(is_name_start_char)(byteType)) { PREFIX(add_ptr)(*ptr, data); *parserState = S_4; } else if (BT_S != byteType) *parserState = S_13; break; case S_4: /* if ('=' == ch) { */ if (BT_EQUALS == byteType) { PREFIX(add_ptr)(*ptr - 1, data); *parserState = S_6; } else if (BT_S == byteType) { PREFIX(add_ptr)(*ptr - 1, data); *parserState = S_5; } else if (!PREFIX(is_name_char)(byteType)) *parserState = S_13; break; case S_5: /* if ('=' == ch) */ if (BT_EQUALS == byteType) *parserState = S_6; else if (BT_S != byteType) *parserState = S_13; break; case S_6: /* if ('\'' == ch) { */ if (BT_APOS == byteType) { PREFIX(add_ptr)(*ptr, data); *parserState = S_8; } /* else if ('"' == ch) { */ else if (BT_QUOT == byteType) { PREFIX(add_ptr)(*ptr, data); *parserState = S_7; } else if (BT_S != byteType) *parserState = S_13; break; case S_7: /* if ('"' == ch) { */ if (BT_QUOT == byteType) { PREFIX(add_ptr)(*ptr, data); *parserState = S_3; } /* else if ('<' == ch || '&' == ch) */ else if (BT_LT == byteType || BT_AMP == byteType) *parserState = S_13; /* some problem exists. fix it!. 
*/ break; case S_8: /* if ('\'' == byteType) { */ if (BT_APOS == byteType) { PREFIX(add_ptr)(*ptr, data); *parserState = S_3; } /* else if ('<' == ch || '&' == ch) */ else if (BT_LT == byteType || BT_AMP == byteType) *parserState = S_13; break; case S_9: data->type = EMPTY_ELEMENT_TAG; /* if ('>' == ch) { */ if (BT_GT == byteType) { if (data->numOfPtrs) return SPP_ERROR_NONE; *parserState = S_0; } else *parserState = S_13; break; case S_10: if (PREFIX(is_name_start_char)(byteType)) { PREFIX(add_ptr)(*ptr, data); *parserState = S_11; } else *parserState = S_13; break; case S_11: /* if ('>' == ch) { */ if (BT_GT == byteType) { if (data->numOfPtrs) { PREFIX(add_ptr)(*ptr - 1, data); return SPP_ERROR_NONE; } *parserState = S_0; } else if (BT_S == byteType) { PREFIX(add_ptr)(*ptr - 1, data); *parserState = S_12; } else if (!PREFIX(is_name_char)(byteType)) *parserState = S_13; break; case S_12: /* if ('>' == ch) { */ if (BT_GT == byteType) { if (data->numOfPtrs) return SPP_ERROR_NONE; *parserState = S_0; } else if (BT_S != byteType) *parserState = S_13; break; case S_13: /* some problem exists. fix it!. */ default: return -(*parserState); } *numOfChars -= MINBPC(enc); *ptr += MINBPC(enc); } return SPP_ERROR_TOKENIZER_FAILED; } static int PTRCALL PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *end1, const char *ptr2) { for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { if (ptr1 == end1) return 0; if (!CHAR_MATCHES(enc, ptr1, *ptr2)) return 0; } return ptr1 == end1; } 1.1 ws-axis/c/src/xml/txpp/lib/spp_tokenizer.h Index: spp_tokenizer.h =================================================================== /* * Copyright 2003-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Byte-type classes that BYTE_TYPE() yields for the tokenizer.  The
 * enumerators rely on their implicit values (the number in each comment),
 * so the order must not be changed.
 */
enum {
  BT_NONXML,   /* 0: */
  BT_MALFORM,  /* 1: */
  BT_LT,       /* 2: less than */
  BT_AMP,      /* 3: ampersand */
  BT_RSQB,     /* 4: */
  BT_LEAD2,    /* 5: */
  BT_LEAD3,    /* 6: */
  BT_LEAD4,    /* 7: */
  BT_TRAIL,    /* 8: */
  BT_CR,       /* 9: carriage return */
  BT_LF,       /* 10: line feed (new line) */
  BT_GT,       /* 11: greater than */
  BT_QUOT,     /* 12: double quote */
  BT_APOS,     /* 13: single quote */
  BT_EQUALS,   /* 14: equal sign */
  BT_QUEST,    /* 15: question mark (?) */
  BT_EXCL,     /* 16: */
  BT_SOL,      /* 17: slash */
  BT_SEMI,     /* 18: */
  BT_NUM,      /* 19: */
  BT_LSQB,     /* 20: */
  BT_S,        /* 21: space */
  BT_NMSTRT,   /* 22: name start character, e.g. '_' */
  BT_COLON,    /* 23: colon */
  BT_HEX,      /* 24: hexadecimal characters */
  BT_DIGIT,    /* 25: digit */
  BT_NAME,     /* 26: name character, e.g. dot */
  BT_MINUS,    /* 27: dash (minus sign) */
  BT_OTHER,    /* 28: known not to be a name or name start character,
                *     for example the ` character */
  BT_NONASCII, /* 29: might be a name or name start character */
  BT_PERCNT,   /* 30: */
  BT_LPAR,     /* 31: */
  BT_RPAR,     /* 32: */
  BT_AST,      /* 33: */
  BT_PLUS,     /* 34: */
  BT_COMMA,    /* 35: */
  BT_VERBAR    /* 36: */
};

/* NOTE(review): the operand of this directive is missing from the archived
 * text; restore it from the repository copy. */
#include