axis-java-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dami...@apache.org
Subject cvs commit: ws-axis/c/src/xml/txpp/lib spp.c spp.h spp_converter.c spp_converter.h spp_tokenizer.c spp_tokenizer.h Makefile.am xmltok.c xmltok.h xmltok_impl.c xmltok_impl.h xmltok_ns.c xpp.c xpp.h
Date Tue, 20 Jul 2004 05:17:48 GMT
damitha     2004/07/19 22:17:48

  Modified:    c/src/xml/txpp/lib Makefile.am
  Added:       c/src/xml/txpp/lib spp.c spp.h spp_converter.c
                        spp_converter.h spp_tokenizer.c spp_tokenizer.h
  Removed:     c/src/xml/txpp/lib xmltok.c xmltok.h xmltok_impl.c
                        xmltok_impl.h xmltok_ns.c xpp.c xpp.h
  Log:
  
  
  Revision  Changes    Path
  1.3       +1 -1      ws-axis/c/src/xml/txpp/lib/Makefile.am
  
  Index: Makefile.am
  ===================================================================
  RCS file: /home/cvs/ws-axis/c/src/xml/txpp/lib/Makefile.am,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Makefile.am	13 Jul 2004 07:38:54 -0000	1.2
  +++ Makefile.am	20 Jul 2004 05:17:48 -0000	1.3
  @@ -1,5 +1,5 @@
   lib_LTLIBRARIES = libtxpp.la
   AM_CPPFLAGS = -Wall -g -DHAVE_XPP_CONFIG_H
  -libtxpp_la_SOURCES = xpp.c xmltok.c
  +libtxpp_la_SOURCES = spp.c spp_converter.c
   libtxpp_la_LIBADD =
   INCLUDES = -I./ -I../
  
  
  
  1.1                  ws-axis/c/src/xml/txpp/lib/spp.c
  
  Index: spp.c
  ===================================================================
  /*
   *   Copyright 2003-2004 The Apache Software Foundation.
   *
   *   Licensed under the Apache License, Version 2.0 (the "License");
   *   you may not use this file except in compliance with the License.
   *   You may obtain a copy of the License at
   *
   *       http://www.apache.org/licenses/LICENSE-2.0
   *
   *   Unless required by applicable law or agreed to in writing, software
   *   distributed under the License is distributed on an "AS IS" BASIS,
   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   *   See the License for the specific language governing permissions and
   *   limitations under the License.
   */
  
  #include <string.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include "internal.h"
  #include "spp.h"
  #include "spp_converter.h"
  #ifdef HAVE_XPP_CONFIG_H
  #include "../xpp_config.h"
  #endif
  #include "tag.h"
  
  
  /* Memory-suite shorthands. Each one calls through the function pointers
   * stored in ct->m_mem, so they may only be used where a parser pointer
   * named `ct` is in scope. */
  #define MALLOC(s) (ct->m_mem.mallocFcn((s)))
  #define MEMMOVE(p1, p2, s) (ct->m_mem.memMoveFcn((p1), (p2), (s)))
  #define REALLOC(p,s) (ct->m_mem.reallocFcn((p),(s)))
  #define FREE(p) (ct->m_mem.freeFcn((p)))
  /* loadBuffer() doubles the parse buffer when fewer than this many bytes
   * would be free after compaction. */
  #define MIN_BUFF_SZ 128    /* 0x10000 */
  /* Size of the parse buffer (and of the UTF-8 buffer) at creation time. */
  #define INIT_BUFFER_SIZE 128 /* keep INIT_BUFFER_SIZE <= MIN_BUFF_SZ */
  /* Field shorthands. Like the macros above they expand to members of `ct`,
   * which also means these identifiers cannot be used as local variable
   * names in this file. */
  #define protocolEncodingName (ct->m_protocolEncodingName)
  #define initEncoding (ct->m_initEncoding)
  #define encoding (ct->m_encoding)
  #define ns (ct->m_ns)
  #define tokState (ct->m_tokState)
  #define state (ct->m_state)
  #define numOfChars (ct->m_numOfChars)
  #define dataCounter (ct->m_dataCounter)
  #define namespaceSeparator (ct->m_namespaceSeparator)
  
  
  /* Set to 1 by getBlock() once stdin reports end-of-file. loadBuffer()
   * refuses to refill while it is non-zero. This is a file-scope global,
   * so it is shared by every parser created in the process. */
  int isDone = 0;

  /* Values held in SPPContext.m_tokState: still tokenizing the prolog, or
   * already inside the document content. */
  enum 
  {
      PROLOG, CONTENT
  };
  
  /** Struct which represents the parser object. All the member variables of
    * this parser object are prefixed with m_ to mark them as parser member
    * variables throughout the code. */
  typedef struct SPPContext
  {
      char *m_buff;/* Raw input buffer that the tokenizer pointers refer to */
      char *m_utf8Buff;/* Buffer receiving the UTF-8 converted token text */
      int m_buffSize;/* Size in bytes of m_buff */
      int m_utf8BuffSize;/* Size in bytes of m_utf8Buff */
      char *m_prevTokPoint;/* Previous tokenizing point of the buffer. While the
                              tokenizer (tokenizeContent in spp_tokenizer.c) runs,
                              this keeps pointing at the end of the data
                              tokenized so far; only m_currentTokPoint is
                              advanced inside the tokenizer. */
      char *m_currentTokPoint;/* This points to the current tokenizing point of the buffer*/
      int m_numOfChars;/* Number of characters delivered by the last getBlock call */
      int m_state;/* Tokenizer state machine value (S_0 ... from spp.h) */
      int m_tokState;/* Tokenizing state. Holds PROLOG or CONTENT */
      TokDataStruct m_data;/* Pointers to the tokenized data */
      int m_dataCounter;/* Index into m_data.ptrBuff used by the getNext*AsInt helpers */
      const XML_Char *m_protocolEncodingName;/* Encoding name*/
      INIT_ENCODING m_initEncoding;/* Encoding structure*/
      const ENCODING *m_encoding;/* Encoding structure*/
      XML_Bool m_ns;/* Set to XML_FALSE at creation */
      const SppMemoryHandlingSuite m_mem;/* Memory handling suite; filled in through a
                                            cast in parserCreate_in because it is const */
      XML_Char m_namespaceSeparator;/* Set to '!' at creation */
      
      /* Callback that refills the buffer: writes up to buffSize bytes at
       * buff and stores the number of bytes read in *numchars. */
      int (*getBlock)(char *buff, int buffSize, int *numchars);

  } SPPContext;
  
  
  /**
   * Parse the XML declaration lying between s and currentTokPoint and
   * re-initialise the parser encoding from what it declares.
   *
   * @param ct                  parser whose encoding fields are updated
   * @param isGeneralTextEntity forwarded unchanged to XmlParseXmlDecl
   * @param s                   first byte of the declaration
   * @param currentTokPoint     end of the declaration
   * @return SPP_ERROR_SYNTAX when XmlParseXmlDecl rejects the declaration,
   *         SPP_ERROR_NONE otherwise
   */
  static enum SPP_Error
  processXmlDecl(SPPParser* ct, int isGeneralTextEntity,
                 const char *s, const char *currentTokPoint)
  {
      const char *version = NULL;
      const char *versionend = NULL;
      int standalone = -1;

      if (!(XmlParseXmlDecl)(isGeneralTextEntity,
          /*Default encoding*/
          encoding,
          s,
          currentTokPoint,
          0,
          &version,
          &versionend,
          &protocolEncodingName,
          /*Encoding is taken from the xml file declaration*/
          &encoding,
          &standalone))
      {
          return SPP_ERROR_SYNTAX;
      }

      /* Terminate the encoding name in place. The original expression
       * protocolEncodingName + (m_currentTokPoint - protocolEncodingName) - 3
       * is simply m_currentTokPoint - 3, so write through the non-const
       * buffer pointer instead of through the const name pointer, and skip
       * the write when no name is known at all.
       * NOTE(review): this assumes the declaration ends with the encoding
       * value immediately followed by the closing quote and "?>" - confirm. */
      if (protocolEncodingName != NULL)
          ct->m_currentTokPoint[-3] = '\0';

      initializeEncoding(ct);
      return SPP_ERROR_NONE;
  }
  
  /* Status of the most recent tokenizer call; written by parseProlog() and
   * parseContent(). */
  int ret_status;

  /**
   * Tokenize the XML declaration, refilling the buffer until it is complete.
   *
   * Previously the function returned SPP_ERROR_NONE unconditionally from
   * inside the loop, so the refill loop and the failure result were
   * unreachable. It now mirrors parseContent(): it retries after each
   * successful loadBuffer() and, once the declaration has been processed,
   * switches the parser to CONTENT so that parseContent() does not try to
   * tokenize a prolog a second time.
   *
   * @param ct parser to advance
   * @return SPP_ERROR_NONE when the prolog was tokenized (or the parser is
   *         already past it), SPP_ERROR_PARSE_FAILED when the input ended
   *         first
   */
  enum SPP_Error parseProlog(SPPParser* ct)
  {
      ct->m_data.numOfPtrs = 0;
      ct->m_data.numOfPtrsUtf8 = 0;
      ct->m_prevTokPoint = ct->m_currentTokPoint;
      ret_status = 0;

      if (PROLOG != tokState)
          return SPP_ERROR_NONE;

      do
      {
          initializeEncoding(ct);
          /* XmlPrologTok is defined in spp_converter.h */
          ret_status = XmlPrologTok(&state, &ct->m_data, encoding, &numOfChars,
              ct->m_prevTokPoint, &ct->m_currentTokPoint);
          if (SPP_ERROR_NONE == ret_status)
          {
              processXmlDecl(ct, 0, ct->m_prevTokPoint, ct->m_currentTokPoint);
              ct->m_prevTokPoint = ct->m_currentTokPoint;
              tokState = CONTENT;
              return SPP_ERROR_NONE;
          }
          /* loadBuffer() yields non-zero while it managed to read more input. */
      } while (loadBuffer(ct));

      return SPP_ERROR_PARSE_FAILED;
  }
  
  /**
   * Tokenize the next element of the document into ct->m_data.
   *
   * While the parser is still in the PROLOG phase the XML declaration is
   * tokenized and processed first. Whenever a tokenizer call does not
   * succeed, more input is requested through loadBuffer() and the attempt
   * is repeated.
   *
   * @param ct parser to advance
   * @return SPP_ERROR_NONE once XmlContentTok succeeds,
   *         SPP_ERROR_PARSE_FAILED when loadBuffer() reports no more input
   */
  enum SPP_Error parseContent(SPPParser* ct)
  {
      int gotToken = 0;

      ct->m_data.numOfPtrs = 0;
      ct->m_data.numOfPtrsUtf8 = 0;
      /* Both tokenizing points start out at the same position. */
      ct->m_prevTokPoint = ct->m_currentTokPoint;

      do
      {
          ret_status = 0;

          if (tokState == PROLOG)
          {
              initializeEncoding(ct);
              /* XmlPrologTok (spp_converter.h) leaves m_prevTokPoint alone
               * and moves m_currentTokPoint forward. */
              ret_status = XmlPrologTok(&state, &ct->m_data, encoding, &numOfChars,
                  ct->m_prevTokPoint, &ct->m_currentTokPoint);
              if (ret_status == SPP_ERROR_NONE)
              {
                  processXmlDecl(ct, 0, ct->m_prevTokPoint, ct->m_currentTokPoint);
                  ct->m_prevTokPoint = ct->m_currentTokPoint;
              }
          }

          if (ret_status == SPP_ERROR_NONE)
          {
              tokState = CONTENT;
              /* XmlContentTok (spp_converter.h) advances m_currentTokPoint
               * until a valid tag element is completed. */
              ret_status = XmlContentTok(&state, &ct->m_data, encoding,
                  &numOfChars, ct->m_prevTokPoint, &ct->m_currentTokPoint);
              gotToken = (ret_status == SPP_ERROR_NONE);
          }
      } while (!gotToken && loadBuffer(ct));

      return gotToken ? SPP_ERROR_NONE : SPP_ERROR_PARSE_FAILED;
  }
  
  /**
   * Compact the parse buffer and read more input behind the pending data.
   *
   * The bytes between m_prevTokPoint and m_currentTokPoint (an element that
   * is not complete yet) are moved to the start of the buffer; everything
   * before m_prevTokPoint is discarded. If fewer than MIN_BUFF_SZ bytes
   * would then be free, a buffer of twice the size is used instead. The
   * token pointers already recorded in m_data.ptrBuff are rebased so they
   * keep pointing at the same characters.
   *
   * The growth path used to realloc first and then derive the pointer
   * adjustment from m_prevTokPoint, which still pointed into the old block;
   * whenever realloc moved the block the recorded pointers were corrupted.
   * The data is now copied into a fresh block and rebased before the old
   * block is released.
   *
   * Return convention (relied on by `while (loadBuffer(ct))` in the parse
   * functions): non-zero (SPP_ERROR_READ_BLOCK) means the buffer was
   * refilled and tokenizing can be retried; zero (SPP_ERROR_NONE) means
   * nothing more can be loaded - end of input was seen, getBlock returned
   * non-zero, or the larger buffer could not be allocated.
   */
  int loadBuffer(SPPParser *ct)
  {
      int unusedData;      /* pending bytes that must be kept */
      int toBeFilledSize;  /* free bytes behind the pending data */
      int grownSize;
      char *target;        /* buffer that will hold the compacted data */
      int ii;

      if(isDone)
          return SPP_ERROR_NONE;

      unusedData = (int)(ct->m_currentTokPoint - ct->m_prevTokPoint);
      toBeFilledSize = ct->m_buffSize - unusedData;
      grownSize = ct->m_buffSize;
      target = ct->m_buff;

      if (toBeFilledSize < MIN_BUFF_SZ)
      {
          grownSize = ct->m_buffSize * 2;
          target = (char *) MALLOC(grownSize);
          if (target == NULL)
              return SPP_ERROR_NONE; /* old buffer stays intact; callers stop */
          toBeFilledSize = grownSize - unusedData;
      }

      /* Works for both cases: in place the regions may overlap, into a new
       * block they do not. */
      MEMMOVE(target, ct->m_prevTokPoint, unusedData);

      /* Rebase the recorded token pointers while the old positions are
       * still valid. */
      for (ii = 0; ii < ct->m_data.numOfPtrs; ii++)
          ct->m_data.ptrBuff[ii] =
              target + (ct->m_data.ptrBuff[ii] - ct->m_prevTokPoint);

      if (target != ct->m_buff)
      {
          FREE(ct->m_buff);
          ct->m_buff = target;
          ct->m_buffSize = grownSize;
      }
      ct->m_prevTokPoint = target;
      ct->m_currentTokPoint = target + unusedData;

      if(!ct->getBlock(ct->m_currentTokPoint, toBeFilledSize, &numOfChars))
          return SPP_ERROR_READ_BLOCK;

      return SPP_ERROR_NONE;
  }
  
  int getBlock(char *buff, int toBeFilledSize, int* numchars)
  {
      int len;
      int done;
      *numchars = 0;
      len = fread(buff, 1, toBeFilledSize, stdin);
      /*printf("len:%d\n", len);*/
      *numchars += len;
          if (ferror(stdin))
          {
              fprintf(stderr, "Read error\n");
              exit(-1);
          }
          done = feof(stdin);
      if(done) 
          isDone = 1;
      return SPP_ERROR_NONE;
  }
  
  
  SPPParser* parserCreate(const XML_Char *encodingName)
  {
      return parserCreate_mh(encodingName, NULL, NULL);
  }
  
  /**
   * Create a parser with an optional caller-supplied memory suite.
   * All three arguments are forwarded unchanged to parserCreate_in().
   */
  SPPParser* parserCreate_mh(const XML_Char *encodingName, 
      const SppMemoryHandlingSuite *memsuite,
      XML_Char nsSep)
  {
      SPPParser *created = parserCreate_in(encodingName, memsuite, nsSep);

      return created;
  }
  
  /**
   * Create and initialise a parser.
   *
   * @param encodingName encoding name stored in the parser, may be NULL
   * @param memsuite     memory functions to use; NULL selects malloc,
   *                     memmove, realloc and free
   * @param nsSep        currently unused (see note in the body)
   * @return the new parser, or NULL when any allocation or the
   *         initialisation failed; nothing is leaked in that case
   */
  static SPPParser* parserCreate_in(const XML_Char *encodingName,
      const SppMemoryHandlingSuite *memsuite,
      XML_Char nsSep)
  {
      SPPParser* ct;
      SppMemoryHandlingSuite *mtemp;

      /* NOTE(review): nsSep has never been honoured here; the separator is
       * hard-wired to '!' below. Kept as is to preserve behaviour. */
      (void)nsSep;

      if (memsuite)
          ct = (SPPParser*) memsuite->mallocFcn(sizeof(struct SPPContext));
      else
          ct = (SPPParser*) malloc(sizeof(struct SPPContext));
      if (ct == NULL)
          return NULL;

      /* m_mem is declared const in the struct, so it is filled in through
       * a cast. */
      mtemp = (SppMemoryHandlingSuite *)&(ct->m_mem);
      if (memsuite)
      {
          mtemp->mallocFcn = memsuite->mallocFcn;
          mtemp->memMoveFcn = memsuite->memMoveFcn;
          mtemp->reallocFcn = memsuite->reallocFcn;
          mtemp->freeFcn = memsuite->freeFcn;
      }
      else /* Use system memory handling functions*/
      {
          mtemp->mallocFcn = malloc;
          mtemp->memMoveFcn = memmove;
          mtemp->reallocFcn = realloc;
          mtemp->freeFcn = free;
      }

      /* Allocate both buffers through the suite: they are later resized
       * with the suite's realloc, so they must come from the same
       * allocator. */
      ct->m_buffSize = INIT_BUFFER_SIZE;
      ct->m_utf8BuffSize = ct->m_buffSize;
      ct->m_buff = (char*) MALLOC(ct->m_buffSize * sizeof(char));
      ct->m_utf8Buff = (char*) MALLOC(ct->m_utf8BuffSize * sizeof(char));

      namespaceSeparator = '!';
      ns = XML_FALSE;

      if (ct->m_buff != NULL && ct->m_utf8Buff != NULL
          && SPP_ERROR_NONE == parserInit(ct, encodingName))
          return ct;

      /* Failure: release whatever was obtained. A custom freeFcn is not
       * required to accept NULL, hence the guards. */
      if (ct->m_buff != NULL)
          FREE(ct->m_buff);
      if (ct->m_utf8Buff != NULL)
          FREE(ct->m_utf8Buff);
      FREE(ct);
      return NULL;
  }
  
  /**
   * Put a freshly allocated parser into its start state: empty buffer,
   * tokenizer state S_0, PROLOG phase, stdin-based getBlock callback and
   * no token pointer tables.
   *
   * @return SPP_ERROR_NONE, or SPP_ERROR_PARSER_INIT_FAILED when ct is NULL
   */
  static int parserInit(SPPParser* ct, const XML_Char *encodingName)
  {
      if (!ct)
          return SPP_ERROR_PARSER_INIT_FAILED;

      /* Buffer positions and counters. */
      ct->m_prevTokPoint = ct->m_buff;
      ct->m_currentTokPoint = ct->m_buff;
      numOfChars = 0;
      dataCounter = 0;

      /* Tokenizer state. */
      state = S_0;
      tokState = PROLOG;
      protocolEncodingName = encodingName;

      /* Token pointer tables are created on demand. */
      ct->m_data.ptrBuff = NULL;
      ct->m_data.utf8PtrBuff = NULL;
      ct->m_data.ptrBuffSize = 8;

      ct->getBlock = getBlock;

      return SPP_ERROR_NONE;
  }
  
  /**
   * Initialise ct's encoding from the protocol encoding name.
   *
   * With XML_UNICODE the name is first narrowed into a local char buffer;
   * a name that is too long or contains a non-ASCII character is replaced
   * by the empty string.
   *
   * @return SPP_ERROR_NONE when XmlInitEncoding accepts the name,
   *         SPP_ERROR_UNKNOWN_ENCODING otherwise (the function previously
   *         fell off its end without returning a value in that case)
   */
  static int initializeEncoding(SPPParser* ct)
  {
      const char *s;
      #ifdef XML_UNICODE
          char encodingBuf[128];
          if (!protocolEncodingName)
              s = NULL;
          else 
          {
              int i;
              for (i = 0; protocolEncodingName[i]; i++) 
              {
                  if (i == sizeof(encodingBuf) - 1
                      || (protocolEncodingName[i] & ~0x7f) != 0) 
                  {
                      encodingBuf[0] = '\0';
                      break;
                  }
                  encodingBuf[i] = (char)protocolEncodingName[i];
              }
              encodingBuf[i] = '\0';
              s = encodingBuf;
          }
      #else
      s = protocolEncodingName;
      #endif
      if (XmlInitEncoding(&initEncoding,
          &encoding, s))
          return SPP_ERROR_NONE;

      return SPP_ERROR_UNKNOWN_ENCODING;
  }
  
  /**
   * Destroy a parser and the buffers it owns.
   *
   * The function is declared to return void* but used to return nothing;
   * it also released only the context itself, with plain free(), leaking
   * both data buffers and ignoring a caller-supplied memory suite.
   *
   * @param ct parser to destroy; NULL is accepted and ignored
   * @return always NULL, so callers can write `p = parserFree(p);`
   */
  void* parserFree(SPPParser* ct)
  {
      void (*releaseFcn)(void *ptr);

      if (ct == NULL)
          return NULL;

      /* Copy the pointer first: the suite lives inside the block that is
       * about to be released. */
      releaseFcn = ct->m_mem.freeFcn;

      /* The data buffers are resized through the suite's realloc, so they
       * are returned through the suite's free. */
      if (ct->m_buff != NULL)
          releaseFcn(ct->m_buff);
      if (ct->m_utf8Buff != NULL)
          releaseFcn(ct->m_utf8Buff);

      /* The UTF-8 pointer table is allocated with malloc in addUtf8Ptr. */
      free(ct->m_data.utf8PtrBuff);

      /* NOTE(review): m_data.ptrBuff is allocated by the tokenizer, which
       * is not visible here; it is left alone until its allocator is
       * confirmed. */

      releaseFcn(ct);
      return NULL;
  }
  
  /**
   * Advance to the next element.
   *
   * Resets the attribute cursor and runs parseContent().
   *
   * @return pointer to the parser's token data on success, NULL when
   *         parseContent() does not return SPP_ERROR_NONE
   */
  TokDataStruct* next(SPPParser* ct)
  {
      dataCounter = 0;

      /*processData(ct, encoding);*/
      return (SPP_ERROR_NONE == parseContent(ct)) ? &ct->m_data : NULL;
  }
  
  /**
   * Append ptr to data->utf8PtrBuff, creating or doubling the table first
   * when it does not exist yet or numOfPtrsUtf8 has reached ptrBuffSize.
   *
   * The allocation and copy sizes used to be computed with `<< 2`, i.e.
   * assuming 4-byte pointers; on platforms with wider pointers that
   * under-allocated the table. They are now derived from sizeof.
   *
   * NOTE(review): ptrBuffSize is the only capacity field in TokDataStruct
   * and this function overwrites it; if the tokenizer uses the same field
   * for ptrBuff the two tables can disagree about their real capacity.
   *
   * @return SPP_ERROR_NONE, or SPP_ERROR_NO_MEMORY when the table could not
   *         be allocated (the existing table is left untouched)
   */
  static int addUtf8Ptr(char *ptr, TokDataStruct *data)
  {
      if (data->numOfPtrsUtf8 == data->ptrBuffSize
          || !data->utf8PtrBuff) 
      {
          int sz = data->ptrBuffSize << 1;
          char **ptrBuff = (char **)malloc((size_t)sz * sizeof *ptrBuff);
          if (!ptrBuff)
              return SPP_ERROR_NO_MEMORY;
          if (data->utf8PtrBuff) 
          {
              memmove(ptrBuff, data->utf8PtrBuff,
                  (size_t)data->numOfPtrsUtf8 * sizeof *ptrBuff);
              free(data->utf8PtrBuff);
          }
          data->utf8PtrBuff = ptrBuff;
          data->ptrBuffSize = sz;
      }

      data->utf8PtrBuff[data->numOfPtrsUtf8++] = ptr;

      return SPP_ERROR_NONE;
  }
  
  
  /**
   * Convert every token recorded in ct->m_data.ptrBuff to the internal
   * encoding and record the converted spans in utf8PtrBuff.
   *
   * ptrBuff is read as (first character, last character) pairs. Each pair
   * is converted with SppUtf8Convert into m_utf8Buff, one after another,
   * and the first and last converted character are appended with
   * addUtf8Ptr. Processing stops at the first pair that is at most one
   * byte apart, and when memory runs out.
   *
   * Fixes over the previous version:
   *  - the function is void but contained `return SPP_ERROR_NO_MEMORY;`
   *  - m_utf8Buff was reallocated inside the loop after the output
   *    pointers had been taken from it, leaving them dangling; the buffer
   *    is now grown before any pointer into it is formed
   *  - m_utf8BuffSize was updated before the reallocation was known to
   *    have succeeded
   *  - an odd numOfPtrs made the loop read one entry past the table
   */
  static void processData(SPPParser* ct)
  {
      int intCount = 0;
      XML_Char *toPtr;

      if (ct->m_utf8BuffSize < ct->m_buffSize)
      {
          int grownSize = 2 * ct->m_buffSize;
          char *grown = (char *)REALLOC(ct->m_utf8Buff, grownSize);
          if (grown == NULL)
              return;
          ct->m_utf8Buff = grown;
          ct->m_utf8BuffSize = grownSize;
      }

      toPtr = (XML_Char *) ct->m_utf8Buff;

      while (intCount + 1 < ct->m_data.numOfPtrs)
      {
          const char *fromPtr = ct->m_data.ptrBuff[intCount];
          /* ptrBuff[intCount + 1] is the last character, so the end of the
           * source range is one past it. */
          const char *rawNameEnd = &ct->m_data.ptrBuff[intCount + 1][1];
          XML_Char *startPtr = toPtr; /* where this token's output begins */

          if (ct->m_data.ptrBuff[intCount + 1] - ct->m_data.ptrBuff[intCount] <= 1)
              return;

          SppUtf8Convert(encoding, &fromPtr, rawNameEnd,
              (ICHAR **)&toPtr, (ICHAR *)ct->m_utf8Buff + ct->m_utf8BuffSize);

          if (SPP_ERROR_NONE != addUtf8Ptr((char *)startPtr, &ct->m_data)
              || SPP_ERROR_NONE != addUtf8Ptr((char *)(toPtr - 1), &ct->m_data))
              return;

          intCount += 2;
      }
  }
  
  /**
   * Read the first token of the current element as a decimal integer.
   *
   * The token is ptrBuff[0] .. ptrBuff[1] (inclusive). The byte after it is
   * temporarily replaced by a terminator so that sscanf stops there, and
   * is restored afterwards.
   *
   * The saved byte used to be kept in a `char *` variable, there was no
   * check that two pointers exist, and a failed conversion returned an
   * uninitialised value while reporting success.
   *
   * @param ct         parser holding the current token data
   * @param parseError receives SPP_ERROR_NONE, or
   *                   SPP_ERROR_UNEXPECTED_TOKEN_CONTENT when there is no
   *                   token or it is not an integer
   * @return the parsed value, 0 on error
   */
  int getNextElementAsInt(SPPParser* ct, int* parseError)
  {
      int intTemp = 0;
      int converted;
      char saved;
      char *terminator;

      dataCounter = 0;

      if (ct->m_data.ptrBuff == NULL || ct->m_data.numOfPtrs < 2)
      {
          *parseError = SPP_ERROR_UNEXPECTED_TOKEN_CONTENT;
          return 0;
      }

      terminator = &ct->m_data.ptrBuff[dataCounter + 1][1];
      saved = *terminator;
      *terminator = '\0';
      converted = sscanf(ct->m_data.ptrBuff[dataCounter], "%d", &intTemp);
      *terminator = saved;

      if (converted != 1)
      {
          *parseError = SPP_ERROR_UNEXPECTED_TOKEN_CONTENT;
          return 0;
      }

      *parseError = SPP_ERROR_NONE;
      return intTemp;
  }
  
  /**
   * Read the next attribute token as a decimal integer.
   *
   * Prior to calling this method the state should be START_TAG.
   *
   * Advances the attribute cursor by one (first, last) pointer pair and
   * converts the token found there. The byte after the token is replaced
   * by a terminator for the duration of the conversion and then restored.
   *
   * The saved byte used to be kept in a `char *` variable. The bounds test
   * looked at numOfPtrsUtf8 although ptrBuff is what gets indexed; it now
   * checks the pair about to be read against numOfPtrs. A failed
   * conversion is reported instead of returning an uninitialised value.
   *
   * @param ct         parser holding the current token data
   * @param parseError receives SPP_ERROR_NONE, or
   *                   SPP_ERROR_UNEXPECTED_TOKEN_CONTENT when no further
   *                   token exists or it is not an integer
   * @return the parsed value, 0 on error
   */
  int getNextAttributeAsInt(SPPParser* ct, int* parseError)
  {
      int intTemp = 0;
      int converted;
      char saved;
      char *terminator;

      /* The pair to be read is ptrBuff[dataCounter + 2] / [dataCounter + 3]. */
      if (ct->m_data.ptrBuff == NULL
          || dataCounter + 3 >= ct->m_data.numOfPtrs)
      {
          *parseError = SPP_ERROR_UNEXPECTED_TOKEN_CONTENT;
          return 0;
      }
      dataCounter += 2;

      terminator = &ct->m_data.ptrBuff[dataCounter + 1][1];
      saved = *terminator;
      *terminator = '\0';
      converted = sscanf(ct->m_data.ptrBuff[dataCounter], "%d", &intTemp);
      *terminator = saved;

      if (converted != 1)
      {
          *parseError = SPP_ERROR_UNEXPECTED_TOKEN_CONTENT;
          return 0;
      }

      *parseError = SPP_ERROR_NONE;
      return intTemp;
  }
  
  
  
  
  1.1                  ws-axis/c/src/xml/txpp/lib/spp.h
  
  Index: spp.h
  ===================================================================
  /*
   *   Copyright 2003-2004 The Apache Software Foundation.
   *
   *   Licensed under the Apache License, Version 2.0 (the "License");
   *   you may not use this file except in compliance with the License.
   *   You may obtain a copy of the License at
   *
   *       http://www.apache.org/licenses/LICENSE-2.0
   *
   *   Unless required by applicable law or agreed to in writing, software
   *   distributed under the License is distributed on an "AS IS" BASIS,
   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   *   See the License for the specific language governing permissions and
   *   limitations under the License.
   */
  
  #include <stdlib.h>
  
  #ifndef XPP_H_OF_AXIS_INCLUDED
  #define XPP_H_OF_AXIS_INCLUDED
  /* Character types of the API, selected at compile time:
   *   XML_UNICODE + XML_UNICODE_WCHAR_T : XML_Char and XML_LChar are wchar_t
   *   XML_UNICODE only                  : XML_Char is unsigned short,
   *                                       XML_LChar is char
   *   neither                           : both are char
   */
  #ifdef XML_UNICODE     /* Information is UTF-16 encoded. */
  #ifdef XML_UNICODE_WCHAR_T
  typedef wchar_t XML_Char;
  typedef wchar_t XML_LChar;
  #else
  typedef unsigned short XML_Char;
  typedef char XML_LChar;
  #endif /* XML_UNICODE_WCHAR_T */
  #else                  /* Information is UTF-8 encoded. */
  typedef char XML_Char;
  typedef char XML_LChar;
  #endif /* XML_UNICODE */

  /* Boolean type and its two values. */
  typedef unsigned char XML_Bool;
  #define XML_TRUE   ((XML_Bool) 1)
  #define XML_FALSE  ((XML_Bool) 0)
  
  
  /* XML_T(x) casts a constant to the XML_Char flavour in use; XML_L(x)
   * adds the L prefix only in the wchar_t build. Both are the identity in
   * the non-Unicode build. */
  #ifdef XML_UNICODE

  #ifdef XML_UNICODE_WCHAR_T
  #define XML_T(x) (const wchar_t)x
  #define XML_L(x) L ## x
  #else
  #define XML_T(x) (const unsigned short)x
  #define XML_L(x) x
  #endif

  #else

  #define XML_T(x) x
  #define XML_L(x) x

  #endif


  /* Map the generic Xml* names onto their UTF-16 or UTF-8 variants and
   * choose ICHAR, the character type conversions are written into.
   * MUST_CONVERT tells whether text in encoding `enc` needs converting. */
  #ifdef XML_UNICODE
  #define XML_ENCODE_MAX XML_UTF16_ENCODE_MAX
  #define XmlConvert XmlUtf16Convert
  #define XmlGetInternalEncoding XmlGetUtf16InternalEncoding
  #define XmlEncode XmlUtf16Encode
  /* Also convert when s sits at an odd address. */
  #define MUST_CONVERT(enc, s) (!(enc)->isUtf16 || (((unsigned long)s) & 1))
  typedef unsigned short ICHAR;
  #else
  #define XML_ENCODE_MAX XML_UTF8_ENCODE_MAX
  #define XmlConvert XmlUtf8Convert
  #define XmlGetInternalEncoding XmlGetUtf8InternalEncoding
  #define XmlEncode XmlUtf8Encode
  #define MUST_CONVERT(enc, s) (!(enc)->isUtf8)
  typedef char ICHAR;
  #endif
  
  
  /** Tokenizing states. A parser starts in S_0. */
  enum 
  {
      S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,
      S_8, S_9, S_10, S_11, S_12, S_13, S_14
  };        

  /** Token kinds: start tag, end tag, empty-element tag and character data.
   *  NOTE(review): presumably the values stored in TokDataStruct.type -
   *  confirm against the tokenizer. */
  enum 
  {
      START_TAG, END_TAG, EMPTY_ELEMENT_TAG, PCDATA
  };
  
  /** Error codes. SPP_ERROR_NONE is the first enumerator and therefore 0,
   *  so every other code is non-zero. */
  enum SPP_Error 
  {
      SPP_ERROR_NONE,
      SPP_ERROR_NO_MEMORY,
      SPP_ERROR_SYNTAX,
      SPP_ERROR_NO_ELEMENTS,
      SPP_ERROR_INVALID_TOKEN,
      SPP_ERROR_UNCLOSED_TOKEN,
      SPP_ERROR_TAG_MISMATCH,
      SPP_ERROR_DUPLICATE_ATTRIBUTE,
      SPP_ERROR_UNKNOWN_ENCODING,
      SPP_ERROR_INCORRECT_ENCODING,
      SPP_ERROR_NOT_STANDALONE,
      SPP_ERROR_UNEXPECTED_STATE,
      SPP_ERROR_TOKENIZER_FAILED,
      SPP_ERROR_UNEXPECTED_TOKEN_CONTENT,
      SPP_ERROR_PARSE_FAILED,
      SPP_ERROR_READ_BLOCK,
      SPP_ERROR_PARSER_INIT_FAILED,
      SPP_ERROR_UNKNOWN
  };
  
  /** Memory handling structure: the four functions a parser uses to
   *  allocate, move, resize and release memory. Their signatures follow
   *  malloc, memmove, realloc and free, except that memMoveFcn takes a
   *  non-const source pointer. */
  typedef struct mm 
  {
      void *( *mallocFcn)(size_t size);
      void *( *memMoveFcn)(void *ptrto, void *ptrfrom, size_t size);
      void *( *reallocFcn)(void *ptr, size_t size);
      void ( *freeFcn)(void *ptr);
  } SppMemoryHandlingSuite;
  
  /** Struct to hold pointers to tokenized data. The pointer tables are read
   *  in pairs: an even entry points at the first character of a token and
   *  the following odd entry at its last character. */
  typedef struct data
  {
      int type;           /* NOTE(review): presumably START_TAG, END_TAG, ... - confirm */
      int ptrBuffSize;    /* table capacity, counted in pointers */
      int numOfPtrs;      /* entries in use in ptrBuff */
      int numOfPtrsUtf8;  /* entries in use in utf8PtrBuff */
      char **ptrBuff;     /* pointers into the raw input buffer */
      char **utf8PtrBuff; /* pointers into the converted (UTF-8) buffer */
  } TokDataStruct;

  /* char *tn[] = {"START_TAG", "END_TAG", "EMPTY_ELEMENT_TAG", "PCDATA"}; */
  
  /* The parser context is opaque to users of this header. */
  struct SPPContext;

  /** SPP parser structure*/
  typedef struct SPPContext SPPParser;

  /** Parser create with encoding. Returns NULL on failure. */
  SPPParser* parserCreate(const XML_Char *encodingName);

  /** Parser create with encoding and namespace separator.
   *  NOTE(review): no definition of this function is visible next to this
   *  header - confirm that one exists. */
  SPPParser* parserCreate_ns(const XML_Char *encodingName, XML_Char nsSep);

  /** Parser create with encoding, external memory handling suite and
   *  namespace separator. memsuite may be NULL to use the system functions. */
  SPPParser* parserCreate_mh(const XML_Char *encodingName, 
      const SppMemoryHandlingSuite *memsuite,
      XML_Char nsSep);

  /** Tokenize the XML declaration at the start of the input. */
  enum SPP_Error parseProlog(SPPParser* ct);

  /** Release a parser obtained from one of the create functions. */
  void* parserFree(SPPParser* ct);

  /** Parse the next element tag. Returns NULL when parsing fails. */
  TokDataStruct* next(SPPParser* ct);

  /** Get next element as integer; *parseError receives the status. */
  int getNextElementAsInt(SPPParser* ct, int* parseError);

  /** Get next attribute as integer; *parseError receives the status. */
  int getNextAttributeAsInt(SPPParser* ct, int* parseError);
  
  #endif
  
  
  
  
  1.1                  ws-axis/c/src/xml/txpp/lib/spp_converter.c
  
  Index: spp_converter.c
  ===================================================================
  /*
   *   Copyright 2003-2004 The Apache Software Foundation.
   *
   *   Licensed under the Apache License, Version 2.0 (the "License");
   *   you may not use this file except in compliance with the License.
   *   You may obtain a copy of the License at
   *
   *       http://www.apache.org/licenses/LICENSE-2.0
   *
   *   Unless required by applicable law or agreed to in writing, software
   *   distributed under the License is distributed on an "AS IS" BASIS,
   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   *   See the License for the specific language governing permissions and
   *   limitations under the License.
   */
  
  #ifdef COMPILED_FROM_DSP
  #include "winconfig.h"
  #elif defined(MACOS_CLASSIC)
  #include "macconfig.h"
  #else
  #ifdef HAVE_XPP_CONFIG_H
  #include "../xpp_config.h"
  #endif
  #endif /* ndef COMPILED_FROM_DSP */
  
  #include "internal.h"
  #include "spp_converter.h"
  #include "nametab.h"
  
  /* Extra vtable entry that exists only in XML_DTD builds. */
  #ifdef XML_DTD
  #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
  #else
  #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
  #endif

  /**
   * tokenizeProlog and tokenizeContent are defined in spp_tokenizer.c.
   * tokenizeProlog tokenizes the xml declaration,
   * tokenizeContent tokenizes the xml content.
   *
   * VTABLE1 and VTABLE are initializer fragments; PREFIX() must be defined
   * at the point of use so that each name resolves to the functions
   * generated for one particular encoding.
   */
  #define VTABLE1 \
    { PREFIX(tokenizeProlog), PREFIX(tokenizeContent)}, \
    PREFIX(nameMatchesAscii)

  #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  
  /* Look up the naming bit of the UCS-2 character with high byte `hi` and
   * low byte `lo`: pages[hi] selects a block of eight words in
   * namingBitmap, the top three bits of `lo` pick the word and the low
   * five bits pick the bit. `pages` and `hi` are now parenthesised like
   * every other macro argument in this file. */
  #define UCS2_GET_NAMING(pages, hi, lo) \
     (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))

  /* A 2 byte UTF-8 representation splits the character's 11 bits between
   * the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   * pages, 3 bits to add to that index and 5 bits to generate the mask.
   */

  #define UTF8_GET_NAMING2(pages, byte) \
      (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                        + ((((byte)[0]) & 3) << 1) \
                        + ((((byte)[1]) >> 5) & 1)] \
           & (1 << (((byte)[1]) & 0x1F)))

  /* A 3 byte UTF-8 representation splits the character's 16 bits between
   * the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   * into pages, 3 bits to add to that index and 5 bits to generate the
   * mask.
   */

  #define UTF8_GET_NAMING3(pages, byte) \
    (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                               + ((((byte)[1]) >> 2) & 0xF)] \
                           << 3) \
                        + ((((byte)[1]) & 3) << 1) \
                        + ((((byte)[2]) >> 5) & 1)] \
           & (1 << (((byte)[2]) & 0x1F)))

  /* Dispatch on the sequence length n; any length other than 2 or 3
   * yields 0. */
  #define UTF8_GET_NAMING(pages, p, n) \
    ((n) == 2 \
    ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
    : ((n) == 3 \
       ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
       : 0))
  
  /* Detection of invalid UTF-8 sequences is based on Table 3.1B
   * of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   * with the additional restriction of not allowing the Unicode
   * code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   * Implementation details:
   * (A & 0x80) == 0     means A < 0x80
   * and
   * (A & 0xC0) == 0xC0  means A > 0xBF
   *
   * Each macro takes a pointer to unsigned bytes and is non-zero when the
   * 2, 3 or 4 byte sequence starting there is invalid. The lead byte is
   * read as (*(p)) so that an expression argument such as `buf + 1` is
   * dereferenced as a whole.
   */

  #define UTF8_INVALID2(p) \
    ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

  #define UTF8_INVALID3(p) \
    (((p)[2] & 0x80) == 0 \
    || \
    ((*(p)) == 0xEF && (p)[1] == 0xBF \
      ? \
      (p)[2] > 0xBD \
      : \
      ((p)[2] & 0xC0) == 0xC0) \
    || \
    ((*(p)) == 0xE0 \
      ? \
      (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
      : \
      ((p)[1] & 0x80) == 0 \
      || \
      ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

  #define UTF8_INVALID4(p) \
    (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
    || \
    ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
    || \
    ((*(p)) == 0xF0 \
      ? \
      (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
      : \
      ((p)[1] & 0x80) == 0 \
      || \
      ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
  
  /* Predicate that rejects every character; used to fill predicate slots
   * that can never match. */
  static int PTRFASTCALL
  isNever(const ENCODING *enc, const char *p)
  {
    (void)enc;
    (void)p;
    return 0;
  }
  
  /* Name-character test for a 2 byte UTF-8 sequence starting at p. */
  static int PTRFASTCALL
  utf8_isName2(const ENCODING *enc, const char *p)
  {
    const unsigned char *bytes = (const unsigned char *)p;

    return UTF8_GET_NAMING2(namePages, bytes);
  }

  /* Name-character test for a 3 byte UTF-8 sequence starting at p. */
  static int PTRFASTCALL
  utf8_isName3(const ENCODING *enc, const char *p)
  {
    const unsigned char *bytes = (const unsigned char *)p;

    return UTF8_GET_NAMING3(namePages, bytes);
  }

  /* 4 byte sequences are never name characters. */
  #define utf8_isName4 isNever
  
  /* Name-start test for a 2 byte UTF-8 sequence starting at p. */
  static int PTRFASTCALL
  utf8_isNmstrt2(const ENCODING *enc, const char *p)
  {
    const unsigned char *bytes = (const unsigned char *)p;

    return UTF8_GET_NAMING2(nmstrtPages, bytes);
  }

  /* Name-start test for a 3 byte UTF-8 sequence starting at p. */
  static int PTRFASTCALL
  utf8_isNmstrt3(const ENCODING *enc, const char *p)
  {
    const unsigned char *bytes = (const unsigned char *)p;

    return UTF8_GET_NAMING3(nmstrtPages, bytes);
  }

  /* 4 byte sequences never start a name. */
  #define utf8_isNmstrt4 isNever
  
  /* Non-zero when the 2 byte UTF-8 sequence starting at p is invalid. */
  static int PTRFASTCALL
  utf8_isInvalid2(const ENCODING *enc, const char *p)
  {
    const unsigned char *bytes = (const unsigned char *)p;

    return UTF8_INVALID2(bytes);
  }

  /* Non-zero when the 3 byte UTF-8 sequence starting at p is invalid. */
  static int PTRFASTCALL
  utf8_isInvalid3(const ENCODING *enc, const char *p)
  {
    const unsigned char *bytes = (const unsigned char *)p;

    return UTF8_INVALID3(bytes);
  }

  /* Non-zero when the 4 byte UTF-8 sequence starting at p is invalid. */
  static int PTRFASTCALL
  utf8_isInvalid4(const ENCODING *enc, const char *p)
  {
    const unsigned char *bytes = (const unsigned char *)p;

    return UTF8_INVALID4(bytes);
  }
  
  /* An ENCODING extended with a per-byte type table and the multi-byte
   * character predicates. `enc` has to stay the first member because
   * AS_NORMAL_ENCODING() casts an ENCODING pointer straight to this type. */
  struct normal_encoding 
  {
    ENCODING enc;
    unsigned char type[256]; /* byte type, indexed by the byte value */
  #ifdef XML_MIN_SIZE
    /* In XML_MIN_SIZE builds the per-byte operations are function
     * pointers instead of macros. */
    int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
    int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
    int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
    int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
    int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
  #endif /* XML_MIN_SIZE */
    /* Name, name-start and validity predicates; the digit is the length
     * in bytes of the sequence being tested. */
    int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
    int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
    int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
    int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
    int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
    int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
    int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
    int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
    int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
  };
  
  /* View an ENCODING pointer as the struct normal_encoding containing it. */
  #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))

  /* STANDARD_VTABLE(E) lists the initializers for the XML_MIN_SIZE-only
   * members of struct normal_encoding, using the functions whose names
   * start with prefix E; in other builds it expands to nothing. */
  #ifdef XML_MIN_SIZE

  #define STANDARD_VTABLE(E) \
   E ## byteType, \
   E ## isNameMin, \
   E ## isNmstrtMin, \
   E ## byteToAscii, \
   E ## charMatches,

  #else

  #define STANDARD_VTABLE(E) /* as nothing */

  #endif

  /* NORMAL_VTABLE(E) lists the initializers for the nine multi-byte
   * predicates, in the order the members are declared. */
  #define NORMAL_VTABLE(E) \
   E ## isName2, \
   E ## isName3, \
   E ## isName4, \
   E ## isNmstrt2, \
   E ## isNmstrt3, \
   E ## isNmstrt4, \
   E ## isInvalid2, \
   E ## isInvalid3, \
   E ## isInvalid4
  
  /* Declared ahead of the tokenizer code included further down. */
  static int FASTCALL checkCharRefNumber(int);

  #include "spp_tokenizer.h"
  #include "ascii.h"

  /* Single-byte encodings: the "Min" name predicates never match. */
  #ifdef XML_MIN_SIZE
  #define sb_isNameMin isNever
  #define sb_isNmstrtMin isNever
  #endif

  /* MINBPC(enc): read from the encoding in XML_MIN_SIZE builds, fixed at 1
   * for the instantiation below otherwise. */
  #ifdef XML_MIN_SIZE
  #define MINBPC(enc) ((enc)->minBytesPerChar)
  #else
  /* minimum bytes per character */
  #define MINBPC(enc) 1
  #endif
  
  /* Looks up the byte type of the single byte at p in the encoding's
   * type[] table.
   */
  #define SB_BYTE_TYPE(enc, p) \
    (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
  
  /* BYTE_TYPE(enc, p): in XML_MIN_SIZE builds this dispatches through the
   * encoding's byteType function pointer (sb_byteType is the single-byte
   * implementation); otherwise it is the direct table lookup.
   */
  #ifdef XML_MIN_SIZE
  static int PTRFASTCALL
  sb_byteType(const ENCODING *enc, const char *p)
  {
    return SB_BYTE_TYPE(enc, p);
  }
  #define BYTE_TYPE(enc, p) \
   (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
  #else
  #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  #endif
  
  /* BYTE_TO_ASCII(enc, p): yields the value of the byte at p.  XML_MIN_SIZE
   * builds dispatch through the encoding's byteToAscii function pointer
   * (sb_byteToAscii is the single-byte implementation and ignores enc);
   * otherwise the macro dereferences p directly.
   */
  #ifdef XML_MIN_SIZE
  #define BYTE_TO_ASCII(enc, p) \
   (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
  static int PTRFASTCALL
  sb_byteToAscii(const ENCODING *enc, const char *p)
  {
    return *p;
  }
  #else
  #define BYTE_TO_ASCII(enc, p) (*(p))
  #endif
  
  /* These three macros paste n onto the member name, so n must be one of the
   * literal suffixes 2, 3 or 4; the call goes through the matching function
   * pointer in struct normal_encoding.
   */
  #define IS_NAME_CHAR(enc, p, n) \
   (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
  #define IS_NMSTRT_CHAR(enc, p, n) \
   (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
  #define IS_INVALID_CHAR(enc, p, n) \
   (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
  
  /* Name checks for characters of minimum width: dispatched through the
   * encoding in XML_MIN_SIZE builds, constant 0 otherwise for this
   * instantiation.
   */
  #ifdef XML_MIN_SIZE
  #define IS_NAME_CHAR_MINBPC(enc, p) \
   (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
  #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
   (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
  #else
  #define IS_NAME_CHAR_MINBPC(enc, p) (0)
  #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
  #endif
  
  /* CHAR_MATCHES(enc, p, c): non-zero when the character at p equals the
   * ASCII character c.  XML_MIN_SIZE builds dispatch through the encoding's
   * charMatches function pointer (sb_charMatches is the single-byte
   * implementation and ignores enc); otherwise the byte is compared in place.
   */
  #ifdef XML_MIN_SIZE
  #define CHAR_MATCHES(enc, p, c) \
   (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
  static int PTRCALL
  sb_charMatches(const ENCODING *enc, const char *p, int c)
  {
    return *p == c;
  }
  #else
  /* c is an ASCII character.  It is parenthesized so that an argument
   * containing a lower-precedence operator (for example a conditional
   * expression) is compared as a whole.
   */
  #define CHAR_MATCHES(enc, p, c) (*(p) == (c))
  #endif
  
  /* Instantiate the tokenizer template for single-byte-unit encodings:
   * spp_tokenizer.c is included with PREFIX set so that identifiers wrapped
   * in PREFIX() get a "normal_" prefix, using the MINBPC / BYTE_TYPE /
   * CHAR_MATCHES / IS_* macros defined above.
   */
  #define PREFIX(ident) normal_ ## ident
  #include "spp_tokenizer.c"
  
  /* Drop the per-instantiation macros so the file can redefine them before
   * including the template again.
   */
  #undef MINBPC
  #undef BYTE_TYPE
  #undef BYTE_TO_ASCII
  #undef CHAR_MATCHES
  #undef IS_NAME_CHAR
  #undef IS_NAME_CHAR_MINBPC
  #undef IS_NMSTRT_CHAR
  #undef IS_NMSTRT_CHAR_MINBPC
  #undef IS_INVALID_CHAR
  
  /* Marker bits OR-ed into the first byte of a UTF-8 sequence by the
   * encoders in this file (latin1_toUtf8, the UTF-16 to UTF-8 converters and
   * XmlUtf8Encode).
   */
  enum 
  {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xc0,
    UTF8_cval3 = 0xe0,
    UTF8_cval4 = 0xf0
  };
  
  /** Copies UTF-8 input to a UTF-8 output buffer without splitting a
    * multi-byte character.
    *
    * When the output buffer is smaller than the input, the input limit is
    * moved back to the end of the last character that fits completely.  The
    * previous loop only skipped trailing continuation bytes, so it could
    * leave a lone lead byte in the output (and even cut a complete character
    * that ended exactly at the limit).
    *
    * @param enc     encoding object (unused)
    * @param fromP   in/out: current read position; advanced past the bytes
    *                that were copied
    * @param fromLim end of the input
    * @param toP     in/out: current write position; advanced past the bytes
    *                that were written
    * @param toLim   end of the output buffer
    */
  static void PTRCALL
  utf8_toUtf8(const ENCODING *enc,
              const char **fromP, const char *fromLim,
              char **toP, const char *toLim)
  {
    const char *from = *fromP;
    char *to = *toP;

    if (fromLim - from > toLim - to) {
      /* Not everything fits: find the last complete character boundary at
       * or before the output capacity.
       */
      const char *cut = from + (toLim - to);
      int trailing = 0;  /* continuation bytes seen between cut and the limit */
      while (cut > from) {
        const unsigned char prev = (unsigned char)cut[-1];
        int length = 0;  /* sequence length announced by a lead byte */
        if ((prev & 0x80) == 0x00)
          break;         /* ASCII byte: a complete character ends here */
        if ((prev & 0xe0) == 0xc0)
          length = 2;
        else if ((prev & 0xf0) == 0xe0)
          length = 3;
        else if ((prev & 0xf8) == 0xf0)
          length = 4;
        if (length != 0) {
          if (trailing + 1 >= length) {
            /* The whole sequence fits: keep the lead byte and its tail. */
            cut += length - 1;
            break;
          }
          /* Incomplete sequence: drop the lead byte and keep looking. */
          trailing = 0;
        }
        else
          trailing++;
        cut--;
      }
      fromLim = cut;
    }
    while (from != fromLim)
      *to++ = *from++;
    *fromP = from;
    *toP = to;
  }
  
  /* Decodes UTF-8 input into UTF-16 code units.
   *
   * The byte type of each lead byte (from the encoding's type[] table)
   * selects how many input bytes are combined.  A 4-byte sequence produces a
   * surrogate pair and is left unconsumed when only one output unit remains.
   * On return *fromP and *toP point just past the consumed input and the
   * written output.
   */
  static void PTRCALL
  utf8_toUtf16(const ENCODING *enc,
               const char **fromP, const char *fromLim,
               unsigned short **toP, const unsigned short *toLim)
  {
    const struct normal_encoding *table = (const struct normal_encoding *)enc;
    const char *src = *fromP;
    unsigned short *dst = *toP;

    while (src != fromLim && dst != toLim)
    {
      const int kind = table->type[(unsigned char)*src];
      if (kind == BT_LEAD2)
      {
        *dst++ = (unsigned short)(((src[0] & 0x1f) << 6) | (src[1] & 0x3f));
        src += 2;
      }
      else if (kind == BT_LEAD3)
      {
        *dst++ = (unsigned short)(((src[0] & 0xf) << 12)
                                  | ((src[1] & 0x3f) << 6) | (src[2] & 0x3f));
        src += 3;
      }
      else if (kind == BT_LEAD4)
      {
        unsigned long scalar;
        /* A surrogate pair needs two output units. */
        if (dst + 1 == toLim)
          break;
        scalar = ((src[0] & 0x7) << 18) | ((src[1] & 0x3f) << 12)
                 | ((src[2] & 0x3f) << 6) | (src[3] & 0x3f);
        scalar -= 0x10000;
        dst[0] = (unsigned short)((scalar >> 10) | 0xD800);
        dst[1] = (unsigned short)((scalar & 0x3FF) | 0xDC00);
        dst += 2;
        src += 4;
      }
      else
      {
        *dst++ = *src++;
      }
    }
    *fromP = src;
    *toP = dst;
  }
  
  
  /* Encoding object for UTF-8 input.  The ENCODING part uses the UTF-8
   * converters defined above; the type[] table is assembled from asciitab.h
   * (with BT_COLON temporarily mapped to BT_NMSTRT) followed by utf8tab.h.
   * NOTE(review): the trailing "1, 1, 0" presumably initializes
   * minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   * declaration.
   */
  static const struct normal_encoding utf8_encoding = 
  {
    { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
    {
  #define BT_COLON BT_NMSTRT
  #include "asciitab.h"
  #undef BT_COLON
  #include "utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  };
  
  
  
  /* Same as utf8_encoding except that the first half of the type[] table
   * comes from iasciitab.h instead of asciitab.h.
   */
  static const struct normal_encoding internal_utf8_encoding = 
  {
    { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
    {
  #define BT_COLON BT_NMSTRT
  #include "iasciitab.h"
  #undef BT_COLON
  #include "utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  };
  
  /* Converts Latin-1 bytes to UTF-8.
   *
   * Bytes below 0x80 are copied unchanged; bytes with the high bit set are
   * written as a two-byte sequence.  Conversion stops at the end of the input
   * or as soon as the next character does not fit before toLim; *fromP and
   * *toP are advanced as characters are converted.
   */
  static void PTRCALL
  latin1_toUtf8(const ENCODING *enc,
                const char **fromP, const char *fromLim,
                char **toP, const char *toLim)
  {
    while (*fromP != fromLim)
    {
      const unsigned char byte = (unsigned char)**fromP;
      if ((byte & 0x80) == 0)
      {
        /* Plain ASCII: needs one output byte. */
        if (*toP == toLim)
          return;
        *(*toP)++ = *(*fromP)++;
        continue;
      }
      /* High half: needs two output bytes. */
      if (toLim - *toP < 2)
        return;
      *(*toP)++ = (char)((byte >> 6) | UTF8_cval2);
      *(*toP)++ = (char)((byte & 0x3f) | 0x80);
      (*fromP)++;
    }
  }
  
  /* Widens each input byte (taken as an unsigned value) to one UTF-16 code
   * unit until either the input or the output is exhausted, advancing *fromP
   * and *toP accordingly.
   */
  static void PTRCALL
  latin1_toUtf16(const ENCODING *enc,
                 const char **fromP, const char *fromLim,
                 unsigned short **toP, const unsigned short *toLim)
  {
    for (; *fromP != fromLim && *toP != toLim; ++*fromP, ++*toP)
      **toP = (unsigned char)**fromP;
  }
  
  
  /* Encoding object for Latin-1 input.  The type[] table is asciitab.h (with
   * BT_COLON mapped to BT_NMSTRT) followed by latin1tab.h.  No NORMAL_VTABLE
   * entries are given, so the isName / isNmstrt / isInvalid pointers are
   * zero-initialized.  This table is also consulted by checkCharRefNumber
   * and used as the template in XmlInitUnknownEncoding.
   */
  static const struct normal_encoding latin1_encoding = 
  {
    { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
    {
  #define BT_COLON BT_NMSTRT
  #include "asciitab.h"
  #undef BT_COLON
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(sb_)
  };
  
  /* Copies input bytes to the output one for one until either the input or
   * the output buffer is exhausted, advancing *fromP and *toP accordingly.
   */
  static void PTRCALL
  ascii_toUtf8(const ENCODING *enc,
               const char **fromP, const char *fromLim,
               char **toP, const char *toLim)
  {
    for (; *fromP != fromLim && *toP != toLim; ++*fromP, ++*toP)
      **toP = **fromP;
  }
  
  
  /* Encoding object for US-ASCII input.  Only the first half of the type[]
   * table is supplied (from asciitab.h, with BT_COLON mapped to BT_NMSTRT);
   * the remaining entries are zero-initialized, which the comment below
   * notes is BT_NONXML.  UTF-16 output reuses latin1_toUtf16.
   */
  static const struct normal_encoding ascii_encoding = 
  {
    { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
    {
  #define BT_COLON BT_NMSTRT
  #include "asciitab.h"
  #undef BT_COLON
  /* BT_NONXML == 0 */
    },
    STANDARD_VTABLE(sb_)
  };
  
  /* Classifies a 16-bit code unit given as its high and low bytes.
   *
   * Returns BT_LEAD4 for a high byte of 0xD8-0xDB, BT_TRAIL for 0xDC-0xDF,
   * BT_NONXML for the units 0xFFFE and 0xFFFF, and BT_NONASCII otherwise.
   */
  static int PTRFASTCALL
  unicode_byte_type(char hi, char lo)
  {
    const unsigned char high = (unsigned char)hi;
    const unsigned char low = (unsigned char)lo;

    if (high >= 0xD8 && high <= 0xDB)
      return BT_LEAD4;
    if (high >= 0xDC && high <= 0xDF)
      return BT_TRAIL;
    if (high == 0xFF && (low == 0xFF || low == 0xFE))
      return BT_NONXML;
    return BT_NONASCII;
  }
  
  /* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   * UTF-16 input (two bytes per unit, read through the GET_LO / GET_HI macros
   * in effect at the point of expansion) to UTF-8.
   *
   * The high byte of each unit selects the output length: 1 byte for units
   * below 0x80, 2 bytes for high bytes up to 0x7, 4 bytes for a unit whose
   * high byte is 0xD8-0xDB (combined with the following unit), and 3 bytes
   * otherwise.  When the next character does not fit before toLim the
   * function stores the current position in *fromP and returns; *toP is
   * advanced as bytes are written.
   *
   * NOTE(review): in the 0xD8-0xDB case the second unit is read after
   * "from += 2" without comparing against fromLim -- confirm that callers
   * never pass input ending in the middle of a surrogate pair.
   */
  #define DEFINE_UTF16_TO_UTF8(E) \
  static void  PTRCALL \
  E ## toUtf8(const ENCODING *enc, \
              const char **fromP, const char *fromLim, \
              char **toP, const char *toLim) \
  { \
    const char *from; \
    for (from = *fromP; from != fromLim; from += 2) { \
      int plane; \
      unsigned char lo2; \
      unsigned char lo = GET_LO(from); \
      unsigned char hi = GET_HI(from); \
      switch (hi) { \
      case 0: \
        if (lo < 0x80) { \
          if (*toP == toLim) { \
            *fromP = from; \
            return; \
          } \
          *(*toP)++ = lo; \
          break; \
        } \
        /* fall through */ \
      case 0x1: case 0x2: case 0x3: \
      case 0x4: case 0x5: case 0x6: case 0x7: \
        if (toLim -  *toP < 2) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      default: \
        if (toLim -  *toP < 3)\
        { \
          *fromP = from; \
          return; \
        } \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
        if (toLim -  *toP < 4)\
        { \
          *fromP = from; \
          return; \
        } \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
        *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
        from += 2; \
        lo2 = GET_LO(from); \
        *(*toP)++ = (((lo & 0x3) << 4) \
                     | ((GET_HI(from) & 0x3) << 2) \
                     | (lo2 >> 6) \
                     | 0x80); \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
        break; \
      } \
    } \
    *fromP = from; \
  }
  
  /* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   * UTF-16 input to an array of unsigned short, assembling each unit from
   * its bytes with the GET_HI / GET_LO macros in effect at the point of
   * expansion.  When the input is longer than the output and the last input
   * unit has a high byte in 0xD8-0xDF, that unit is excluded before copying.
   * *fromP and *toP are advanced as units are copied.
   */
  #define DEFINE_UTF16_TO_UTF16(E) \
  static void  PTRCALL \
  E ## toUtf16(const ENCODING *enc, \
               const char **fromP, const char *fromLim, \
               unsigned short **toP, const unsigned short *toLim) \
  { \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
      fromLim -= 2; \
    for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  }
  
  /* Little-endian byte access: byte 0 is the low byte, byte 1 the high byte.
   * With these in effect the two macros below generate little2_toUtf8 and
   * little2_toUtf16.
   */
  #define SET2(ptr, ch) \
    (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
  #define GET_LO(ptr) ((unsigned char)(ptr)[0])
  #define GET_HI(ptr) ((unsigned char)(ptr)[1])
  
  DEFINE_UTF16_TO_UTF8(little2_)
  DEFINE_UTF16_TO_UTF16(little2_)
  
  #undef SET2
  #undef GET_LO
  #undef GET_HI
  
  /* Big-endian byte access: byte 0 is the high byte, byte 1 the low byte.
   * With these in effect the two macros below generate big2_toUtf8 and
   * big2_toUtf16.
   */
  #define SET2(ptr, ch) \
    (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
  #define GET_LO(ptr) ((unsigned char)(ptr)[1])
  #define GET_HI(ptr) ((unsigned char)(ptr)[0])
  
  DEFINE_UTF16_TO_UTF8(big2_)
  DEFINE_UTF16_TO_UTF16(big2_)
  
  #undef SET2
  #undef GET_LO
  #undef GET_HI
  
  /* Per-character helpers for little-endian two-byte input: p[0] is the low
   * byte and p[1] the high byte of the unit at p.  Macro arguments are
   * parenthesized so that compound expressions passed for p or c are
   * evaluated as a whole.
   */
  /* Byte type: table lookup on the low byte when the high byte is zero,
   * otherwise classification by unicode_byte_type.
   */
  #define LITTLE2_BYTE_TYPE(enc, p) \
   ((p)[1] == 0 \
    ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
    : unicode_byte_type((p)[1], (p)[0]))
  /* The low byte when the high byte is zero, otherwise -1. */
  #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
  /* Non-zero when the unit at p has a zero high byte and low byte c. */
  #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
  #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
  #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
  
  /* Binding of the little-endian helpers to the tokenizer.
   *
   * XML_MIN_SIZE builds: thin functions wrapping the LITTLE2_* macros are
   * defined (they are referenced by STANDARD_VTABLE(little2_)), and VTABLE is
   * redefined to use the little2_ converters.
   *
   * Other builds: the tokenizer template spp_tokenizer.c is included again
   * with PREFIX producing "little2_" names and the per-character macros
   * mapped to the LITTLE2_* helpers.
   * NOTE(review): VTABLE is not defined in this branch; presumably the
   * included template defines it -- confirm.
   */
  #ifdef XML_MIN_SIZE
  
  static int PTRFASTCALL
  little2_byteType(const ENCODING *enc, const char *p)
  {
    return LITTLE2_BYTE_TYPE(enc, p);
  }
  
  static int PTRFASTCALL
  little2_byteToAscii(const ENCODING *enc, const char *p)
  {
    return LITTLE2_BYTE_TO_ASCII(enc, p);
  }
  
  static int PTRCALL
  little2_charMatches(const ENCODING *enc, const char *p, int c)
  {
    return LITTLE2_CHAR_MATCHES(enc, p, c);
  }
  
  static int PTRFASTCALL
  little2_isNameMin(const ENCODING *enc, const char *p)
  {
    return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
  }
  
  static int PTRFASTCALL
  little2_isNmstrtMin(const ENCODING *enc, const char *p)
  {
    return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  }
  
  #undef VTABLE
  #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
  
  #else /* not XML_MIN_SIZE */
  
  #undef PREFIX
  #define PREFIX(ident) little2_ ## ident
  #define MINBPC(enc) 2
  /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
  #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
  #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
  #define IS_NAME_CHAR(enc, p, n) 0
  #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
  #define IS_NMSTRT_CHAR(enc, p, n) (0)
  #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  
  #include "spp_tokenizer.c"
  
  /* IS_INVALID_CHAR is not defined in this branch; undefining it here is
   * harmless.
   */
  #undef MINBPC
  #undef BYTE_TYPE
  #undef BYTE_TO_ASCII
  #undef CHAR_MATCHES
  #undef IS_NAME_CHAR
  #undef IS_NAME_CHAR_MINBPC
  #undef IS_NMSTRT_CHAR
  #undef IS_NMSTRT_CHAR_MINBPC
  #undef IS_INVALID_CHAR
  
  #endif /* not XML_MIN_SIZE */
  
  /* Encoding objects for little-endian two-byte input.
   *
   * Both use asciitab.h followed by latin1tab.h for type[]; the non-_ns
   * variant maps BT_COLON to BT_NMSTRT while the tables are included, and
   * the _ns variant (XML_NS builds only) leaves BT_COLON as is.  The last
   * initializer of the ENCODING part is 1 when BYTEORDER is 1234 and 0
   * otherwise.
   * NOTE(review): "2, 0, <flag>" presumably initializes minBytesPerChar,
   * isUtf8 and isUtf16 -- confirm against the ENCODING declaration.
   */
  #ifdef XML_NS
  
  static const struct normal_encoding little2_encoding_ns = 
  {
    { VTABLE, 2, 0,
  #if BYTEORDER == 1234
      1
  #else
      0
  #endif
    },
    {
  #include "asciitab.h"
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
  };
  
  #endif
  
  static const struct normal_encoding little2_encoding = 
  {
    { VTABLE, 2, 0,
  #if BYTEORDER == 1234
      1
  #else
      0
  #endif
    },
    {
  #define BT_COLON BT_NMSTRT
  #include "asciitab.h"
  #undef BT_COLON
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
  };
  
  /* "Internal" little-endian encoding objects, compiled unless BYTEORDER is
   * 4321.  They differ from little2_encoding / little2_encoding_ns in using
   * iasciitab.h for the first half of type[] and in always passing 1 as the
   * last ENCODING initializer.
   */
  #if BYTEORDER != 4321
  
  #ifdef XML_NS
  
  static const struct normal_encoding internal_little2_encoding_ns = 
  {
    { VTABLE, 2, 0, 1 },
    {
  #include "iasciitab.h"
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
  };
  
  #endif
  
  static const struct normal_encoding internal_little2_encoding = 
  {
    { VTABLE, 2, 0, 1 },
    {
  #define BT_COLON BT_NMSTRT
  #include "iasciitab.h"
  #undef BT_COLON
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
  };
  
  #endif
  
  
  /* Per-character helpers for big-endian two-byte input: p[0] is the high
   * byte and p[1] the low byte of the unit at p.  Macro arguments are
   * parenthesized so that compound expressions passed for p or c are
   * evaluated as a whole.
   */
  /* Byte type: table lookup on the low byte when the high byte is zero,
   * otherwise classification by unicode_byte_type.
   */
  #define BIG2_BYTE_TYPE(enc, p) \
   ((p)[0] == 0 \
    ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
    : unicode_byte_type((p)[0], (p)[1]))
  /* The low byte when the high byte is zero, otherwise -1. */
  #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
  /* Non-zero when the unit at p has a zero high byte and low byte c. */
  #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
  #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
  #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
  
  /* Binding of the big-endian helpers to the tokenizer.
   *
   * XML_MIN_SIZE builds: thin functions wrapping the BIG2_* macros are
   * defined (they are referenced by STANDARD_VTABLE(big2_)), and VTABLE is
   * redefined to use the big2_ converters.
   *
   * Other builds: the tokenizer template spp_tokenizer.c is included again
   * with PREFIX producing "big2_" names and the per-character macros mapped
   * to the BIG2_* helpers.
   * NOTE(review): VTABLE is not defined in this branch; presumably the
   * included template defines it -- confirm.
   */
  #ifdef XML_MIN_SIZE
  
  static int PTRFASTCALL
  big2_byteType(const ENCODING *enc, const char *p)
  {
    return BIG2_BYTE_TYPE(enc, p);
  }
  
  static int PTRFASTCALL
  big2_byteToAscii(const ENCODING *enc, const char *p)
  {
    return BIG2_BYTE_TO_ASCII(enc, p);
  }
  
  static int PTRCALL
  big2_charMatches(const ENCODING *enc, const char *p, int c)
  {
    return BIG2_CHAR_MATCHES(enc, p, c);
  }
  
  static int PTRFASTCALL
  big2_isNameMin(const ENCODING *enc, const char *p)
  {
    return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
  }
  
  static int PTRFASTCALL
  big2_isNmstrtMin(const ENCODING *enc, const char *p)
  {
    return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  }
  
  #undef VTABLE
  #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
  
  #else /* not XML_MIN_SIZE */
  
  #undef PREFIX
  #define PREFIX(ident) big2_ ## ident
  #define MINBPC(enc) 2
  /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
  #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
  #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
  #define IS_NAME_CHAR(enc, p, n) 0
  #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
  #define IS_NMSTRT_CHAR(enc, p, n) (0)
  #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  
  #include "spp_tokenizer.c"
  
  /* IS_INVALID_CHAR is not defined in this branch; undefining it here is
   * harmless.
   */
  #undef MINBPC
  #undef BYTE_TYPE
  #undef BYTE_TO_ASCII
  #undef CHAR_MATCHES
  #undef IS_NAME_CHAR
  #undef IS_NAME_CHAR_MINBPC
  #undef IS_NMSTRT_CHAR
  #undef IS_NMSTRT_CHAR_MINBPC
  #undef IS_INVALID_CHAR
  
  #endif /* not XML_MIN_SIZE */
  
  /* Encoding objects for big-endian two-byte input.
   *
   * Both use asciitab.h followed by latin1tab.h for type[]; the non-_ns
   * variant maps BT_COLON to BT_NMSTRT while the tables are included, and
   * the _ns variant (XML_NS builds only) leaves BT_COLON as is.  The last
   * initializer of the ENCODING part is 1 when BYTEORDER is 4321 and 0
   * otherwise.
   * NOTE(review): "2, 0, <flag>" presumably initializes minBytesPerChar,
   * isUtf8 and isUtf16 -- confirm against the ENCODING declaration.
   */
  #ifdef XML_NS
  
  
  static const struct normal_encoding big2_encoding_ns = 
  {
    { VTABLE, 2, 0,
  #if BYTEORDER == 4321
    1
  #else
    0
  #endif
    },
    {
  #include "asciitab.h"
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
  };
  
  #endif
  
  static const struct normal_encoding big2_encoding = 
  {
    { VTABLE, 2, 0,
  #if BYTEORDER == 4321
    1
  #else
    0
  #endif
    },
    {
  #define BT_COLON BT_NMSTRT
  #include "asciitab.h"
  #undef BT_COLON
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
  };
  
  /* "Internal" big-endian encoding objects, compiled unless BYTEORDER is
   * 1234.  They differ from big2_encoding / big2_encoding_ns in using
   * iasciitab.h for the first half of type[] and in always passing 1 as the
   * last ENCODING initializer.
   */
  #if BYTEORDER != 1234
  
  #ifdef XML_NS
  
  static const struct normal_encoding internal_big2_encoding_ns = 
  {
    { VTABLE, 2, 0, 1 },
    {
  #include "iasciitab.h"
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
  };
  
  #endif
  
  static const struct normal_encoding internal_big2_encoding = 
  {
    { VTABLE, 2, 0, 1 },
    {
  #define BT_COLON BT_NMSTRT
  #include "iasciitab.h"
  #undef BT_COLON
  #include "latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
  };
  
  #endif
  
  #undef PREFIX
  
  /* Compares two NUL-terminated strings, treating the letters ASCII_a
   * through ASCII_z as equal to their upper-case counterparts.
   * Returns 1 when the strings match, 0 otherwise.
   */
  static int FASTCALL
  streqci(const char *s1, const char *s2)
  {
    for (;; s1++, s2++)
    {
      char a = *s1;
      char b = *s2;
      /* Fold lower-case letters to upper case before comparing. */
      if (a >= ASCII_a && a <= ASCII_z)
        a += ASCII_A - ASCII_a;
      if (b >= ASCII_a && b <= ASCII_z)
        b += ASCII_A - ASCII_a;
      if (a != b)
        return 0;
      if (a == 0)
        return 1;
    }
  }
  
  /*static void PTRCALL
  initUpdatePosition(const ENCODING *enc, const char *ptr,
                     const char *end, POSITION *pos)
  {
    normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
  }*/
  
  /* Converts the character at ptr to UTF-8 through XmlUtf8Convert using a
   * one-byte output buffer.  Returns that byte, or -1 when the conversion
   * produced no output.
   */
  static int
  toAscii(const ENCODING *enc, const char *ptr, const char *end)
  {
    char out[1];
    char *cursor = out;
    XmlUtf8Convert(enc, &ptr, end, &cursor, out + 1);
    return (cursor == out) ? -1 : out[0];
  }
  
  /* Returns 1 when c is a space (0x20), carriage return (0xD), line feed
   * (0xA) or tab (0x9), and 0 otherwise.
   */
  static int FASTCALL
  isSpace(int c)
  {
    return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
  }
  
  /* Return 1 if there's just optional white space or there's an S
   * followed by name=val.
   *
   * Scans one pseudo-attribute of the form  S name S? '=' S? quote value quote
   * in [ptr, end), stepping enc->minBytesPerChar bytes at a time and reading
   * characters through toAscii.
   *
   * On success with an attribute present: *namePtr / *nameEndPtr delimit the
   * name, *valPtr points at the first character after the opening quote, and
   * *nextTokPtr points just past the closing quote.
   * On success with nothing left but white space: *namePtr is set to NULL
   * (when ptr == end on entry, *nextTokPtr is not written).
   * On failure: returns 0 with *nextTokPtr at the offending position.
   */
  static int
  parsePseudoAttribute(const ENCODING *enc,
                       const char *ptr,
                       const char *end,
                       const char **namePtr,
                       const char **nameEndPtr,
                       const char **valPtr,
                       const char **nextTokPtr)
  {
    int c;
    char open;
    if (ptr == end) 
    {
      *namePtr = NULL;
      return 1;
    }
    /* An attribute must be preceded by at least one white space character. */
    if (!isSpace(toAscii(enc, ptr, end))) 
    {
      *nextTokPtr = ptr;
      return 0;
    }
    do 
    {
      ptr += enc->minBytesPerChar;
    } while (isSpace(toAscii(enc, ptr, end)));
    if (ptr == end) 
    {
      *namePtr = NULL;
      return 1;
    }
    *namePtr = ptr;
    /* Scan the name up to '=' or to white space followed by '='. */
    for (;;) 
    {
      c = toAscii(enc, ptr, end);
      if (c == -1) 
      {
        *nextTokPtr = ptr;
        return 0;
      }
      if (c == ASCII_EQUALS) 
      {
        *nameEndPtr = ptr;
        break;
      }
      if (isSpace(c)) 
      {
        *nameEndPtr = ptr;
        do 
        {
          ptr += enc->minBytesPerChar;
        } while (isSpace(c = toAscii(enc, ptr, end)));
        if (c != ASCII_EQUALS) 
        {
          *nextTokPtr = ptr;
          return 0;
        }
        break;
      }
      ptr += enc->minBytesPerChar;
    }
    /* An empty name ('=' immediately at the name start) is an error. */
    if (ptr == *namePtr) 
    {
      *nextTokPtr = ptr;
      return 0;
    }
    /* Step over '=' and any white space before the opening quote. */
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
    while (isSpace(c)) 
    {
      ptr += enc->minBytesPerChar;
      c = toAscii(enc, ptr, end);
    }
    if (c != ASCII_QUOT && c != ASCII_APOS) 
    {
      *nextTokPtr = ptr;
      return 0;
    }
    open = (char)c;
    ptr += enc->minBytesPerChar;
    *valPtr = ptr;
    /* The value may contain only letters, digits, '.', '-' and '_' and must
     * be closed by the same quote character that opened it.
     */
    for (;; ptr += enc->minBytesPerChar) 
    {
      c = toAscii(enc, ptr, end);
      if (c == open)
        break;
      if (!(ASCII_a <= c && c <= ASCII_z)
          && !(ASCII_A <= c && c <= ASCII_Z)
          && !(ASCII_0 <= c && c <= ASCII_9)
          && c != ASCII_PERIOD
          && c != ASCII_MINUS
          && c != ASCII_UNDERSCORE) 
      {
        *nextTokPtr = ptr;
        return 0;
      }
    }
    *nextTokPtr = ptr + enc->minBytesPerChar;
    return 1;
  }
  
  /* Keyword strings used by doParseXmlDecl, spelled with the ASCII_*
   * character constants and NUL-terminated.
   */

  /* "version" */
  static const char KW_version[] = 
  {
    ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
  };
  
  /* "encoding" */
  static const char KW_encoding[] = 
  {
    ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
  };
  
  /* "standalone" */
  static const char KW_standalone[] = 
  {
    ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
    ASCII_n, ASCII_e, '\0'
  };
  
  /* "yes" */
  static const char KW_yes[] = 
  {
    ASCII_y, ASCII_e, ASCII_s,  '\0'
  };
  
  /* "no" */
  static const char KW_no[] = 
  {
    ASCII_n, ASCII_o,  '\0'
  };
  
  /* Parses the pseudo-attributes of an XML or text declaration.
   *
   * The first 5 and the last 2 characters of [ptr, end) are skipped before
   * parsing (presumably the "<?xml" and "?>" delimiters -- confirm with the
   * caller).  The attributes are then read in order with
   * parsePseudoAttribute: "version" (may be absent only when
   * isGeneralTextEntity is non-zero), "encoding" (required when
   * isGeneralTextEntity is non-zero and version was given) and "standalone"
   * (rejected when isGeneralTextEntity is non-zero; value must be "yes" or
   * "no").
   *
   * Outputs: versionPtr, versionEndPtr, encodingName, encoding and standalone
   * are optional (checked for NULL before being written).  *encoding is
   * obtained by calling encodingFinder on the encoding value.  badPtr is
   * written without a NULL check whenever the function fails.
   *
   * Returns 1 on success, 0 on failure with *badPtr at the offending
   * position.
   */
  static int
  doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                   const char *,
                                                   const char *),
                 int isGeneralTextEntity,
                 const ENCODING *enc,
                 const char *ptr,
                 const char *end,
                 const char **badPtr,
                 const char **versionPtr,
                 const char **versionEndPtr,
                 const char **encodingName,
                 const ENCODING **encoding,
                 int *standalone)
  {
    /* enc = &utf8_encoding_ns;//this is temporarily hard coded by damitha */
    const char *val = NULL;
    const char *name = NULL;
    const char *nameEnd = NULL;
    /* printf("minBytesPerChar:%d\n", enc->minBytesPerChar); */
    ptr += 5 * enc->minBytesPerChar;
    end -= 2 * enc->minBytesPerChar;
    /* At least one pseudo-attribute is required. */
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
        || !name) 
    {
      *badPtr = ptr;
      return 0;
    }
    if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) 
    {
      /* Only a text declaration may omit the version. */
      if (!isGeneralTextEntity) 
      {
        *badPtr = name;
        return 0;
      }
    }
    else 
    {
      if (versionPtr)
        *versionPtr = val;
      if (versionEndPtr)
        *versionEndPtr = ptr;
      if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) 
      {
        *badPtr = ptr;
        return 0;
      }
      if (!name) {
        if (isGeneralTextEntity) 
        {
          /* a TextDecl must have an EncodingDecl */
          *badPtr = ptr;
          return 0;
        }
        return 1;
      }
    }
    if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) 
    {
      /* The encoding name must start with an ASCII letter. */
      int c = toAscii(enc, val, end);
      if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) 
      {
        *badPtr = val;
        return 0;
      }
      if (encodingName)
        *encodingName = val;
      /* ptr is just past the closing quote, so the value ends one character
       * earlier.
       */
      if (encoding)
        *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
      if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) 
      {
        *badPtr = ptr;
        return 0;
      }
      if (!name)
        return 1;
    }
    /* Anything left must be "standalone", which a text declaration may not
     * carry.
     */
    if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
        || isGeneralTextEntity) 
    {
      *badPtr = name;
      return 0;
    }
    if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) 
    {
      if (standalone)
        *standalone = 1;
    }
    else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) 
    {
      if (standalone)
        *standalone = 0;
    }
    else 
    {
      *badPtr = val;
      return 0;
    }
    /* Only white space may follow the standalone attribute. */
    while (isSpace(toAscii(enc, ptr, end)))
      ptr += enc->minBytesPerChar;
    if (ptr != end) 
    {
      *badPtr = ptr;
      return 0;
    }
    return 1;
  }
  
  /* Validates a character number.
   *
   * Returns -1 for values whose high byte is 0xD8-0xDF, for values below
   * 0x100 that latin1_encoding classifies as BT_NONXML, and for 0xFFFE and
   * 0xFFFF.  Any other value is returned unchanged.
   */
  static int FASTCALL
  checkCharRefNumber(int result)
  {
    const int page = result >> 8;

    if (page >= 0xD8 && page <= 0xDF)
      return -1;
    if (page == 0)
    {
      if (latin1_encoding.type[result] == BT_NONXML)
        return -1;
    }
    else if (page == 0xFF)
    {
      if (result == 0xFFFE || result == 0xFFFF)
        return -1;
    }
    return result;
  }
  
  /* Encodes the character number c as UTF-8 into buf.
   *
   * Returns the number of bytes written (1 to 4), or 0 without writing
   * anything when c is negative or not below 0x110000.  buf must have room
   * for the bytes written.
   */
  int FASTCALL
  XmlUtf8Encode(int c, char *buf)
  {
    enum 
    {
      /* minN is minimum legal resulting value for N byte sequence */
      min2 = 0x80,
      min3 = 0x800,
      min4 = 0x10000
    };
    int length;

    if (c < 0)
    {
      length = 0;
    }
    else if (c < min2)
    {
      buf[0] = (char)(c | UTF8_cval1);
      length = 1;
    }
    else if (c < min3)
    {
      buf[0] = (char)((c >> 6) | UTF8_cval2);
      buf[1] = (char)((c & 0x3f) | 0x80);
      length = 2;
    }
    else if (c < min4)
    {
      buf[0] = (char)((c >> 12) | UTF8_cval3);
      buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
      buf[2] = (char)((c & 0x3f) | 0x80);
      length = 3;
    }
    else if (c < 0x110000)
    {
      buf[0] = (char)((c >> 18) | UTF8_cval4);
      buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
      buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
      buf[3] = (char)((c & 0x3f) | 0x80);
      length = 4;
    }
    else
    {
      length = 0;
    }
    return length;
  }
  
  /* Encodes the character number charNum as UTF-16 into buf.
   *
   * Returns 1 when a single unit was written, 2 when a surrogate pair was
   * written, and 0 without writing anything when charNum is negative or not
   * below 0x110000.
   */
  int FASTCALL
  XmlUtf16Encode(int charNum, unsigned short *buf)
  {
    int offset;

    if (charNum < 0 || charNum >= 0x110000)
      return 0;
    if (charNum < 0x10000)
    {
      buf[0] = (unsigned short)charNum;
      return 1;
    }
    /* Split the value above 0xFFFF into a high and a low surrogate. */
    offset = charNum - 0x10000;
    buf[0] = (unsigned short)((offset >> 10) + 0xD800);
    buf[1] = (unsigned short)((offset & 0x3FF) + 0xDC00);
    return 2;
  }
  
  /* Encoding object for a caller-described encoding, filled in by
   * XmlInitUnknownEncoding.  The normal_encoding member comes first, so an
   * ENCODING pointer can be cast to this type (see AS_UNKNOWN_ENCODING).
   */
  struct unknown_encoding 
  {
    struct normal_encoding normal;
    /* Caller-supplied callback returning the character number for the
     * sequence at p; used for bytes whose table entry marks a lead byte.
     */
    int (*convert)(void *userData, const char *p);
    /* Opaque pointer passed back to convert. */
    void *userData;
    /* Per-byte UTF-16 value; 0 means "call convert" (see unknown_toUtf16). */
    unsigned short utf16[256];
    /* Per-byte UTF-8 output: element 0 is the byte count, followed by the
     * bytes; a count of 0 means "call convert" (see unknown_toUtf8).
     */
    char utf8[256][4];
  };
  
  /* Views an ENCODING pointer as the struct unknown_encoding that embeds it. */
  #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
  
  /* Returns the size in bytes of struct unknown_encoding, i.e. of the memory
   * block that XmlInitUnknownEncoding fills in.
   */
  int
  XmlSizeOfUnknownEncoding(void)
  {
    return (int)sizeof(struct unknown_encoding);
  }
  
  /* Name-character test for an unknown encoding: converts the sequence at p
   * with the user callback and looks the result up in namePages.  Values
   * with bits set above the low 16 are never name characters.
   */
  static int PTRFASTCALL
  unknown_isName(const ENCODING *enc, const char *p)
  {
    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    int c = uenc->convert(uenc->userData, p);
    if ((c & ~0xFFFF) == 0)
      return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
    return 0;
  }
  
  /* Name-start test for an unknown encoding: converts the sequence at p with
   * the user callback and looks the result up in nmstrtPages.  Values with
   * bits set above the low 16 are never name-start characters.
   */
  static int PTRFASTCALL
  unknown_isNmstrt(const ENCODING *enc, const char *p)
  {
    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    int c = uenc->convert(uenc->userData, p);
    if ((c & ~0xFFFF) == 0)
      return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
    return 0;
  }
  
  /* Validity test for an unknown encoding: the sequence at p is invalid when
   * the user callback yields a value with bits set above the low 16, or one
   * that checkCharRefNumber rejects.  Returns 1 for invalid, 0 otherwise.
   */
  static int PTRFASTCALL
  unknown_isInvalid(const ENCODING *enc, const char *p)
  {
    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    int c = uenc->convert(uenc->userData, p);
    if ((c & ~0xFFFF) != 0)
      return 1;
    return checkCharRefNumber(c) < 0;
  }
  
  /* Converts input in an unknown (caller-described) encoding to UTF-8.
   *
   * For each input byte the precomputed utf8[] entry is used when its length
   * byte is non-zero; otherwise the user callback converts the multi-byte
   * sequence and the result is encoded with XmlUtf8Encode, and the input is
   * advanced by the sequence length derived from the byte's type.
   * Conversion stops at the end of the input or when the next character does
   * not fit before toLim; *fromP and *toP are advanced as characters are
   * converted.
   *
   * XmlUtf8Encode returns 0 for a value it cannot encode.  The copy loop
   * therefore tests the count before writing; the previous do/while form
   * decremented a zero count to -1 and kept writing past both buf and toLim.
   * In that case the input sequence is consumed and nothing is written.
   */
  static void PTRCALL
  unknown_toUtf8(const ENCODING *enc,
                 const char **fromP, const char *fromLim,
                 char **toP, const char *toLim)
  {
    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    char buf[XML_UTF8_ENCODE_MAX];
    for (;;) 
    {
      const char *utf8;
      int n;
      if (*fromP == fromLim)
        break;
      utf8 = uenc->utf8[(unsigned char)**fromP];
      n = *utf8++;
      if (n == 0) 
      {
        /* Multi-byte sequence: ask the callback for the character number. */
        int c = uenc->convert(uenc->userData, *fromP);
        n = XmlUtf8Encode(c, buf);
        if (n > toLim - *toP)
          break;
        utf8 = buf;
        /* The lead byte's type encodes the sequence length. */
        *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
                   - (BT_LEAD2 - 2));
      }
      else 
      {
        if (n > toLim - *toP)
          break;
        (*fromP)++;
      }
      /* n may be 0 here when the callback returned an unencodable value. */
      for (; n > 0; n--)
        *(*toP)++ = *utf8++;
    }
  }
  
  /* Converts input in an unknown (caller-described) encoding to UTF-16.
   *
   * A non-zero utf16[] entry is emitted directly and consumes one input
   * byte.  A zero entry means the byte starts a multi-byte sequence: the user
   * callback supplies the value (truncated to 16 bits) and the input is
   * advanced by the sequence length derived from the byte's type.  *fromP and
   * *toP are advanced as units are produced.
   */
  static void PTRCALL
  unknown_toUtf16(const ENCODING *enc,
                  const char **fromP, const char *fromLim,
                  unsigned short **toP, const unsigned short *toLim)
  {
    const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
    for (; *fromP != fromLim && *toP != toLim; )
    {
      const unsigned char lead = (unsigned char)**fromP;
      unsigned short unit = uenc->utf16[lead];
      if (unit != 0)
      {
        (*fromP)++;
      }
      else
      {
        unit = (unsigned short)uenc->convert(uenc->userData, *fromP);
        *fromP += (AS_NORMAL_ENCODING(enc)->type[lead] - (BT_LEAD2 - 2));
      }
      *(*toP)++ = unit;
    }
  }
  
  /* Builds an encoding object for a caller-described encoding in mem.
   *
   * mem      memory of at least XmlSizeOfUnknownEncoding() bytes; it is
   *          written to even when the function later fails.
   * table    256 entries, one per byte value: a non-negative entry is the
   *          character number of a single-byte character, -1 marks a
   *          malformed byte, and -2 to -4 mark the lead byte of a multi-byte
   *          sequence (type BT_LEAD2 - (c + 2)).
   * convert  callback used for multi-byte sequences; may be NULL, in which
   *          case the isName / isNmstrt / isInvalid pointers copied from
   *          latin1_encoding are kept.
   * userData opaque pointer stored for the callback.
   *
   * Returns the ENCODING embedded in mem, or 0 when the table is rejected: an
   * entry below -4, a character number above 0xFFFF, or a mapping that
   * disagrees with the ASCII characters whose latin1 type is neither
   * BT_OTHER nor BT_NONXML.
   */
  ENCODING *
  XmlInitUnknownEncoding(void *mem,
                         int *table,
                         CONVERTER convert, 
                         void *userData)
  {
    int i;
    struct unknown_encoding *e = (struct unknown_encoding *)mem;
    /* Start from a byte-for-byte copy of latin1_encoding. */
    for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
      ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
    /* Bytes 0-127 with a significant latin1 type must map to themselves. */
    for (i = 0; i < 128; i++)
      if (latin1_encoding.type[i] != BT_OTHER
          && latin1_encoding.type[i] != BT_NONXML
          && table[i] != i)
        return 0;
    for (i = 0; i < 256; i++) 
    {
      int c = table[i];
      if (c == -1) {
        e->normal.type[i] = BT_MALFORM;
        /* This shouldn't really get used. */
        e->utf16[i] = 0xFFFF;
        e->utf8[i][0] = 1;
        e->utf8[i][1] = 0;
      }
      else if (c < 0) 
      {
        /* Lead byte of a multi-byte sequence: zero entries in utf8[] and
         * utf16[] make the converters call the user callback.
         */
        if (c < -4)
          return 0;
        e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
        e->utf8[i][0] = 0;
        e->utf16[i] = 0;
      }
      else if (c < 0x80) 
      {
        /* Maps to an ASCII character: it inherits that character's type, but
         * a significant character may only be produced by its own byte.
         */
        if (latin1_encoding.type[c] != BT_OTHER
            && latin1_encoding.type[c] != BT_NONXML
            && c != i)
          return 0;
        e->normal.type[i] = latin1_encoding.type[c];
        e->utf8[i][0] = 1;
        e->utf8[i][1] = (char)c;
        /* 0 is reserved in utf16[] to mean "call convert". */
        e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
      }
      else if (checkCharRefNumber(c) < 0) 
      {
        e->normal.type[i] = BT_NONXML;
        /* This shouldn't really get used. */
        e->utf16[i] = 0xFFFF;
        e->utf8[i][0] = 1;
        e->utf8[i][1] = 0;
      }
      else 
      {
        if (c > 0xFFFF)
          return 0;
        if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
          e->normal.type[i] = BT_NMSTRT;
        else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
          e->normal.type[i] = BT_NAME;
        else
          e->normal.type[i] = BT_OTHER;
        /* Precompute the UTF-8 form: length byte followed by the bytes. */
        e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
        e->utf16[i] = (unsigned short)c;
      }
    }
    e->userData = userData;
    e->convert = convert;
    if (convert) 
    {
      e->normal.isName2 = unknown_isName;
      e->normal.isName3 = unknown_isName;
      e->normal.isName4 = unknown_isName;
      e->normal.isNmstrt2 = unknown_isNmstrt;
      e->normal.isNmstrt3 = unknown_isNmstrt;
      e->normal.isNmstrt4 = unknown_isNmstrt;
      e->normal.isInvalid2 = unknown_isInvalid;
      e->normal.isInvalid3 = unknown_isInvalid;
      e->normal.isInvalid4 = unknown_isInvalid;
    }
    e->normal.enc.utf8Convert = unknown_toUtf8;
    e->normal.enc.utf16Convert = unknown_toUtf16;
    return &(e->normal.enc);
  }
  
  /* If this enumeration is changed, getEncodingIndex and encodings
   * must also be changed. 
   *
   * Indices of the built-in encodings.  The values from ISO_8859_1_ENC to
   * UTF_16LE_ENC are the positions of the corresponding names in the
   * encodingNames array of getEncodingIndex, which returns UNKNOWN_ENC for an
   * unrecognized name and NO_ENC for a NULL name.
   */
  enum 
  {
    UNKNOWN_ENC = -1,
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC,
    UTF_8_ENC,
    UTF_16_ENC,
    UTF_16BE_ENC,
    UTF_16LE_ENC,
    /* must match encodingNames up to here */
    NO_ENC
  };
  
  /* Names of the built-in encodings, spelled with the ASCII_* character
   * constants and NUL-terminated.  getEncodingIndex compares against them
   * with streqci.
   */

  /* "ISO-8859-1" */
  static const char KW_ISO_8859_1[] = 
  {
    ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
    ASCII_MINUS, ASCII_1, '\0'
  };
  /* "US-ASCII" */
  static const char KW_US_ASCII[] = 
  {
    ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
    '\0'
  };
  /* "UTF-8" */
  static const char KW_UTF_8[] =  
  {
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
  };
  /* "UTF-16" */
  static const char KW_UTF_16[] = 
  {
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
  };
  /* "UTF-16BE" */
  static const char KW_UTF_16BE[] = 
  {
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
    '\0'
  };
  /* "UTF-16LE" */
  static const char KW_UTF_16LE[] = 
  {
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
    '\0'
  };
  
  /* Maps an encoding name to its index in the encoding enumeration.
   *
   * Returns NO_ENC when name is NULL, the index of the first entry that
   * matches name under streqci, or UNKNOWN_ENC when nothing matches.
   */
  static int FASTCALL
  getEncodingIndex(const char *name)
  {
    /* Order must follow the encoding enumeration. */
    static const char *encodingNames[] = 
    {
      KW_ISO_8859_1,
      KW_US_ASCII,
      KW_UTF_8,
      KW_UTF_16,
      KW_UTF_16BE,
      KW_UTF_16LE,
    };
    const int count = (int)(sizeof(encodingNames)/sizeof(encodingNames[0]));
    int idx = 0;

    if (name == NULL)
      return NO_ENC;
    while (idx < count)
    {
      if (streqci(name, encodingNames[idx]))
        return idx;
      idx++;
    }
    return UNKNOWN_ENC;
  }
  
  /* For binary compatibility, we store the index of the encoding
   * specified at initialization in the isUtf16 member.
   *
   * INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   * stores it as a char.  The index argument is parenthesized so that the
   * cast applies to the whole expression passed in, not just its first
   * operand.
   */
  
  #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
  
  /* This is what detects the encoding.  encodingTable maps from
   * encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   * the external (protocol) specified encoding; state is
   * XML_CONTENT_STATE if we're parsing an external text entity, and
   * XML_PROLOG_STATE otherwise.
   */
  
  /* Initial scanner: choose the concrete encoding, publish it through
   * enc->encPtr, and hand the input to that encoding's scanner for `state`.
   *
   * parserState, data, numOfChars and end are not inspected by the code
   * that currently runs; they are forwarded unchanged to XmlTok.
   * encodingTable is indexed by the *_ENC constants.
   *
   * NOTE: the "temp code" section below returns unconditionally, so at
   * present the function always selects encodingTable[INIT_ENC_INDEX(enc)]
   * and returns whatever XmlTok returns.  Everything after that return
   * (the byte-order-mark / UTF-16 auto-detection) is unreachable.
   */
  static int
  initScan(int* parserState, TokDataStruct* data, const ENCODING **encodingTable,
           const INIT_ENCODING *enc,
           int state,
           int *numOfChars,
       char* end,
           const char **nextTokPtr)
  {
    /* NOTE(review): *nextTokPtr is a const char *, so this initialisation
     * drops the const qualifier; ptr is only read in this function. */
    char *ptr = *nextTokPtr;
    const ENCODING **encPtr;
      /* printf("in xmltok.c\n"); */
  /*
   *    start temp code
   */
        /* Short-circuit: use the externally specified encoding as-is. */
        encPtr = enc->encPtr;
        *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
        return XmlTok(parserState, data,*encPtr, state, numOfChars, end, nextTokPtr);
  /*    end temp code
   */
    /* ---- unreachable while the temp code above is in place ---- */
    if (ptr == end)
          return XML_TOK_NONE;
    encPtr = enc->encPtr;
    if (ptr + 1 == end) 
    {
      /* only a single byte available for auto-detection */
  #ifndef XML_DTD /* FIXME */
      /* a well-formed document entity must have more than one byte */
      if (state != XML_CONTENT_STATE)
        return XML_TOK_PARTIAL;
  #endif
      /* so we're parsing an external text entity... */
      /* if UTF-16 was externally specified, then we need at least 2 bytes */
      switch (INIT_ENC_INDEX(enc)) 
      {
      case UTF_16_ENC:
      case UTF_16LE_ENC:
      case UTF_16BE_ENC:
        return XML_TOK_PARTIAL;
      }
      switch ((unsigned char)*ptr) 
      {
      case 0xFE:
      case 0xFF:
      case 0xEF: /* possibly first byte of UTF-8 BOM */
        if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
            && state == XML_CONTENT_STATE)
          break;
        /* fall through */
      case 0x00:
      case 0x3C:
        return XML_TOK_PARTIAL;
      }
    }
    else 
    {
      /* Two or more bytes: dispatch on the first two as a 16-bit value. */
      switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) 
      {
      case 0xFEFF:
        /* FE FF: treated as a big-endian UTF-16 byte order mark */
        if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
            && state == XML_CONTENT_STATE)
          break;
        *nextTokPtr = ptr + 2;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XML_TOK_BOM;
      /* 00 3C is handled in the default case */
      case 0x3C00:
        /* 3C 00: '<' followed by a zero byte, taken as UTF-16LE */
        if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
             || INIT_ENC_INDEX(enc) == UTF_16_ENC)
            && state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(parserState, data,*encPtr, state, numOfChars, end,
            nextTokPtr);
      case 0xFFFE:
        /* FF FE: treated as a little-endian UTF-16 byte order mark */
        if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
            && state == XML_CONTENT_STATE)
          break;
        *nextTokPtr = ptr + 2;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XML_TOK_BOM;
      case 0xEFBB:
        /* Maybe a UTF-8 BOM (EF BB BF) */
        /* If there's an explicitly specified (external) encoding
         * of ISO-8859-1 or some flavour of UTF-16
         * and this is an external text entity,
         * don't look for the BOM,
         * because it might be a legal data.
         */
        if (state == XML_CONTENT_STATE) 
        {
          int e = INIT_ENC_INDEX(enc);
          if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
              || e == UTF_16LE_ENC || e == UTF_16_ENC)
            break;
        }
        /* the third BOM byte has not arrived yet */
        if (ptr + 2 == end)
          return XML_TOK_PARTIAL;
        if ((unsigned char)ptr[2] == 0xBF) 
        {
          *nextTokPtr = ptr + 3;
          *encPtr = encodingTable[UTF_8_ENC];
          return XML_TOK_BOM;
        }
        break;
      default:
        if (ptr[0] == '\0') 
        {
          /* 0 isn't a legal data character. Furthermore a document
           * entity can only start with ASCII characters.  So the only
           * way this can fail to be big-endian UTF-16 if it it's an
           * external parsed general entity that's labelled as
           * UTF-16LE.
           */
          if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
            break;
          *encPtr = encodingTable[UTF_16BE_ENC];
          return XmlTok(parserState, data, *encPtr, state, numOfChars,
              end, nextTokPtr);
        }
        else if (ptr[1] == '\0') 
        {
          /* We could recover here in the case:
           *  - parsing an external entity
           *  - second byte is 0
           *  - no externally specified encoding
           *  - no encoding declaration
           * by assuming UTF-16LE.  But we don't, because this would mean when
           * presented just with a single byte, we couldn't reliably determine
           * whether we needed further bytes.
           */
          if (state == XML_CONTENT_STATE)
            break;
          *encPtr = encodingTable[UTF_16LE_ENC];
          return XmlTok(parserState, data,*encPtr, state, numOfChars,
              end, nextTokPtr);
        }
        break;
      }
    }
    /* Nothing detected: fall back to the externally specified encoding. */
    *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
    return XmlTok(parserState, data,*encPtr, state, numOfChars, end, nextTokPtr);
  }
  
  
  #define NS(x) x
  #define ns(x) x
  #undef NS
  #undef ns
  
  #ifdef XML_NS
  
  #define NS(x) x ## NS
  #define ns(x) x ## _ns
  
  
  #undef NS
  #undef ns
  
  /* Namespace-aware variant of XmlInitUnknownEncoding: builds the encoding
   * with XmlInitUnknownEncoding and, when that succeeds, marks the colon
   * as BT_COLON in the resulting byte-type table.  Returns whatever
   * XmlInitUnknownEncoding returned.
   */
  ENCODING *
  XmlInitUnknownEncodingNS(void *mem,
                           int *table,
                           CONVERTER convert,
                           void *userData)
  {
    ENCODING *result = XmlInitUnknownEncoding(mem, table, convert, userData);

    if (!result)
      return result;

    ((struct normal_encoding *)result)->type[ASCII_COLON] = BT_COLON;
    return result;
  }
  
  #endif /* XML_NS */
  
  
  
  
  
  /* Returns the address of the enc member of internal_utf8_encoding. */
  const ENCODING *
  XmlGetUtf8InternalEncoding(void)
  {
    const ENCODING *utf8 = &internal_utf8_encoding.enc;

    return utf8;
  }
  
  /* Returns the internal UTF-16 encoding matching the host byte order:
   * the little-endian table when BYTEORDER is 1234, the big-endian table
   * when it is 4321, and otherwise whichever one a run-time probe of a
   * short selects.
   */
  const ENCODING *
  XmlGetUtf16InternalEncoding(void)
  {
  #if BYTEORDER == 1234
    return &internal_little2_encoding.enc;
  #elif BYTEORDER == 4321
    return &internal_big2_encoding.enc;
  #else
    /* The first byte of a short holding 1 is non-zero on little-endian. */
    const short probe = 1;

    if (*(const char *)&probe)
      return &internal_little2_encoding.enc;
    return &internal_big2_encoding.enc;
  #endif
  }
  
  /* Encoding table, subscripted by the *_ENC constants. */
  static const ENCODING *encodings[] =
  {
    &latin1_encoding.enc,   /* ISO_8859_1_ENC */
    &ascii_encoding.enc,    /* US_ASCII_ENC */
    &utf8_encoding.enc,     /* UTF_8_ENC */
    &big2_encoding.enc,     /* UTF_16_ENC */
    &big2_encoding.enc,     /* UTF_16BE_ENC */
    &little2_encoding.enc,  /* UTF_16LE_ENC */
    &utf8_encoding.enc      /* NO_ENC */
  };
  
  /* SCANNER entry for the prolog state: runs initScan with
   * XML_PROLOG_STATE and the file-level encodings table.  enc is
   * reinterpreted as the INIT_ENCODING it is embedded in.
   */
  static int PTRCALL
  initScanProlog(int* parserState, TokDataStruct* data,const ENCODING *enc,
                 int* numOfChars, char *end,
                 const char **nextTokPtr)
  {
    const INIT_ENCODING *initEnc = (const INIT_ENCODING *)enc;

    return initScan(parserState, data, encodings, initEnc,
                    XML_PROLOG_STATE, numOfChars, end, nextTokPtr);
  }
  
  /* SCANNER entry for the content state: runs initScan with
   * XML_CONTENT_STATE and the file-level encodings table.  enc is
   * reinterpreted as the INIT_ENCODING it is embedded in.
   */
  static int PTRCALL
  initScanContent(int* parserState, TokDataStruct* data,const ENCODING *enc,
                  int* numOfChars, char *end,
                  const char **nextTokPtr)
  {
    const INIT_ENCODING *initEnc = (const INIT_ENCODING *)enc;

    return initScan(parserState, data, encodings, initEnc,
                    XML_CONTENT_STATE, numOfChars, end, nextTokPtr);
  }
  
  /* Prepare the initial encoding *p for the encoding called name and
   * publish an encoding through *encPtr.
   *
   * Returns 0, leaving *p and *encPtr untouched, when getEncodingIndex
   * does not recognise name; returns 1 otherwise.
   */
  int
  XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr,
                  const char *name)
  {
    const int encIndex = getEncodingIndex(name);

    if (encIndex != UNKNOWN_ENC)
    {
      /* Remember the requested encoding and install the initial scanners. */
      SET_INIT_ENC_INDEX(p, encIndex);
      p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
      p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
      p->encPtr = encPtr;
      *encPtr = &(p->initEnc);
      /* The store above is superseded at once: the caller receives the
       * concrete entry of the encodings table (one of the
       * struct normal_encoding tables such as utf8_encoding) rather than
       * the initial encoding.
       */
      *encPtr = encodings[encIndex];
      return 1;
    }

    return 0;
  }
  
  /* Look up the encoding named by the bytes [ptr, end), which are in
   * encoding enc.
   *
   * The name is first converted with XmlUtf8Convert into a local buffer of
   * ENCODING_MAX bytes (one byte is kept for the terminator).  Returns 0
   * when the conversion did not consume the whole name or when
   * getEncodingIndex does not recognise it; returns enc itself for "UTF-16"
   * when enc uses two bytes per character; otherwise returns the matching
   * entry of the encodings table.
   */
  static const ENCODING *
  findEncoding(const ENCODING *enc, const char *ptr, const char *end)
  {
  #define ENCODING_MAX 128
    char name[ENCODING_MAX];
    char *out = name;
    int encIndex;

    XmlUtf8Convert(enc, &ptr, end, &out, out + ENCODING_MAX - 1);
    if (ptr != end)
      return 0;
    *out = 0;

    if (streqci(name, KW_UTF_16))
    {
      if (enc->minBytesPerChar == 2)
        return enc;
    }

    encIndex = getEncodingIndex(name);
    return (encIndex == UNKNOWN_ENC) ? 0 : encodings[encIndex];
  }
  
  /* Public entry point for parsing an XML/text declaration.  Forwards
   * every argument to doParseXmlDecl, supplying findEncoding as the
   * encoding-lookup callback, and returns its result.
   */
  int
  XmlParseXmlDecl(int isGeneralTextEntity,
                  const ENCODING *enc,
                  const char *ptr,
                  const char *end,
                  const char **badPtr,
                  const char **versionPtr,
                  const char **versionEndPtr,
                  const char **encodingName,
                  const ENCODING **encoding,
                  int *standalone)
  {
    return doParseXmlDecl(findEncoding, isGeneralTextEntity, enc,
                          ptr, end, badPtr,
                          versionPtr, versionEndPtr,
                          encodingName, encoding, standalone);
  }
  
  /* Function wrapper around the XmlUtf8Convert macro (spp_converter.h),
   * which dispatches to enc->utf8Convert.
   *
   * enc        - encoding whose converter is invoked
   * fromPtr    - in/out cursor into the source bytes
   * rawNameEnd - end of the source bytes
   * toPtr      - in/out cursor into the destination buffer
   * bufEnd     - end of the destination buffer
   *
   * The underlying converter returns void, so there is no status to
   * propagate.  The function previously fell off its end without a return
   * statement; it now always returns 0 so that callers reading the result
   * get a defined value.
   */
  int SppUtf8Convert(const ENCODING *enc, const char **fromPtr,
                     const char *rawNameEnd,
                     const char **toPtr, const char *bufEnd)
  {
      /* printf("fromPtr:%s\n", *fromPtr); */
      /* printf("rawNameEnd:%s\n", rawNameEnd); */
      /* utf8Convert takes a char ** destination cursor; the cast makes the
       * qualifier change explicit instead of relying on an implicit
       * incompatible-pointer conversion. */
      XmlUtf8Convert(enc, fromPtr, rawNameEnd, (char **)toPtr, bufEnd);
      return 0;
  }
  
  
  
  
  1.1                  ws-axis/c/src/xml/txpp/lib/spp_converter.h
  
  Index: spp_converter.h
  ===================================================================
  /*
   *   Copyright 2003-2004 The Apache Software Foundation.
   *
   *   Licensed under the Apache License, Version 2.0 (the "License");
   *   you may not use this file except in compliance with the License.
   *   You may obtain a copy of the License at
   *
   *       http://www.apache.org/licenses/LICENSE-2.0
   *
   *   Unless required by applicable law or agreed to in writing, software
   *   distributed under the License is distributed on an "AS IS" BASIS,
   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   *   See the License for the specific language governing permissions and
   *   limitations under the License.
   */
  
  #include "spp.h"
  #ifndef XmlTok_OF_AXIS_INCLUDED
  #define XmlTok_OF_AXIS_INCLUDED 1
  
  #ifdef __cplusplus
  extern "C" {
  #endif
  
  /* Negative token values are parenthesised so that each macro expands to
   * a single primary expression wherever it is used.
   */

  /* The following token may be returned by XmlContentTok */
  #define XML_TOK_TRAILING_RSQB (-5) /* ] or ]] at the end of the scan; might be
                                      *  start of illegal ]]> sequence
                                      */
  /* The following tokens may be returned by both XmlPrologTok and
   * XmlContentTok.
   */
  #define XML_TOK_NONE (-4)          /* The string to be scanned is empty */
  #define XML_TOK_TRAILING_CR (-3)   /* A CR at the end of the scan;
                                        might be part of CRLF sequence */
  #define XML_TOK_PARTIAL_CHAR (-2)  /* only part of a multibyte sequence */
  #define XML_TOK_PARTIAL (-1)       /* only part of a token */
  #define XML_TOK_INVALID 0
  
  /* The following tokens are returned by XmlContentTok; some are also
   * returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok.
   */
  #define XML_TOK_START_TAG_WITH_ATTS 1
  #define XML_TOK_START_TAG_NO_ATTS 2
  #define XML_TOK_EMPTY_ELEMENT_WITH_ATTS 3 /* empty element tag <e/> */
  #define XML_TOK_EMPTY_ELEMENT_NO_ATTS 4
  #define XML_TOK_END_TAG 5
  #define XML_TOK_DATA_CHARS 6
  #define XML_TOK_DATA_NEWLINE 7
  #define XML_TOK_CDATA_SECT_OPEN 8
  #define XML_TOK_ENTITY_REF 9
  #define XML_TOK_CHAR_REF 10               /* numeric character reference */
  
  /* The following tokens may be returned by both XmlPrologTok and
   * XmlContentTok.
   */
  #define XML_TOK_PI 11                     /* processing instruction */
  #define XML_TOK_XML_DECL 12               /* XML decl or text decl */
  #define XML_TOK_COMMENT 13
  #define XML_TOK_BOM 14                    /* Byte order mark */
  
  /* The following tokens are returned only by XmlPrologTok */
  #define XML_TOK_PROLOG_S 15
  #define XML_TOK_DECL_OPEN 16              /* <!foo */
  #define XML_TOK_DECL_CLOSE 17             /* > */
  #define XML_TOK_NAME 18
  #define XML_TOK_NMTOKEN 19
  #define XML_TOK_POUND_NAME 20             /* #name */
  #define XML_TOK_OR 21                     /* | */
  #define XML_TOK_PERCENT 22
  #define XML_TOK_OPEN_PAREN 23
  #define XML_TOK_CLOSE_PAREN 24
  #define XML_TOK_OPEN_BRACKET 25
  #define XML_TOK_CLOSE_BRACKET 26
  #define XML_TOK_LITERAL 27
  #define XML_TOK_PARAM_ENTITY_REF 28
  #define XML_TOK_INSTANCE_START 29
  
  /* The following occur only in element type declarations */
  #define XML_TOK_NAME_QUESTION 30          /* name? */
  #define XML_TOK_NAME_ASTERISK 31          /* name* */
  #define XML_TOK_NAME_PLUS 32              /* name+ */
  #define XML_TOK_COND_SECT_OPEN 33         /* <![ */
  #define XML_TOK_COND_SECT_CLOSE 34        /* ]]> */
  #define XML_TOK_CLOSE_PAREN_QUESTION 35   /* )? */
  #define XML_TOK_CLOSE_PAREN_ASTERISK 36   /* )* */
  #define XML_TOK_CLOSE_PAREN_PLUS 37       /* )+ */
  #define XML_TOK_COMMA 38
  
  /* The following token is returned only by XmlAttributeValueTok */
  #define XML_TOK_ATTRIBUTE_VALUE_S 39
  
  /* The following token is returned only by XmlCdataSectionTok */
  #define XML_TOK_CDATA_SECT_CLOSE 40
  
  /* With namespace processing this is returned by XmlPrologTok for a
   * name with a colon.
   */
  
  #define XML_TOK_PREFIXED_NAME 41
  
  
  /* Number of scanner states; sizes the scanners[] array of struct encoding. */
  #define XML_N_STATES 3
  
  /* Scanner states.  The XmlTok macro uses them to subscript scanners[];
   * XmlPrologTok and XmlContentTok pass the first two.
   * NOTE(review): no macro in this header passes XML_CDATA_SECTION_STATE.
   */
  #define XML_PROLOG_STATE 0
  #define XML_CONTENT_STATE 1
  #define XML_CDATA_SECTION_STATE 2
  
  /* Literal kinds.
   * NOTE(review): not referenced elsewhere in this header; presumably kept
   * from the original tokenizer interface - confirm before relying on them.
   */
  #define XML_N_LITERAL_TYPES 2
  #define XML_ATTRIBUTE_VALUE_LITERAL 0
  #define XML_ENTITY_VALUE_LITERAL 1
  
  /* The size of the buffer passed to XmlUtf8Encode must be at least this. */
  #define XML_UTF8_ENCODE_MAX 4
  /* The size of the buffer passed to XmlUtf16Encode must be at least this. */
  #define XML_UTF16_ENCODE_MAX 2
  
  /* A line/column pair. */
  struct position
  {
    /* first line and first column are 0 not 1 */
    unsigned long lineNumber;
    unsigned long columnNumber;
  };
  typedef struct position POSITION;
  
  /* Record describing one attribute.
   * NOTE(review): nothing in this header reads or writes these fields; the
   * per-field notes below are inferred from the names - confirm against
   * the code that fills the record in.
   */
  typedef struct 
  {
    const char *name;      /* presumably start of the attribute name */
    const char *valuePtr;  /* presumably start of the attribute value */
    const char *valueEnd;  /* presumably end of the attribute value */
    char normalized;       /* flag; exact meaning not visible here */
  } ATTRIBUTE;
  
  struct encoding;
  typedef struct encoding ENCODING;
  
  /* typedef int (PTRCALL *SCANNER)(int *, TokDataStruct *, const ENCODING *,
   *                              const char *,
   *                              const char *,
   *                              const char **);
   */
  /* Scanner callback.  The XmlTok macro calls it as
   * scanner(parserState, data, enc, numOfChars, end, ptr); the return
   * value is an int.
   */
  typedef int (PTRCALL *SCANNER)(int *, TokDataStruct *, const ENCODING *, int *, char*,
      const char **);
  
  /**
   * This struct is the type which represents an encoding: a table of
   * scanner and converter callbacks plus a few properties. Instances are
   * assigned their values in xmltok.c.
   * @see for example static const struct normal_encoding
   *  internal_utf8_encoding_ns in xmltok.c
   */
  struct encoding 
  {
    /* One scanner per state, subscripted by XML_PROLOG_STATE etc. */
    SCANNER scanners[XML_N_STATES];
    /* Invoked through the XmlNameMatchesAscii macro. */
    int (PTRCALL *nameMatchesAscii)(const ENCODING *,
                                    const char *,
                                    const char *,
                                    const char *);
    /* Converts to utf8 which is the parse output encoding which
     * is decided in
     * xpp_context_t* ct = (xpp_context_t*) parser_create("UTF-8");
     * Invoked through the XmlUtf8Convert macro.
     */
    void (PTRCALL *utf8Convert)(const ENCODING *enc,
                                const char **fromP,
                                const char *fromLim,
                                char **toP,
                                const char *toLim);
    /* Invoked through the XmlUtf16Convert macro; the destination is an
     * array of unsigned short. */
    void (PTRCALL *utf16Convert)(const ENCODING *enc,
                                 const char **fromP,
                                 const char *fromLim,
                                 unsigned short **toP,
                                 const unsigned short *toLim);
    int minBytesPerChar;
    char isUtf8;
    /* For an INIT_ENCODING's embedded ENCODING this member is reused by
     * spp_converter.c (INIT_ENC_INDEX / SET_INIT_ENC_INDEX) to hold the
     * index of the requested encoding. */
    char isUtf16;
  };
  
  
  /*  Scan the string starting at ptr until the end of the next complete
   *  token, but do not scan past eptr.  Return an integer giving the
   *  type of token.
   *
   *  Return XML_TOK_NONE when ptr == eptr; nextTokPtr will not be set.
   *
   *   Return XML_TOK_PARTIAL when the string does not contain a complete
   *   token; nextTokPtr will not be set.
   *
   *  Return XML_TOK_INVALID when the string does not start a valid
   *  token; nextTokPtr will be set to point to the character which made
   *  the token invalid.
   *
   *  Otherwise the string starts with a valid token; nextTokPtr will be
   *  set to point to the character following the end of that token.
   *
   *  Each data character counts as a single token, but adjacent data
   *  characters may be returned together.  Similarly for characters in
   *  the prolog outside literals, comments and processing instructions.
   *
   *  NOTE(review): the description above names parameters (eptr,
   *  nextTokPtr) that the macros below do not have; they take
   *  (parserState, data, enc, [state,] numOfChars, end, ptr).  Verify the
   *  described return contract against the scanner implementations.
   */
  
  
  /* Call the scanner that enc registers for `state`.  enc is evaluated
   * twice: once to find the scanner and once as its argument. */
  #define XmlTok(parserState, data, enc, state, numOfChars, end, ptr) \
    (((enc)->scanners[state])(parserState, data, enc, numOfChars, end, ptr))
  
  /* XmlTok with state fixed to XML_PROLOG_STATE. */
  #define XmlPrologTok(parserState, data, enc, numOfChars, end, ptr) \
     XmlTok(parserState, data, enc, XML_PROLOG_STATE, numOfChars, end, ptr)
  
  /* XmlTok with state fixed to XML_CONTENT_STATE. */
  #define XmlContentTok(parserState, data, enc, numOfChars, end, ptr) \
     XmlTok(parserState, data,enc, XML_CONTENT_STATE, numOfChars, end, ptr)
  
  /* Call enc's nameMatchesAscii callback. */
  #define XmlNameMatchesAscii(enc, ptr1, end1, ptr2) \
    (((enc)->nameMatchesAscii)(enc, ptr1, end1, ptr2))
  
  /* Call enc's utf8Convert callback (returns void). */
  #define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim) \
    (((enc)->utf8Convert)(enc, fromP, fromLim, toP, toLim))
  
  /* Call enc's utf16Convert callback (returns void). */
  #define XmlUtf16Convert(enc, fromP, fromLim, toP, toLim) \
    (((enc)->utf16Convert)(enc, fromP, fromLim, toP, toLim))
  
  /* Initial encoding, set up by XmlInitEncoding.  Because initEnc is the
   * first member, the scanners installed in it can cast the ENCODING
   * pointer they receive back to an INIT_ENCODING pointer.
   */
  typedef struct 
  {
    ENCODING initEnc;         /* embedded encoding holding the initial scanners */
    const ENCODING **encPtr;  /* where the selected encoding is written */
  } INIT_ENCODING;
  
  /* Parse an XML/text declaration found in [ptr, end), which is in
   * encoding enc.  Implemented in spp_converter.c by forwarding to
   * doParseXmlDecl; results are reported through the pointer arguments.
   */
  int XmlParseXmlDecl(int isGeneralTextEntity,
                      const ENCODING *enc,
                      const char *ptr,
                      const char *end,
                      const char **badPtr,
                      const char **versionPtr,
                      const char **versionEndPtr,
                      const char **encodingNamePtr,
                      const ENCODING **namedEncodingPtr,
                      int *standalonePtr);
  
  /* Function wrapper around the XmlUtf8Convert macro.  Parameter names
   * match the definition in spp_converter.c. */
  int SppUtf8Convert(const ENCODING *enc, const char **fromPtr,
      const char *rawNameEnd, const char **toPtr, const char *bufEnd);
  /* Initialise an INIT_ENCODING for the encoding called name and publish
   * an encoding through the second argument.  Returns 0 when the name is
   * not recognised, 1 otherwise. */
  int XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name);
  const ENCODING *XmlGetUtf8InternalEncoding(void);
  const ENCODING *XmlGetUtf16InternalEncoding(void);
  int FASTCALL XmlUtf8Encode(int charNumber, char *buf);
  int FASTCALL XmlUtf16Encode(int charNumber, unsigned short *buf);
  int XmlSizeOfUnknownEncoding(void);
  
  /* User-supplied conversion callback for unknown encodings. */
  typedef int (*CONVERTER)(void *userData, const char *p);
  
  ENCODING *
  XmlInitUnknownEncoding(void *mem,
                         int *table,
                         CONVERTER convert,
                         void *userData);
  
  
  /* Namespace-aware entry points.
   * NOTE(review): spp_converter.c defines XmlInitUnknownEncodingNS only
   * when XML_NS is defined; confirm that the two getters below are
   * defined anywhere before calling them.
   */
  const ENCODING *XmlGetUtf8InternalEncodingNS(void);
  const ENCODING *XmlGetUtf16InternalEncodingNS(void);
  ENCODING *
  XmlInitUnknownEncodingNS(void *mem,
                           int *table,
                           CONVERTER convert,
                           void *userData);
  #ifdef __cplusplus
  }
  #endif
  
  #endif /* not XmlTok_OF_AXIS_INCLUDED */
  
  
  
  
  
  1.1                  ws-axis/c/src/xml/txpp/lib/spp_tokenizer.c
  
  Index: spp_tokenizer.c
  ===================================================================
  /*
   *   Copyright 2003-2004 The Apache Software Foundation.
   *
   *   Licensed under the Apache License, Version 2.0 (the "License");
   *   you may not use this file except in compliance with the License.
   *   You may obtain a copy of the License at
   *
   *       http://www.apache.org/licenses/LICENSE-2.0
   *
   *   Unless required by applicable law or agreed to in writing, software
   *   distributed under the License is distributed on an "AS IS" BASIS,
   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   *   See the License for the specific language governing permissions and
   *   limitations under the License.
   */
  
  #include "spp.h"
  #ifndef PREFIX
  #define PREFIX(ident) ident
  #endif
  
  /* Append ptr to data->ptrBuff, growing the buffer first when it is full
   * or has not been allocated yet.
   *
   * Growth doubles data->ptrBuffSize; when the recorded capacity is zero a
   * small default is used instead, so the store at the end always has
   * room.  Existing entries are copied into the new buffer and the old
   * buffer is freed.
   *
   * Returns SPP_ERROR_NO_MEMORY when the allocation fails (data is left
   * unchanged), SPP_ERROR_NONE otherwise.
   */
  static int PTRCALL PREFIX(add_ptr)(char *ptr, TokDataStruct *data)
  {
      if (data->numOfPtrs == data->ptrBuffSize
          || !data->ptrBuff) 
      {
          /* Doubling a zero capacity would stay at zero. */
          int sz = data->ptrBuffSize > 0 ? data->ptrBuffSize << 1 : 16;
          /* Size by the element type rather than assuming 4-byte
           * pointers, which under-allocates where pointers are wider. */
          char **ptrBuff = (char **)malloc(sz * sizeof *ptrBuff);
          if (!ptrBuff)
              return SPP_ERROR_NO_MEMORY;
          if (data->ptrBuff) 
          {
              memmove(ptrBuff, data->ptrBuff,
                      data->numOfPtrs * sizeof *ptrBuff);
              free(data->ptrBuff);
          }
          data->ptrBuff = ptrBuff;
          data->ptrBuffSize = sz;
          /* printf("sz:%d\n", sz); */
      }
  
      data->ptrBuff[data->numOfPtrs++] = ptr;
      
      return SPP_ERROR_NONE;
  }
  
  /* Returns -1 (true) when byteType is BT_NMSTRT or BT_HEX, 0 otherwise. */
  static int PTRCALL
  PREFIX(is_letter)(int byteType)
  {
      return (byteType == BT_NMSTRT || byteType == BT_HEX) ? -1 : 0;
  }
  
  /* Returns -1 (true) when byteType may start a name, i.e. it is a letter
   * according to is_letter or is BT_COLON; 0 otherwise.
   */
  static int PTRCALL
  PREFIX(is_name_start_char)(int byteType)
  {
      return (PREFIX(is_letter)(byteType) || byteType == BT_COLON) ? -1 : 0;
  }
  
  /* Returns -1 (true) when byteType may continue a name: a letter
   * according to is_letter, or one of BT_DIGIT, BT_COLON, BT_MINUS and
   * BT_NAME; 0 otherwise.
   */
  static int PTRCALL
  PREFIX(is_name_char)(int byteType)
  {
      if (PREFIX(is_letter)(byteType))
          return -1;
      if (byteType == BT_DIGIT || byteType == BT_COLON)
          return -1;
      if (byteType == BT_MINUS || byteType == BT_NAME)
          return -1;
      return 0;
  }
  
  /* End-of-token bookkeeping: consume the current character, discard the
   * pointers collected in data, and put the state machine back in S_0.
   *
   * data        - token data; numOfPtrs is reset to 0
   * parserState - set to S_0
   * ptr         - input cursor, advanced by MINBPC(enc)
   * numOfChars  - remaining input count, reduced by MINBPC(enc)
   *
   * Always returns SPP_ERROR_NONE.
   *
   * NOTE(review): enc is not a parameter of this function, so this only
   * compiles if the MINBPC macro discards its argument - confirm.
   */
  static int PTRCALL
  PREFIX(doHomeWork)(TokDataStruct* data, int* parserState, char** ptr, int* numOfChars)
  {
      /* Decrement the count itself; the previous code moved the local
       * pointer and left the caller's count unchanged. */
      *numOfChars -= MINBPC(enc);
      *ptr += MINBPC(enc);    
      data->numOfPtrs = 0;
      *parserState = S_0;
      return SPP_ERROR_NONE;
  }
  
  /* Prolog scanner: a state machine that examines the input one
   * MINBPC(enc)-sized unit at a time.
   *
   * parserState - current state (S_0 .. S_14); read and updated in place
   * data        - token data; data->type is set to PCDATA, END_TAG,
   *               START_TAG or EMPTY_ELEMENT_TAG along the way
   * enc         - encoding used to classify each unit via BYTE_TYPE
   * numOfChars  - remaining input; reduced as units are consumed
   * end         - not used by this function
   * ptr         - input cursor; advanced as units are consumed
   *
   * Returns SPP_ERROR_NONE when a BT_GT is seen in state S_3 or S_9,
   * -(*parserState) when the machine is in S_14 or in a state with no
   * case below, and SPP_ERROR_TOKENIZER_FAILED when the input runs out
   * first.
   */
  static int PTRCALL
  PREFIX(tokenizeProlog)(int* parserState, TokDataStruct* data, const ENCODING *enc,
                         int *numOfChars, char* end, const char **ptr)
  { 
      /* printf("*numOfChars:%d\n", *numOfChars); */
      int byteType;
      while(*numOfChars >= MINBPC(enc))
      {
          byteType = BYTE_TYPE(enc, *ptr);
          /* printf("*ptr:%s\n", *ptr); */
          /* printf("byteType:%d\n", byteType); */
          /* printf("*parserState:%d\n", *parserState); */
          switch (*parserState) 
          {
              /* S_0: start; '<' moves to S_13, anything else is PCDATA */
              case S_0:
                  if(BT_LT == byteType) 
                  {
                      *parserState = S_13;
                  }
                  else
                      data->type = PCDATA;
                  
                  break;
          
              /* S_1: reached from S_13 after "<?" */
              case S_1:
                  if(BT_QUEST == byteType)
                  {
                      data->type = END_TAG;
                      *parserState = S_10;
                  }
                  else if (PREFIX(is_name_start_char)(byteType)) 
                  {
                      data->type = START_TAG;
                      *parserState = S_2;
                  }
                  else
                      *parserState = S_14;
                  
                  break;
          
              /* S_2: inside the name that follows "<?" */
              case S_2:
                  if (BT_GT == byteType) 
                  {
                      
                      *parserState = S_0;
                  }
                  else if (BT_QUEST == byteType) 
                  {
                      *parserState = S_9;
                  }
                  else if (BT_S == byteType) 
                  {
                      *parserState = S_3;
                  }    
                  else if (!PREFIX(is_name_char)(byteType))
                      *parserState = S_14;
                  break;
          
              /* S_3: whitespace after the name or after an attribute value */
              case S_3:
                  if (BT_GT == byteType) 
                  {
                          /* NOTE(review): ptr is const char ** here but
                           * doHomeWork takes char **. */
                          PREFIX(doHomeWork)(data, parserState, ptr, numOfChars);
                          return SPP_ERROR_NONE;
                      
                  }
                  else if (BT_QUEST == byteType)
                      *parserState = S_9;
                  else if (PREFIX(is_name_start_char)(byteType)) 
                  {
                      *parserState = S_4;
                  }    
                  else if (BT_S != byteType)
                      *parserState = S_14;
                  break;
              
              /* S_4: inside an attribute name */
              case S_4:
                  if (BT_EQUALS == byteType) 
                  {
                      *parserState = S_6;
                  }
                  else if (BT_S == byteType) 
                  {
                      *parserState = S_5;
                  }
                  else if (!PREFIX(is_name_char)(byteType))
                      *parserState = S_14;
                  break;
          
              /* S_5: whitespace between attribute name and '=' */
              case S_5:
                  if (BT_EQUALS == byteType)
                      *parserState = S_6;
                  else if (BT_S != byteType)
                      *parserState = S_14;
                  break;
          
              /* S_6: after '=', waiting for the opening quote */
              case S_6:
                  if (BT_APOS == byteType) 
                  {
                      *parserState = S_8;
                  }
                  else if (BT_QUOT == byteType) 
                  {
                      *parserState = S_7;
                  }
                  else if (BT_S != byteType)
                      *parserState = S_14;
                  break;
          
              /* S_7: inside a double-quoted value; '<' and '&' are errors */
              case S_7:
                  if (BT_QUOT == byteType) 
                  {
                      *parserState = S_3;
                  }
                  else if(BT_LT == byteType || BT_AMP == byteType)
                  {                    
                      *parserState = S_14;
                      /* some problem exists. fix! */
                  }
                  break;
              
              /* S_8: inside a single-quoted value; '<' and '&' are errors */
              case S_8:
                  if (BT_APOS == byteType) 
                  {
                      *parserState = S_3;
                  }
                  else if (BT_LT == byteType || BT_AMP == byteType)
                      *parserState = S_14;
                  break;
              
              /* S_9: after a '?' in S_2 or S_3; only '>' is accepted */
              case S_9:
                  data->type = EMPTY_ELEMENT_TAG;
                  
                  if (BT_GT == byteType) 
                  {
                      PREFIX(doHomeWork)(data, parserState, ptr, numOfChars);
                      return SPP_ERROR_NONE;    
                  }
                  else
                      *parserState = S_14;
                  break;
          
              /* S_10: after "<??"; a name must start here */
              case S_10:
                  if (PREFIX(is_name_start_char)(byteType)) 
                  {
                      *parserState = S_11;
                  }
                  else
                      *parserState = S_14;
                  break;
          
              /* S_11: inside the name started in S_10 */
              case S_11:
                  if (BT_GT == byteType) 
                  {
                      *parserState = S_0;
                  }
                  else if (BT_S == byteType) 
                  {
                      *parserState = S_12;
                  }
                  else if (!PREFIX(is_name_char)(byteType))
                      *parserState = S_14;
                  break;
          
              /* S_12: whitespace after that name; only '>' may follow */
              case S_12:
                  if (BT_GT == byteType) 
                  {
                      *parserState = S_0;
                  }
                  else if (BT_S != byteType)
                      *parserState = S_14;
                  break;
          
              /* S_13: after '<'; the prolog scanner accepts only '?' here */
              case S_13:
                  if(BT_QUEST == byteType)
                      *parserState = S_1;
                  else
                      *parserState = S_14;
                  break;
  
              /* S_14: error state.  NOTE(review): there is no break, so
               * control falls through to default and returns -S_14. */
              case S_14:
                  printf("some problem exists. fix it!\n");
      
              default:
                  
                  return -(*parserState);
              }
  
              /* consume the unit just examined */
              *numOfChars -= MINBPC(enc);
              *ptr += MINBPC(enc);
          }
  
          /* input exhausted before a token was completed */
          return SPP_ERROR_TOKENIZER_FAILED;
  
  }
  
  static int PTRCALL
  PREFIX(tokenizeContent)(int* parserState, TokDataStruct* data, const ENCODING *enc,
                          int *numOfChars, char* end, const char **ptr)
  { 
      /* printf("*numOfChars:%d\n", *numOfChars); */
      int byteType;
      while(*numOfChars >= MINBPC(enc))
      {
          byteType = BYTE_TYPE(enc, *ptr);
          /*printf("*ptr:%s\n", *ptr);*/
          /* printf("byteType:%d\n", byteType); */
          /* printf("*parserState:%d\n", *parserState); */
          switch (*parserState) 
          {
              case S_0:
                  /* if ('<' == ch) { */
                  if(BT_LT == byteType) 
                  {
                      if (data->numOfPtrs) 
                      {
                          PREFIX(add_ptr)(*ptr - 1, data);
                          return SPP_ERROR_NONE;
                      }
                      
                      *parserState = S_1;
                  }
                  else
                  if (!data->numOfPtrs) 
                  {
                      data->type = PCDATA;
                      PREFIX(add_ptr)(*ptr, data); 
                  }
                  
                  break;
          
              case S_1:
                  /* if ('/' == ch) { */
                  if(BT_SOL == byteType)
                  {
                      data->type = END_TAG;
                      
                      *parserState = S_10;
                  }
                  else if (PREFIX(is_name_start_char)(byteType)) 
                  {
                      data->type = START_TAG;
                      PREFIX(add_ptr)(*ptr, data);
                      
                      *parserState = S_2;
                  }
                  else
                      *parserState = S_13;
                  
                  break;
          
              case S_2:
                  if (BT_GT == byteType) 
                  {
                      if (data->numOfPtrs) 
                      {
                          PREFIX(add_ptr)(*ptr - 1, data);
                          return SPP_ERROR_NONE;
                      }
                      
                      *parserState = S_0;
                  }
                  else if (BT_SOL == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr - 1, data);
                      
                      *parserState = S_9;
                  }
                  /* else if (is_white_space(ch)) { */
                  else if (BT_S == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr - 1, data);
                      
                      *parserState = S_3;
                  }    
                  /* else if (!is_name_char(ch)) */
                  else if (!PREFIX(is_name_char)(byteType))
                      *parserState = S_13;
                  break;
          
              case S_3:
                  if (BT_GT == byteType) 
                  {
                      if (data->numOfPtrs)
                          return SPP_ERROR_NONE;
                      
                      *parserState = S_0;
                  }
                  /* else if ('/' == ch) */
                  else if (BT_SOL == byteType)
                      *parserState = S_9;
                  else if (PREFIX(is_name_start_char)(byteType)) 
                  {
                      PREFIX(add_ptr)(*ptr, data);
                      
                      *parserState = S_4;
                  }    
                  else if (BT_S != byteType)
                      *parserState = S_13;
                  break;
              
              case S_4:
                  /* if ('=' == ch) { */
                  if (BT_EQUALS == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr - 1, data);
                      
                      *parserState = S_6;
                  }
                  else if (BT_S == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr - 1, data);
                      
                      *parserState = S_5;
                  }
                  else if (!PREFIX(is_name_char)(byteType))
                      *parserState = S_13;
                  break;
          
              case S_5:
                  /* if ('=' == ch) */
                  if (BT_EQUALS == byteType)
                      *parserState = S_6;
                  else if (BT_S != byteType)
                      *parserState = S_13;
                  break;
          
              case S_6:
                  /* if ('\'' == ch) { */
                  if (BT_APOS == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr, data);
                      
                      *parserState = S_8;
                  }
                  /* else if ('"' == ch) { */
                  else if (BT_QUOT == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr, data);
                      
                      *parserState = S_7;
                  }
                  else if (BT_S != byteType)
                      *parserState = S_13;
                  break;
          
              case S_7:
                  /* if ('"' == ch) { */
                  if (BT_QUOT == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr, data);
                      
                      *parserState = S_3;
                  }
                  /* else if ('<' == ch || '&' == ch) */
                  else if (BT_LT == byteType || BT_AMP == byteType)
                      *parserState = S_13;
                  /* some problem exists. fix it!. */
                  break;
              
              case S_8:
                  /* if ('\'' == byteType) { */
                  if (BT_APOS == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr, data);
                  
                      *parserState = S_3;
                  }
                  /* else if ('<' == ch || '&' == ch) */
                  else if (BT_LT == byteType || BT_AMP == byteType)
                      *parserState = S_13;
                  break;
              
              case S_9:
                  data->type = EMPTY_ELEMENT_TAG;
                  
                  /* if ('>' == ch) { */
                  if (BT_GT == byteType) 
                  {
                      if (data->numOfPtrs)
                          return SPP_ERROR_NONE;
                      
                      *parserState = S_0;
                  }
                  else
                      *parserState = S_13;
                  break;
          
              case S_10:
                  if (PREFIX(is_name_start_char)(byteType)) 
                  {
                      PREFIX(add_ptr)(*ptr, data);
                      
                      *parserState = S_11;
                  }
                  else
                      *parserState = S_13;
                  break;
          
              case S_11:
                  /* if ('>' == ch) { */
                  if (BT_GT == byteType) 
                  {
                      if (data->numOfPtrs) 
                      {
                          PREFIX(add_ptr)(*ptr - 1, data);
                          return SPP_ERROR_NONE;
                      }
                      
                      *parserState = S_0;
                  }
                  else if (BT_S == byteType) 
                  {
                      PREFIX(add_ptr)(*ptr - 1, data);
                      
                      *parserState = S_12;
                  }
                  else if (!PREFIX(is_name_char)(byteType))
                      *parserState = S_13;
                  break;
          
              case S_12:
                  /* if ('>' == ch) { */
                  if (BT_GT == byteType) 
                  {
                      if (data->numOfPtrs)
                          return SPP_ERROR_NONE;
                      
                      *parserState = S_0;
                  }
                  else if (BT_S != byteType)
                      *parserState = S_13;
                  break;
          
              case S_13:
                  /* some problem exists. fix it!. */
      
              default:
                  
                  return -(*parserState);
              }
  
              *numOfChars -= MINBPC(enc);
              *ptr += MINBPC(enc);
          }
  
          return SPP_ERROR_TOKENIZER_FAILED;
  
  }
  
  /*
   * Compare the name held in [ptr1, end1) with the NUL-terminated string
   * ptr2, one character at a time.
   *
   * enc   encoding handed to MINBPC() and CHAR_MATCHES()
   * ptr1  first byte of the name to test
   * end1  one past the last byte of the name
   * ptr2  NUL-terminated string to compare against
   *
   * Returns non-zero only when every character of ptr2 matched and the name
   * was consumed exactly up to end1; returns 0 otherwise.
   */
  static int PTRCALL
  PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
                           const char *end1, const char *ptr2)
  {
    for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) 
    {
      /*
       * Stop as soon as less than one full character unit is left.  A plain
       * "ptr1 == end1" test would be stepped over when the remaining length
       * is not a multiple of MINBPC(enc), letting CHAR_MATCHES read beyond
       * end1.
       */
      if (end1 - ptr1 < MINBPC(enc))
        return 0;
      if (!CHAR_MATCHES(enc, ptr1, *ptr2))
        return 0;
    }
    /* ptr2 is exhausted: it is a match only if the name is exhausted too. */
    return ptr1 == end1;
  }
              
  
  
  
  1.1                  ws-axis/c/src/xml/txpp/lib/spp_tokenizer.h
  
  Index: spp_tokenizer.h
  ===================================================================
  /*
   *   Copyright 2003-2004 The Apache Software Foundation.
   *
   *   Licensed under the Apache License, Version 2.0 (the "License");
   *   you may not use this file except in compliance with the License.
   *   You may obtain a copy of the License at
   *
   *       http://www.apache.org/licenses/LICENSE-2.0
   *
   *   Unless required by applicable law or agreed to in writing, software
   *   distributed under the License is distributed on an "AS IS" BASIS,
   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   *   See the License for the specific language governing permissions and
   *   limitations under the License.
   */
  
  /*
   * Byte type codes tested by the tokenizer state machine (e.g. BT_LT,
   * BT_GT, BT_SOL, BT_S).  The enumerators are numbered consecutively from
   * 0; the number in each comment is the enumerator's value.
   */
  enum
  {
    BT_NONXML = 0,  /*  0 */
    BT_MALFORM,     /*  1 */
    BT_LT,          /*  2: less than '<' */
    BT_AMP,         /*  3: ampersand '&' */
    BT_RSQB,        /*  4 */
    BT_LEAD2,       /*  5 */
    BT_LEAD3,       /*  6 */
    BT_LEAD4,       /*  7 */
    BT_TRAIL,       /*  8 */
    BT_CR,          /*  9: carriage return, judging by the name */
    BT_LF,          /* 10: new line (line feed) */
    BT_GT,          /* 11: greater than '>' */
    BT_QUOT,        /* 12: double quote */
    BT_APOS,        /* 13: single quote */
    BT_EQUALS,      /* 14: equal sign */
    BT_QUEST,       /* 15: question mark */
    BT_EXCL,        /* 16 */
    BT_SOL,         /* 17: slash */
    BT_SEMI,        /* 18 */
    BT_NUM,         /* 19 */
    BT_LSQB,        /* 20 */
    BT_S,           /* 21: space */
    BT_NMSTRT,      /* 22: name start characters such as '_' */
    BT_COLON,       /* 23: colon */
    BT_HEX,         /* 24: hexadecimal characters */
    BT_DIGIT,       /* 25: digit */
    BT_NAME,        /* 26: name characters such as '.' */
    BT_MINUS,       /* 27: dash (minus sign) */
    BT_OTHER,       /* 28: known not to be a name or name start character, e.g. '`' */
    BT_NONASCII,    /* 29: might be a name or name start character */
    BT_PERCNT,      /* 30 */
    BT_LPAR,        /* 31 */
    BT_RPAR,        /* 32 */
    BT_AST,         /* 33 */
    BT_PLUS,        /* 34 */
    BT_COMMA,       /* 35 */
    BT_VERBAR       /* 36 */
  };
  
  
  #include <stddef.h>
  
  
  
  
  

Mime
View raw message