stdcxx-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From se...@apache.org
Subject svn commit: r448754 [2/6] - /incubator/stdcxx/trunk/util/
Date Fri, 22 Sep 2006 00:42:17 GMT
Modified: incubator/stdcxx/trunk/util/codecvt.cpp
URL: http://svn.apache.org/viewvc/incubator/stdcxx/trunk/util/codecvt.cpp?view=diff&rev=448754&r1=448753&r2=448754
==============================================================================
--- incubator/stdcxx/trunk/util/codecvt.cpp (original)
+++ incubator/stdcxx/trunk/util/codecvt.cpp Thu Sep 21 17:42:16 2006
@@ -2,20 +2,27 @@
  *
  * codecvt.cpp
  *
- * $Id: //stdlib/dev/source/stdlib/util/codecvt.cpp#4 $
+ * $Id$
  *
  ***************************************************************************
  *
- * Copyright (c) 1994-2005 Quovadx,  Inc., acting through its  Rogue Wave
- * Software division. Licensed under the Apache License, Version 2.0 (the
- * "License");  you may  not use this file except  in compliance with the
- * License.    You    may   obtain   a   copy   of    the   License    at
- * http://www.apache.org/licenses/LICENSE-2.0.    Unless   required    by
- * applicable law  or agreed to  in writing,  software  distributed under
- * the License is distributed on an "AS IS" BASIS,  WITHOUT WARRANTIES OR
- * CONDITIONS OF  ANY KIND, either  express or implied.  See  the License
- * for the specific language governing permissions  and limitations under
- * the License.
+ * Licensed to the Apache Software  Foundation (ASF) under one or more
+ * contributor  license agreements.  See  the NOTICE  file distributed
+ * with  this  work  for  additional information  regarding  copyright
+ * ownership.   The ASF  licenses this  file to  you under  the Apache
+ * License, Version  2.0 (the  "License"); you may  not use  this file
+ * except in  compliance with the License.   You may obtain  a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the  License is distributed on an  "AS IS" BASIS,
+ * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
+ * implied.   See  the License  for  the  specific language  governing
+ * permissions and limitations under the License.
+ *
+ * Copyright 2001-2006 Rogue Wave Software.
  * 
  **************************************************************************/
 
@@ -26,264 +33,347 @@
 #include "scanner.h"      // for scanner
 
 #include <cassert>        // for assert()
+#include <climits>        // for UCHAR_MAX
+#include <cstring>        // for memset()
 #include <fstream>        // for ifstream, ofstream
 
 
-// all characters should go into the codecvt_mb_set
-void Def::
-gen_valid_codecvt_mb_set () 
+typedef std::map<std::string, wchar_t>::const_iterator n_cmap_citer2;
+
+
+std::size_t Def::
+gen_mbchar_tables (codecvt_offsets_map_t           &tab,
+                   std::map<std::string, unsigned> &off_map,
+                   const std::string               &charp /* = "" */,
+                   unsigned                         tabno /* = 0 */)
 {
-    if (!valid_codecvt_mb_set_.empty())
-        return;
-    
-    for (n_cmap_iter2 n_cmap_it = charmap_.get_n_cmap2().begin();
-         n_cmap_it != charmap_.get_n_cmap2().end(); n_cmap_it++) {
-        std::string valid = n_cmap_it->first.substr 
-            (0, n_cmap_it->first.size() - 1);
-        while (valid.size() > 0){
-            valid_codecvt_mb_set_.insert (valid);
-            valid = valid.substr(0, valid.size() - 1); 
+    // upon the first call (but not during subsequent recursive calls)
+    // generate a set of multibyte prefixes from the set of all known
+    // multibyte characters
+    static unsigned               ntabs   = 0;
+    static std::set<std::string>* pfx_set = 0;
+
+    const n_cmap_citer2 mb_map_end = charmap_.get_mb_cmap ().end ();
+
+    if (0 == pfx_set) {
+        pfx_set = new std::set<std::string>;
+
+        // iterate over the range of valid multibyte characters
+        // obtained from the charmap and generate a complete
+        // subset of non-empty multibyte prefixes from each
+        unsigned off = 0;
+
+        const n_cmap_citer2 mb_map_begin = charmap_.get_mb_cmap ().begin ();
+
+        for (n_cmap_citer2 it = mb_map_begin; it != mb_map_end; ++it, ++off) {
+
+            // insert the ordinal number of each multibyte character
+            // into a map for fast lookup later
+            off_map.insert (std::make_pair (it->first, off));
+
+            // generate non-empty prefixes up to one byte less
+            // in length than the complete multibyte character
+            for (std::string prefix = it->first; 1 < prefix.size (); ) {
+                prefix = prefix.substr (0, prefix.size () - 1);
+                pfx_set->insert (prefix);
+            }
         }
     }
-}
 
+    // number of valid characters inserted into the tables
+    std::size_t nchars = 0;
 
-void Def::
-create_wchar_utf8_table ()
-{
-    if (!wchar_utf8_to_ext_.empty())
-        return;
+    // an array of offsets to the multibyte character or to the next
+    // array containing such offsets (defined recursively for up to
+    // MB_CUR_MAX levels of nesting)
+    codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;
 
-    n_cmap_iter2 n_cmap_it;
-    for (n_cmap_it = charmap_.get_n_cmap2().begin();
-         n_cmap_it != charmap_.get_n_cmap2().end();
-         n_cmap_it ++) {
-        std::string wchar_utf8 = utf8_encode (n_cmap_it->second);
-        wchar_utf8_to_ext_.insert (std::make_pair (wchar_utf8, 
-                                                   n_cmap_it->first));
-    }
-}
+    std::string mb_char (charp + '\0');
 
+    for (unsigned i = 0; i <= UCHAR_MAX; ++i) {
 
-void Def::
-gen_valid_codecvt_wchar_set () {
+        unsigned char cur_char = (unsigned char)i;
 
-    if (!valid_codecvt_wchar_set_.empty())
-        return;
+        mb_char [mb_char.size () - 1] = char (cur_char);
 
-    create_wchar_utf8_table();
+        if (mb_map_end == charmap_.get_mb_cmap ().find (mb_char)) {
+            // mb_char is not a complete, valid multibyte character
+            // check to see if it's a prefix of one
+
+            if (pfx_set->find (mb_char) == pfx_set->end ()) {
+                // mb_char is not a prefix of a valid multibyte
+                // character, mark it invalid
+                offsets->off [cur_char] = UINT_MAX;
+            }
+            else {
+                // mb_char is a prefix of a valid multibyte character,
+                // set the MSB to denote that it "continues" in the
+                // table at the next higher offset
+                offsets->off [cur_char] = ++ntabs | 0x80000000;
 
-    for (wchar_utf8_iter it = wchar_utf8_to_ext_.begin();
-         it != wchar_utf8_to_ext_.end(); it++) {
-        std::string str = it->first.substr (0, it->first.size () - 1);
-        while (str.size() > 0) {
-            valid_codecvt_wchar_set_.insert (str);
-            str = str.substr (0, str.size() - 1); 
+                // generate that table
+                nchars += gen_mbchar_tables (tab, off_map, mb_char, ntabs);
+            }
+        }
+        else {
+            // mb_char is a complete, valid multibyte character
+            // insert its ordinal number (offset) into the array
+            offsets->off [cur_char] = off_map.find (mb_char)->second;
+            ++nchars;
         }
     }
-}
 
+    // insert the completely populated table into the map
+    tab.insert (std::make_pair (tabno, offsets));
 
-void Def::
-gen_valid_codecvt_utf8_set () {
-
-    if (!valid_codecvt_utf8_set_.empty())
-        return;
-
-    for (ucs4_cmap_iter it = charmap_.get_ucs4_cmap().begin();
-         it != charmap_.get_ucs4_cmap().end(); it++) {
-        std::string str = utf8_encode(it->second);
-        str = str.substr (0, str.size () - 1);
-        while (str.size() > 0) {
-            valid_codecvt_utf8_set_.insert (str);
-            str = str.substr (0, str.size() - 1); 
-        }
+    if (0 == ntabs) {
+        // clean up on return from the topmost (non-recursive) call
+        delete pfx_set;
+        pfx_set = 0;
     }
+
+    return nchars;
 }
 
 
-void Def::
-generate_codecvt_table (const std::string &charp, 
-                        unsigned int tab_num)
+std::size_t Def::
+gen_wchar_tables (codecvt_offsets_map_t &tab,
+                  const std::string     &charp /* = "" */, 
+                  unsigned int           tabno /* = 0 */)
 {
-    gen_valid_codecvt_mb_set();
-    ctype_offset_tab_t tab;
-    n_cmap_iter2 n_cmap_it;
+    // upon the first call (but not during subsequent recursive calls)
+    // generate a set of UTF-8 prefixes from the UTF-8 encodings of
+    // the wide character values of all known multibyte characters
+    static unsigned                            ntabs   = 0;
+    static std::set<std::string>              *pfx_set = 0;
+    static std::map<std::string, unsigned>    *off_map = 0;
+    static std::map<std::string, std::string> *utf_map = 0;
 
-    for (unsigned int i = 0; i <= UCHAR_MAX; i++) {
+    if (0 == utf_map) {
+        pfx_set = new std::set<std::string>;
+        off_map = new std::map<std::string, unsigned>;
+        utf_map = new std::map<std::string, std::string>;
 
-        unsigned char cur_char = (unsigned char)i;
+        const n_cmap_citer2 first = charmap_.get_mb_cmap ().begin ();
+        const n_cmap_citer2 last  = charmap_.get_mb_cmap ().end ();
 
-        std::string mb_char = charp;
-        mb_char += char (cur_char);
+        unsigned off = 0;
 
-        n_cmap_it = charmap_.get_n_cmap2 ().find (mb_char);
+        for (n_cmap_citer2 it = first; it != last; ++it) {
 
-        if (n_cmap_it == charmap_.get_n_cmap2 ().end ()) {
+            off_map->insert (std::make_pair (it->first, off));
 
-            if (   valid_codecvt_mb_set_.find (mb_char) 
-                != valid_codecvt_mb_set_.end ()) {
+            off += it->first.size () + 1;
 
-                ++next_codecvt_tab_num_;
-                tab.off [cur_char] = next_codecvt_tab_num_ | 0x80000000;
-                generate_codecvt_table (mb_char, next_codecvt_tab_num_);
-            }
-            else {
-                tab.off [cur_char] = UINT_MAX;
+            std::string utf = utf8_encode (it->second);
+
+            utf_map->insert (std::make_pair (utf, it->first));
+
+            while (1 < utf.size ()) {
+                utf = utf.substr (0, utf.size () - 1);
+                pfx_set->insert (utf);
             }
         }
-        else {
-            // get the offset for this character and put it in the table
-            tab.off[cur_char] = wchar_off_map_.find (mb_char)->second;
-        }
     }
 
-    mb_char_offs_.insert (std::make_pair (tab_num, tab));
-}
+    codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;
 
+    // number of valid characters inserted into the tables
+    std::size_t nchars = 0;
 
-void Def::
-generate_wchar_codecvt_table (const std::string &charp, 
-                              unsigned int tab_num)
-{
-    gen_valid_codecvt_wchar_set();
+    std::string mb_char (charp + '\0');
 
-    ctype_offset_tab_t tab;
-    wchar_utf8_iter wu_it;
+    for (unsigned i = 0; i <= UCHAR_MAX; ++i) {
 
-    for (unsigned int i = 0; i <= UCHAR_MAX; i++){
         unsigned char cur_char = (unsigned char)i;
-        std::string mb_char (charp);
-        mb_char += (char)cur_char;
 
-        wu_it = wchar_utf8_to_ext_.find (mb_char);
-        if (wu_it != wchar_utf8_to_ext_.end()) {
-            tab.off[cur_char] = (mb_char_off_map_.find 
-                                 (wu_it->second))->second;
-        }
-        else {
-            valid_codecvt_wchar_set_iter wit = valid_codecvt_wchar_set_.find (mb_char);
-            if (wit != valid_codecvt_wchar_set_.end()) {
-                ++next_wchar_codecvt_tab_num_;
-                tab.off[cur_char] = next_wchar_codecvt_tab_num_ | 0x80000000;
-                generate_wchar_codecvt_table (mb_char, 
-                                              next_wchar_codecvt_tab_num_);
+        mb_char [mb_char.size () - 1] = char (cur_char);
+
+        const wchar_utf8_iter it = utf_map->find (mb_char);
+        if (it == utf_map->end ()) {
+            if (pfx_set->find (mb_char) == pfx_set->end ()) {
+                offsets->off [cur_char] = UINT_MAX;
             }
             else {
-                tab.off[cur_char] = UINT_MAX;
+                offsets->off [cur_char] = ++ntabs | 0x80000000;
+
+                nchars += gen_wchar_tables (tab, mb_char, ntabs);
             }
         }
+        else {
+            offsets->off [cur_char] = off_map->find (it->second)->second;
+
+            ++nchars;
+        }
+    }
+
+    tab.insert (std::make_pair (tabno, offsets));
+
+    if (0 == ntabs) {
+        // clean up
+        delete pfx_set;
+        delete utf_map;
+
+        pfx_set = 0;
+        utf_map = 0;
     }
 
-    wchar_offs_.insert (std::make_pair (tab_num, tab));
+    return nchars;
 }
 
 
-void Def::
-gen_utf8_map()
+std::size_t Def::
+gen_utf8_tables (codecvt_offsets_map_t           &tab,
+                 std::map<std::string, unsigned> &off_map,
+                 const std::string               &charp /* = "" */,
+                 unsigned                         tabno /* = 0 */)
 {
-    if (!utf8_map_.empty())
-        return;
+    static unsigned                        ntabs   = 0;
+    static std::set<std::string>          *pfx_set = 0;
+    static std::map<std::string, wchar_t> *utf_map = 0;
+
+    if (0 == pfx_set) {
+        pfx_set = new std::set<std::string>;
+
+        const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
+        const ucs4_cmap_iter last  = charmap_.get_ucs4_cmap ().end ();
+
+        for (ucs4_cmap_iter it = first; it != last; ++it) {
+
+            for (std::string prefix = utf8_encode (it->second);
+                 1 < prefix.size (); ) {
+                prefix = prefix.substr (0, prefix.size () - 1);
+                pfx_set->insert (prefix);
+            }
+        }
+    }
+
+    // the set of complete utf8 strings in the current character map
+    typedef std::map<std::string, wchar_t>::iterator utf8_map_iter;
+
+    if (0 == utf_map) {
+        utf_map = new std::map<std::string, wchar_t>;
     
-    for (ucs4_cmap_iter it = charmap_.get_ucs4_cmap().begin();
-         it != charmap_.get_ucs4_cmap().end(); it++) {
-        utf8_map_.insert (std::make_pair(utf8_encode (it->second),
-                                         it->second));
+        const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
+        const ucs4_cmap_iter last  = charmap_.get_ucs4_cmap ().end ();
+
+        for (ucs4_cmap_iter it = first; it != last; ++it) {
+            const std::string utf = utf8_encode (it->second);
+            utf_map->insert (std::make_pair (utf, it->second));
+        }
     }
-}
 
+    codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;
 
-void Def::
-generate_utf8_codecvt_table (const std::string &charp, 
-                             unsigned int tab_num)
-{
-    gen_valid_codecvt_utf8_set();
-    gen_utf8_map();
+    // number of valid characters inserted into the tables
+    std::size_t nchars = 0;
 
-    ctype_offset_tab_t tab;
-    utf8_map_iter utf8_it;
+    std::string mb_char = charp + '\0';
+
+    for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
 
-    for (unsigned int i = 0; i <= UCHAR_MAX; i++){
         unsigned char cur_char = (unsigned char)i;
-        std::string mb_char = charp;
-        mb_char += (char)cur_char;
-        if ((utf8_it = utf8_map_.find (mb_char)) 
-            != utf8_map_.end()) {
+
+        mb_char [mb_char.size () - 1] = char (cur_char);
+
+        const utf8_map_iter where = utf_map->find (mb_char);
+
+        if (where == utf_map->end ()) {
+            if (pfx_set->find (mb_char) == pfx_set->end ()) {
+                offsets->off [cur_char] = UINT_MAX;
+            }
+            else {
+                offsets->off [cur_char] = ++ntabs | 0x80000000;
+                nchars += gen_utf8_tables (tab, off_map, mb_char, ntabs);
+            }
+        }
+        else {
             // first get the symbolic name
             std::string str
-                = charmap_.get_rucs4_cmap().find(utf8_it->second)->second;
+                = charmap_.get_rucs4_cmap ().find (where->second)->second;
+
             // then get the internal encoding of the character
-            wchar_t int_enc = charmap_.get_w_cmap().find (str)->second;
+            const wchar_t int_enc = charmap_.get_w_cmap().find (str)->second;
+
             // then get the external encoding to use in a lookup in 
             // mb_char_off_map
-            str = charmap_.get_rn_cmap2().find (int_enc)->second;
-            tab.off[cur_char] = (mb_char_off_map_.find 
-                                 (str))->second;
-        }
-        else {
-            if (valid_codecvt_utf8_set_.find (mb_char) 
-                != valid_codecvt_utf8_set_.end()) {
-                ++next_utf8_codecvt_tab_num_;
-                tab.off[cur_char] = next_utf8_codecvt_tab_num_ | 0x80000000;
-                generate_utf8_codecvt_table (mb_char, 
-                                             next_utf8_codecvt_tab_num_);
-            }
-            else {
-                tab.off[cur_char] = UINT_MAX;
-            }
+            str = charmap_.get_rmb_cmap ().find (int_enc)->second;
+
+            offsets->off [cur_char] = off_map.find (str)->second;
+
+            ++nchars;
         }
     }
-    utf8_offs_.insert (std::make_pair (tab_num, tab));
+
+    tab.insert (std::make_pair (tabno, offsets));
+
+    if (0 == ntabs) {
+        // clean up
+        delete pfx_set;
+        delete utf_map;
+
+        pfx_set = 0;
+        utf_map = 0;
+    }
+    return nchars;
 }
 
 
 void Def::
-generate_xliteration_data ()
+gen_xlit_data ()
 {
     // data offset points to the beginning of the data containing
     // the narrow strings character encodings
     unsigned int data_offset = 0;
 
     // traverse the map and construct the map of offsets
-    xlit_map_t::iterator it = xlit_map_.begin ();
-    for (; it != xlit_map_.end (); it++) {
+    xlit_map_t::const_iterator it = xlit_map_.begin ();
+    for (; it != xlit_map_.end (); ++it) {
         // insert pair(wchar_t value, offset of first string in data block)
         xlit_data_offset_map_.insert (
             std::make_pair (it->first,data_offset));
 
         // advance the data_offset value to the next "first" string
-        std::list<std::string>::iterator sit = 
+        std::list<std::string>::const_iterator sit = 
             it->second.begin ();
-        for (; sit != it->second.end (); sit++) {
+        for (; sit != it->second.end (); ++sit) {
             data_offset += sit->size () + 1;
         }
-        data_offset++;
+        ++data_offset;
     }
 
     // create a new table (first), populate it with default values
     // and insert it in the tables map
     xlit_offset_table_t table0;
     unsigned int k;
-    for (k = 0; k < _RWSTD_UCHAR_MAX + 1; k++)
-        table0.offset_table [k] = _RWSTD_UINT_MAX;
+    for (k = 0; k < UCHAR_MAX + 1; ++k)
+        table0.offset_table [k] = UINT_MAX;
 
     // insert it into the map
     xlit_table_map_.insert (std::make_pair(0, table0));
 
+    const xlit_map_t::const_iterator xlit_map_end = xlit_map_.end ();
+
     // traverse the map again and build the tables
-    for (it = xlit_map_.begin (); it != xlit_map_.end (); it++) {
+    for (it = xlit_map_.begin (); it != xlit_map_end; ++it) {
+
         // encode the wchar_t value to UTF-8
-        std::string utf8_rep (utf8_encode (it->first));
+        const std::string utf8_rep (utf8_encode (it->first));
         data_offset = xlit_data_offset_map_.find (it->first)->second;
 
         // traverse the utf8 representation string and create the 
         // necessary tables and populate the indexes
         unsigned int table_idx = 0;
-        std::string::iterator string_it = utf8_rep.begin ();
-        for (; string_it != utf8_rep.end (); string_it++) {
+
+        const std::string::const_iterator utf8_rep_end = utf8_rep.end ();
+        std::string::const_iterator       string_it    = utf8_rep.begin ();
+
+        for (; string_it != utf8_rep_end; ++string_it) {
             // get the table corresponding to the current index and locate
             // the value at that index
-            xlit_table_map_t::iterator res = xlit_table_map_.find (table_idx);
+            const xlit_table_map_t::iterator res =
+                xlit_table_map_.find (table_idx);
+
             assert (res != xlit_table_map_.end ());
 
             // offset in table
@@ -291,12 +381,12 @@
 
             // res is the iterator pointing to the correct table in the map
             // check the index and if not populated, create a new table
-            if (res->second.offset_table [off_idx] == _RWSTD_UINT_MAX) {
+            if (res->second.offset_table [off_idx] == UINT_MAX) {
 
                 // if this is the last position in the string, then
                 // fill the table position with the offset of the string data
                 if ((string_it + 1) == utf8_rep.end ()) {
-                    xlit_data_offset_map_t::iterator data_it = 
+                    xlit_data_offset_map_t::const_iterator data_it = 
                         xlit_data_offset_map_.find (it->first);
                     assert (data_it != xlit_data_offset_map_.end ());
 
@@ -307,8 +397,8 @@
 
                 // create a new table and append it to the map
                 xlit_offset_table_t table;
-                for (unsigned int i = 0; i < _RWSTD_UCHAR_MAX + 1; i++)
-                    table.offset_table [i] = _RWSTD_UINT_MAX;
+                for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i)
+                    table.offset_table [i] = UINT_MAX;
 
                 // insert it into the map
                 unsigned int tmp = xlit_table_map_.size ();
@@ -329,96 +419,98 @@
 void Def::
 write_codecvt (std::string dir_name)
 {
-    next_wchar_codecvt_tab_num_ = 0;
-    next_utf8_codecvt_tab_num_ = 0;
-
     // if it has been already written
     if (codecvt_written_)
         return;
 
     // compose the directory name 
     ((dir_name += _RWSTD_PATH_SEP) += "..") += _RWSTD_PATH_SEP;
-    dir_name += charmap_.get_code_set_name();
+    dir_name += charmap_.get_code_set_name ();
 
-    // check for its existence
-    std::ifstream in (dir_name.c_str(), std::ios::in);
-
-    if (in) {
+    // check to see if the codecvt database already exists and
+    // avoid recreating it if it does (as an optimization)
+    if (std::ifstream (dir_name.c_str ())) {
         issue_diag (I_OPENWR, false, 0,
                     "%s exists, skipping\n", dir_name.c_str ());
         return;
     }
 
-    issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());
+    //////////////////////////////////////////////////////////////////
+    // generate multibyte conversion tables
+    issue_diag (I_STAGE, false, 0, "generating multibyte tables\n");
 
-    // create the stream with exceptions enabled
-    std::ofstream out (dir_name.c_str(), std::ios::binary);
-    out.exceptions (std::ios::failbit | std::ios::badbit);
-            
-    std::size_t temp_off = 0;
-    std::size_t count_off = 0;
-    n_cmap_iter2 iter;
-    for (iter = charmap_.get_n_cmap2().begin();
-         iter != charmap_.get_n_cmap2().end(); iter++, count_off ++) {
-        mb_char_off_map_.insert (std::make_pair (iter->first, 
-                                                 temp_off));
-        wchar_off_map_.insert (std::make_pair (iter->first,
-                                               count_off));
-        temp_off += iter->first.size() + 1;
-              
-    }
+    codecvt_offsets_map_t           mbchar_offs;
+    std::map<std::string, unsigned> off_map;
+    const std::size_t n_mbchars = gen_mbchar_tables (mbchar_offs, off_map);
 
-    next_codecvt_tab_num_ = 0;
-    next_wchar_codecvt_tab_num_ = 0;
-                
-    generate_codecvt_table ("", 0);
-    generate_wchar_codecvt_table ("", 0);
-    generate_utf8_codecvt_table ("", 0);
+    // generate wchar_t conversion tables
+    issue_diag (I_STAGE, false, 0, "generating wchar_t tables\n");
+
+    codecvt_offsets_map_t wchar_offs;
+    const std::size_t n_wchars = gen_wchar_tables (wchar_offs);
+
+    // generate UTF-8 conversion tables
+    issue_diag (I_STAGE, false, 0, "generating UTF-8 tables\n");
+
+    codecvt_offsets_map_t uchar_offs;
+    const std::size_t n_uchars = gen_utf8_tables (uchar_offs, off_map);
+
+    // not needed beyond this point, clear it out
+    off_map.clear ();
     
     // generate the transliteration tables and the transliteration data
-    generate_xliteration_data ();
+    issue_diag (I_STAGE, false, 0, "generating transliteration tables\n");
+    gen_xlit_data ();
+
+    //////////////////////////////////////////////////////////////////
+    // populate the codecvt structure before writing it out
+    // in binary form to the file (the codecvt database)
+    _RW::__rw_codecvt_t codecvt_out;
+    std::memset (&codecvt_out, 0, sizeof codecvt_out);
+
+    // calculate byte offsets within the structure
+    codecvt_out.n_to_w_tab_off = 0;
+    codecvt_out.w_to_n_tab_off = codecvt_out.n_to_w_tab_off
+        + mbchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);
 
-    // calculate all offsets
-    codecvt_out_.n_to_w_tab_off = 0;
-    codecvt_out_.w_to_n_tab_off = codecvt_out_.n_to_w_tab_off 
-        + mb_char_offs_.size() * (UCHAR_MAX + 1) 
-        * sizeof (unsigned int);
-
-    codecvt_out_.utf8_to_ext_tab_off = codecvt_out_.w_to_n_tab_off
-        + wchar_offs_.size() * (UCHAR_MAX + 1)
-        * sizeof (unsigned int);
+    codecvt_out.utf8_to_ext_tab_off = codecvt_out.w_to_n_tab_off
+        + wchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);
 
     // insert the transliteration tables here
-    codecvt_out_.xliteration_off = codecvt_out_.utf8_to_ext_tab_off
-        + utf8_offs_.size() * (UCHAR_MAX + 1)
-        * sizeof (unsigned int);
-
-    codecvt_out_.wchar_off = codecvt_out_.xliteration_off + 
-        xlit_table_map_.size () * (_RWSTD_UCHAR_MAX + 1) * 
-        sizeof (unsigned int);
+    codecvt_out.xliteration_off = codecvt_out.utf8_to_ext_tab_off
+        + uchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);
 
-    codecvt_out_.codeset_off = codecvt_out_.wchar_off 
-        + charmap_.get_n_cmap2().size() * 2 * sizeof (wchar_t);
+    codecvt_out.wchar_off = codecvt_out.xliteration_off
+        + xlit_table_map_.size () * (UCHAR_MAX + 1) * sizeof (unsigned);
 
-    codecvt_out_.charmap_off = codecvt_out_.codeset_off 
-        + charmap_.get_code_set_name().size() + 1;
+    codecvt_out.codeset_off = codecvt_out.wchar_off
+        + charmap_.get_mb_cmap ().size () * 2 * sizeof (wchar_t);
+
+    codecvt_out.charmap_off = codecvt_out.codeset_off
+        + charmap_.get_code_set_name ().size () + 1 /* NUL */;
             
-    std::size_t mb_offset = codecvt_out_.charmap_off
-        + charmap_.get_charmap_name().size() + 1;
+    const std::size_t mb_offset = codecvt_out.charmap_off
+        + charmap_.get_charmap_name ().size () + 1 /* NUL */;
 
     // compute the size of narrow strings map which added to 
     // mb_offset will give the start of the transliteration data
     std::size_t xlit_data_offset = mb_offset;
-    for (iter = charmap_.get_n_cmap2().begin();
-         iter != charmap_.get_n_cmap2().end(); iter++) {
+
+    mb_cmap_iter iter;
+
+    for (iter = charmap_.get_mb_cmap ().begin();
+         iter != charmap_.get_mb_cmap().end(); ++iter) {
         xlit_data_offset += iter->first.size() + 1;
     }
 
     // now traverse again the utf8 tables for transliteration data
     // and recompute the offsets:
+    const xlit_table_map_t::const_iterator xlit_table_map_end =
+        xlit_table_map_.end ();
+
     xlit_table_map_t::iterator xit = xlit_table_map_.begin ();
-    for (; xit != xlit_table_map_.end (); xit++) {
-        for (unsigned int i = 0; i < _RWSTD_UCHAR_MAX + 1; i++) {
+    for (; xit != xlit_table_map_end; ++xit) {
+        for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i) {
             if (xit->second.offset_table [i] & 0x80000000)
                 continue;
             // add the offset for xliteration data 
@@ -426,78 +518,111 @@
         }
     }
 
-    wchar_offs_iter wchar_offs_it;
-    for (wchar_offs_it = wchar_offs_.begin(); 
-         wchar_offs_it != wchar_offs_.end(); wchar_offs_it ++) {
-        for (unsigned int i = 0; i <= UCHAR_MAX; i++) {
-            if (!((wchar_offs_it->second).off[i] & 0x80000000))
-                (wchar_offs_it->second).off[i] += mb_offset;
-        }
-    }
-
-    utf8_offs_iter utf8_offs_it;
-    for (utf8_offs_it = utf8_offs_.begin(); 
-         utf8_offs_it != utf8_offs_.end(); utf8_offs_it ++) {
-        for (unsigned int i = 0; i <= UCHAR_MAX; i++) {
-            if (!((utf8_offs_it->second).off[i] & 0x80000000))
-                (utf8_offs_it->second).off[i] += mb_offset;
-        }
-    }
-    codecvt_out_.mb_cur_max = charmap_.get_mb_cur_max();
+    codecvt_out.mb_cur_max = charmap_.get_mb_cur_max();
 
+    issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());
 
+    // create the stream with exceptions enabled
+    std::ofstream out (dir_name.c_str(), std::ios::binary);
+    out.exceptions (std::ios::failbit | std::ios::badbit);
+            
     // write the codecvt_out structure
-    out.write ((char*)&codecvt_out_, sizeof(codecvt_out_));
+    out.write ((char*)&codecvt_out, sizeof codecvt_out);
 
-    issue_diag (I_WRITE, false, 0, "writing char to wchar_t table\n");
+    typedef codecvt_offsets_map_t::iterator off_iter_t;
 
-    // write the narrow_to_wide tables
-    mb_char_offs_iter mb_char_offs_it;
-    for (mb_char_offs_it = mb_char_offs_.begin(); 
-         mb_char_offs_it != mb_char_offs_.end(); mb_char_offs_it++) {
-        for (unsigned int c = 0; c <= UCHAR_MAX; c++) {
-            out.write ((const char*)&mb_char_offs_it->second.off[c],
-                       sizeof (mb_char_offs_it->second.off[c]));
+    //////////////////////////////////////////////////////////////////
+    // write out the multibyte to wchar_t tables
+    issue_diag (I_WRITE, false, 0,
+                "writing %lu multibyte tables (%lu characters)\n",
+                mbchar_offs.size (), n_mbchars);
+
+    for (off_iter_t it = mbchar_offs.begin (); it != mbchar_offs.end (); ++it) {
+        for (unsigned i = 0; i <= UCHAR_MAX; ++i) {
+
+            const unsigned off = it->second->off [i];
+
+            out.write ((const char*)&off, sizeof off);
         }
+
+        delete it->second->off;
     }
 
-    issue_diag (I_WRITE, false, 0, "writing wchar_t to char table\n");
+    // not needed beyond this point, clear it out
+    mbchar_offs.clear ();
+
+    //////////////////////////////////////////////////////////////////
+    // write out the wchar_t to multibyte conversion tables
+    issue_diag (I_WRITE, false, 0,
+                "writing %lu wchar_t tables (%lu characters)\n",
+                wchar_offs.size (), n_wchars);
+
+    for (off_iter_t it = wchar_offs.begin (); it != wchar_offs.end (); ++it) {
+        for (unsigned i = 0; i <= UCHAR_MAX; ++i) {
+
+            // adjust offsets to multibyte characters (but not those
+            // to other tables or invalid encodings)
+            unsigned off = it->second->off [i];
+
+            if (!(off & 0x80000000))
+                off += mb_offset;
 
-    // now write the wide_to_narrow tables
-    for (wchar_offs_it = wchar_offs_.begin(); 
-         wchar_offs_it != wchar_offs_.end(); wchar_offs_it++) {
-        for (unsigned int c = 0; c <= UCHAR_MAX; c++) {
-            out.write ((const char*)&wchar_offs_it->second.off[c],
-                       sizeof (wchar_offs_it->second.off[c]));
+            out.write ((const char*)&off, sizeof off);
         }
+
+        delete it->second->off;
     }
             
-    issue_diag (I_WRITE, false, 0, "writing UTF-8 to char table\n");
+    // not needed beyond this point, clear it out
+    wchar_offs.clear ();
+
+    //////////////////////////////////////////////////////////////////
+    // write out the UTF-8 to (libc) multibyte tables
+    issue_diag (I_WRITE, false, 0,
+                "writing %lu UTF-8 tables (%lu characters)\n",
+                uchar_offs.size (), n_uchars);
+
+    for (off_iter_t it = uchar_offs.begin (); it != uchar_offs.end (); ++it) {
+        for (unsigned i = 0; i <= UCHAR_MAX; ++i) {
+
+            // adjust offsets to multibyte characters (but not those
+            // to other tables or invalid encodings)
+            unsigned off = it->second->off [i];
 
-    // write the utf8_to_external tables
-    for (utf8_offs_it = utf8_offs_it = utf8_offs_.begin(); 
-         utf8_offs_it != utf8_offs_.end(); utf8_offs_it++) {
-        for (unsigned int c = 0; c <= UCHAR_MAX; c++) {
-            out.write ((const char*)&utf8_offs_it->second.off[c],
-                       sizeof (utf8_offs_it->second.off[c]));
+            if (!(off & 0x80000000))
+                off += mb_offset;
+
+            out.write ((const char*)&off, sizeof off);
         }
+
+        delete it->second->off;
     }
 
-    issue_diag (I_WRITE, false, 0, "writing transliteration lookup table\n");
+    // not needed beyond this point, clear it out
+    uchar_offs.clear ();
+
+    //////////////////////////////////////////////////////////////////
+    // write out the transliteration UTF-8 lookup tables
+    issue_diag (I_WRITE, false, 0,
+                "writing transliteration table (size %lu)\n",
+                xlit_table_map_.size ());
 
-    // write the transliteration UTF-8 lookup tables
     xit = xlit_table_map_.begin ();
-    for (; xit != xlit_table_map_.end (); xit++) {
-        unsigned int* ptable = &xit->second.offset_table [0];
-        for (unsigned int i = 0; i < _RWSTD_UCHAR_MAX + 1; i++, ptable++) 
+    for (; xit != xlit_table_map_end; ++xit) {
+        const unsigned int* ptable = &xit->second.offset_table [0];
+        for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i, ++ptable) 
             out.write ((const char*)ptable, sizeof (unsigned int));
     }
 
-    issue_diag (I_WRITE, false, 0, "writing UCS to wchar_t table\n");
+    issue_diag (I_WRITE, false, 0,
+                "writing the UCS table (%lu characters)\n",
+                charmap_.get_mb_cmap ().size ());
+
+    const mb_cmap_iter n_cmap2_end = charmap_.get_mb_cmap ().end ();
 
     // write the locale-encoded wchar_t and the UCS4 wchar_t
-    for (iter = charmap_.get_n_cmap2().begin();
-         iter != charmap_.get_n_cmap2().end(); iter++) {
+    for (iter = charmap_.get_mb_cmap ().begin();
+         iter != n_cmap2_end; ++iter) {
         out.write ((const char*)&iter->second, sizeof (iter->second));
         out.write ((const char*)& (charmap_.get_ucs4_cmap().find 
                                    (charmap_.get_rw_cmap().find 
@@ -511,19 +636,21 @@
 
 
     // write out the narrow character strings
-    for (iter = charmap_.get_n_cmap2().begin();
-         iter != charmap_.get_n_cmap2().end(); iter++) {
+    for (iter = charmap_.get_mb_cmap().begin();
+         iter != n_cmap2_end; ++iter) {
         out.write (iter->first.c_str(), iter->first.size() + 1);
     }
 
-    issue_diag (I_WRITE, false, 0, "writing transliteration table\n");
+    issue_diag (I_WRITE, false, 0,
+                "writing transliteration data (size %lu)\n",
+                xlit_map_.size ());
 
     // write out the transliteration data
-    xlit_map_t::iterator xlit_data_it = xlit_map_.begin ();
-    for (; xlit_data_it != xlit_map_.end (); xlit_data_it++) {
-        std::list<std::string>::iterator sit = 
+    xlit_map_t::const_iterator xlit_data_it = xlit_map_.begin ();
+    for (; xlit_data_it != xlit_map_.end (); ++xlit_data_it) {
+        std::list<std::string>::const_iterator sit = 
             xlit_data_it->second.begin ();
-        for (; sit != xlit_data_it->second.end (); sit++) {
+        for (; sit != xlit_data_it->second.end (); ++sit) {
             out.write (sit->c_str (), sit->size () + 1);
         }
         out.write ("\0", 1);



Mime
View raw message