Author: sebor Date: Thu Sep 21 17:42:16 2006 New Revision: 448754 URL: http://svn.apache.org/viewvc?view=rev&rev=448754 Log: 2006-09-21 Martin Sebor * aliases.cpp (get_installed_locales): Redirected locale's stderr to /dev/null to prevent it from cluttering our own. Eliminated the handling of invalid category values. * charmap.h (): Included. (portable_charset): Declared. (get_n_cmap, get_rn_cmap, get_w_cmap, get_rw_cmap): Documented. (get_n_cmap2, get_rn_cmap2, get_strval_map, increment_val): Renamed... (get_mb_cmap, get_rmb_cmap, get_symnames_list, increment_wchar): ...to this. (get_strval_map): Replaced... (get_symnames_list): ...with this. (process_ellipsis): Changed to return the number of characters denoted by the ellipsis instead of void. (increment_strval): Renamed... (increment_encoding): ...to this for clarity and changed it to return true on success and false otherwise. (verify_portable_charset): Made const. (parse_ext_strval): Renamed... (encoding_to_mbchar): ...to this for clarity. (add_to_cmaps): Added a bool argument. (n_cmap2_, rn_cmap2_): Renamed... (mb_cmap_, rmb_cmap_): ...to this for clarity. (valid_mb_set_, valid_wchar_set_, strval_map_): Removed unused members. (symnames_list_): Added new member. * def.h (process_abs_ellipsis): New. (process_sym_ellipsis): Returned the number of characters denoted by the ellipsis instead of void. (hex_increment, dec_increment): Removed (defined as static non-members in collate.cpp). (process_mask): Added an argument denoting the human readable name of the mask for progress diagnostics. (process_transliteration, process_transliteration_statement, generate_xliteration_data): Renamed... (process_xlit, process_xlit_statement, gen_xlit_data): ...to this for brevity. (extract_string_array): New. (ctype_offset_tab_t): Renamed... (codecvt_offset_tab_t): ...to this. 
(wchar_off_map_, mb_char_offs_, wchar_offs_, wchar_utf8_to_ext_, utf8_offs_, valid_codecvt_utf8_set_, utf8_map_, next_codecvt_tab_num_, next_wchar_codecvt_tab_num_, next_utf8_codecvt_tab_num_, codecvt_out_): Removed data members. (mb_char_offs_iter, wchar_offs_iter, utf8_offs_iter, utf8_map_iter, (gen_valid_codecvt_wchar_set, gen_valid_codecvt_utf8_set, gen_utf8_map, generate_wchar_codecvt_table, generate_utf8_codecvt_table): Replaced member functions... (gen_mbchar_tables, gen_wchar_tables, gen_utf8_tables): ...with these. * diagnostic.h (E_NOTSUP, W_NOTSUP, W_CHARMAP, I_SKIP): Added. (W_COLVAL): Removed. * scanner.h (token_id): Arranged enumerators by LC_XXX section and documented. Added enumerators for LC_ADDRESS and other sections defined by ISO/IEC TR 14652. (tok_ellipsis, tok_dellipsis, tok_qellipsis, tok_doub_inc_ellipsis): Renamed... (tok_abs_ellipsis, tok_hex_ellipsis, tok_dec_ellipsis, tok_dbl_ellipsis): ...to this for better correspondence to ISO/IEC TR 14652 names. (tok_eof, tok_cont_line): Removed unused names. (tok_octal_value, tok_decimal_value, tok_hex_value): Replaced... (tok_char_value): ...with this for simplicity. * charmap.cpp (convert_escape): Removed dead code. (portable_charset): Defined. (convert_to_wc): Simplified, added comments. (increment_val): Renamed... (increment_wchar): ...to this, renamed locals for clarity, and replaced hardcoded 0xff with UCHAR_MAX (is this really correct when char is more than 8 bits wide?). (increment_strval): Renamed... (increment_encoding): ...to this, removed local statics, and optimized. The function is unused since it is not possible to increment a multibyte character so that its last byte overflows (i.e., wraps around to 0). It should/will be removed in a future commit. (parse_ext_strval): Renamed... (encoding_to_mbchar): ...to this and renamed locals. 
(convert_sym_to_ucs): Tightened up the checking of symbolic character names for validity (expect a hex digit after the leading " #include #include // for memcpy(), strlen() -#include +#include // for setlocale() #include #include #include @@ -541,10 +541,10 @@ } char* locname = slocname; - char* save_localename = 0; - if (loc_cat != int (LC_INVALID_CAT)) - save_localename = std::setlocale (loc_cat, 0); + // save the current locale setting and set the locale to "C" + const char* const save_localename = std::setlocale (LC_ALL, 0); + std::setlocale (LC_ALL, "C"); #if __GNUG__ == 2 && __GNUC_MINOR__ == 96 @@ -582,11 +582,11 @@ // sizeof ("locale -a | grep \"\" > ") // 22 // + strlen (fname) // must be <= L_tmpnam - char cmd [22 + L_tmpnam]; + char cmd [80 + L_tmpnam]; - std::sprintf (cmd, "/usr/bin/locale -a > %s 2>/dev/null", fname); + std::sprintf (cmd, "LC_ALL=C /usr/bin/locale -a >%s 2>/dev/null", fname); - int ret = std::system(cmd); + const int ret = std::system (cmd); if (ret && ret != 256) { std::strcpy (slocname, "call to system "); @@ -653,8 +653,10 @@ *locname = '\0'; } - if (loc_cat != int (LC_INVALID_CAT)) - std::setlocale (loc_cat, save_localename); + + // restore the original locale + if (save_localename) + std::setlocale (LC_ALL, save_localename); std::fclose (f); std::remove (fname); Modified: incubator/stdcxx/trunk/util/charmap.cpp URL: http://svn.apache.org/viewvc/incubator/stdcxx/trunk/util/charmap.cpp?view=diff&rev=448754&r1=448753&r2=448754 ============================================================================== --- incubator/stdcxx/trunk/util/charmap.cpp (original) +++ incubator/stdcxx/trunk/util/charmap.cpp Thu Sep 21 17:42:16 2006 @@ -45,7 +45,7 @@ #include // for LC_CTYPE, setlocale() #include #include -#include // for strerror() +#include // for strrchr(), strerror() #include #include @@ -234,66 +234,164 @@ } -#if 0 - -// convert the first character of a string to an unsigned char -unsigned char Charmap:: -convert_escape (const char 
*str, const char **pend /* = 0 */) const +// count the number of bytes in a multibyte sequence denoted +// by the argument by counting the number of escape characters +std::size_t Charmap::mbcharlen (const std::string &str) const { - assert (str != 0); - - if (!*str || *str != scanner_.escape_char ()) - issue_diag (E_CVT, true, &next, - "unable to convert character %s\n", str); + std::size_t count = 1; - long ch = 0; + const char escape = scanner_.escape_char (); - char *end; + for (std::size_t idx = 0; ; ++idx, ++count) { + idx = str.find (escape, idx); - switch (str [1]) { - case '%': ch = std::strtol (str + 2, &end, 8); break; - case 'd': ch = std::strtol (str + 2, &end, 10); break; - case 'x': ch = std::strtol (str + 2, &end, 16); break; - default: ch = -1; end = _RWSTD_CONST_CAST (char*, str) + 1; + if (std::string::npos == idx) + break; } - if (end == str + 2 || (ch < 0 || ch > UCHAR_MAX)) - issue_diag (E_IFMT, true, &next, - "unable to convert character %s\n", str); - - if (pend) - *pend = end; - - typedef unsigned char UChar; - - return UChar (ch); + return count; } -#endif +/**************************************************************************/ -// find the number of bytes in the multibyte string by counting the -// number of escape chars in the string -size_t Charmap::mbcharlen (const std::string &str) const -{ - std::size_t count = 0; - - std::size_t idx = str.find (scanner_.escape_char ()); - - for (; std::string::npos != idx; ++count) { - idx = str.find (scanner_.escape_char (), idx + 1); - } - - return count; -} - +const char* const Charmap:: +portable_charset[] = { + /* 0x00 */ "", + /* 0x01 SOH */ 0, + /* 0x02 STX */ 0, + /* 0x03 ETX */ 0, + /* 0x04 EOT */ 0, + /* 0x05 ENQ */ 0, + /* 0x06 ACK */ 0, + /* 0x07 BEL */ "", + /* 0x08 */ "", + /* 0x09 TAB */ "", + /* 0x0a */ "", + /* 0x0b */ "", + /* 0x0c */ "", + /* 0x0d */ "", + /* 0x0e SO */ 0, + /* 0x0f SI */ 0, + /* 0x10 DLE */ 0, + /* 0x11 DC1 */ 0, + /* 0x12 DC2 */ 0, + /* 0x13 DC3 */ 0, + /* 
0x14 DC4 */ 0, + /* 0x15 NAK */ 0, + /* 0x16 SYN */ 0, + /* 0x17 ETB */ 0, + /* 0x18 CAN */ 0, + /* 0x19 EM */ 0, + /* 0x1a SUB */ 0, + /* 0x1b ESC */ 0, + /* 0x1c IS4 */ 0, + /* 0x1d IS3 */ 0, + /* 0x1e IS2 */ 0, + /* 0x1f IS1 */ 0, + /* 0x20 SPC */ "", + /* 0x21 ! */ "", + /* 0x22 ' */ "", + /* 0x23 # */ "", + /* 0x24 $ */ "", + /* 0x25 % */ "", + /* 0x26 & */ "", + /* 0x27 ' */ "", + /* 0x28 ( */ "", + /* 0x29 ) */ "", + /* 0x2a * */ "", + /* 0x2b + */ "", + /* 0x2c , */ "", + /* 0x2d - */ "", // "", + /* 0x2e . */ "", // "", + /* 0x2f / */ "", // "", + /* 0x30 0 */ "", + /* 0x31 1 */ "", + /* 0x32 2 */ "", + /* 0x33 3 */ "", + /* 0x34 4 */ "", + /* 0x35 5 */ "", + /* 0x36 6 */ "", + /* 0x37 7 */ "", + /* 0x38 8 */ "", + /* 0x39 9 */ "", + /* 0x3a : */ "", + /* 0x3b ; */ "", + /* 0x3c < */ "", + /* 0x3d = */ "", + /* 0x3e > */ "", + /* 0x3f ? */ "", + /* 0x40 @ */ "", + /* 0x41 A */ "", + /* 0x42 B */ "", + /* 0x43 C */ "", + /* 0x44 D */ "", + /* 0x45 E */ "", + /* 0x46 F */ "", + /* 0x47 G */ "", + /* 0x48 H */ "", + /* 0x49 I */ "", + /* 0x4a J */ "", + /* 0x4b K */ "", + /* 0x4c L */ "", + /* 0x4d M */ "", + /* 0x4e N */ "", + /* 0x4f O */ "", + /* 0x50 P */ "

", + /* 0x71 q */ "", + /* 0x72 r */ "", + /* 0x73 s */ "", + /* 0x74 t */ "", + /* 0x75 u */ "", + /* 0x76 v */ "", + /* 0x77 w */ "", + /* 0x78 x */ "", + /* 0x79 y */ "", + /* 0x7a z */ "", + /* 0x7b { */ "", // "", + /* 0x7c | */ "", + /* 0x7d } */ "", // "", + /* 0x7e ~ */ "", + /* 0x7f */ 0 +}; // convert a string of narrow character into a wchar_t bool Charmap::convert_to_wc (const std::string& sym_name, const std::string& ext_enc, wchar_t& wc) { - #ifndef _RWSTD_NO_ISO_10646_WCHAR_T // the internal wchar_t representation for all characters @@ -302,17 +400,14 @@ #else // if defined _RWSTD_NO_ISO_10646_WCHAR_T - if (UCS4_internal_) { + if (UCS4_internal_ || Clocale_.empty ()) { - // translate the character to ISO-10646 (UCS) + // when using UCS as the internal encoding or for a locale + // that has no corresponding C library locale convert the + // character to ISO-10646 (UCS) return convert_to_ucs (sym_name, ext_enc, wc); } - // for a locale that has no corresponding C library locale - // convert the character to ISO-10646 (UCS) - if (Clocale_.empty ()) - return convert_to_ucs (sym_name, ext_enc, wc); - // otherwise use libc to convert the multi-byte character // to its wchar_t value if (-1 == std::mbtowc (&wc, ext_enc.c_str (), ext_enc.size ())) { @@ -332,6 +427,7 @@ return true; #endif // _RWSTD_NO_ISO_10646_WCHAR_T + } @@ -377,153 +473,198 @@ } -wchar_t Charmap::increment_val (const wchar_t val) const +wchar_t Charmap::increment_wchar (wchar_t val) const { #ifndef _RWSTD_NO_ISO_10646_WCHAR_T + // to increment a wchar_t value and keep the encoding all we have - // to do is increment the val because the internal encoding is utf8 + // to do is increment the val because the internal encoding is UCS return val + 1; #else // to increment a wchar_t value and keep the encoding we have to // convert the wchar_t to the external encoding, increment that // string value, and convert back to the internal representation - rn_cmap2_iter it = rn_cmap2_.find (val); + 
const rmb_cmap_iter it = rmb_cmap_.find (val); + + if (it != rmb_cmap_.end ()) { - if (it != rn_cmap2_.end ()) { + mb_cmap_iter ret; - n_cmap2_iter ret; + // multibyte character corresponding to the wchar_t value + std::string encoding = it->second; - std::string mb_str = it->second; // continue incrementing the multi-byte value until we get a valid // character. NOTE: this must be done for encodings such as SJIS where // \x7f in the last byte of a multibyte string is not a valid character // NOTE: this will not detect errors in the sequence, since the program // will continue until it finds a valid character do { - int last_elm = mb_str.size() - 1; + int last_elm = encoding.size () - 1; while (last_elm >= 0) { typedef unsigned char UChar; - const unsigned ic = UChar (mb_str [last_elm]) + 1; + const unsigned ic = UChar (encoding [last_elm]) + 1; // if incrementing the last element caused it to exceed - // 0xff increment the next higher byte if there is one - if (ic > 0xff) - mb_str [last_elm--] = 0; + // UCHAR_MAX increment the next higher byte if there is + // one + if (UCHAR_MAX < ic) + encoding [last_elm--] = '\0'; else { - mb_str [last_elm] = char (ic); + encoding [last_elm] = char (ic); break; } } if (last_elm < 0) - std::cerr << "cannot convert character\n"; - } while ((ret = n_cmap2_.find (mb_str)) == n_cmap2_.end ()); + return -1; // error + + } while ((ret = mb_cmap_.find (encoding)) == mb_cmap_.end ()); return ret->second; } - return -1; + return -1; // error #endif // _RWSTD_NO_ISO_10646_WCHAR_T + } -const char* Charmap::increment_strval (const char* str) + +bool Charmap:: +increment_encoding (std::string &encoding) { - static char s [64]; - static char sd [64]; - int i = 0; + // find the last escape character in the human readable representation + // of the encoding (i.e., in the multibyte character such as "/xf0/x80") + const std::string::size_type pos = + encoding.rfind (scanner_.escape_char ()); + + // the escape character must be there (guaranteed 
by the scanner) + assert (pos < encoding.size ()); + + const char* end = 0; + + // convert the last character in the multibyte character to a numeric + // value representing the last byte of the sequence + unsigned long last_byte = + scanner_.convert_escape (encoding.c_str () + pos, &end); + + // POSIX requires that the incremented value be non-NUL + if (UCHAR_MAX <= last_byte || *end) + return false; - char* ps = s; + // increment the last byte + ++last_byte; - // zero the first element of string s - *ps = 0; + // format the last byte in the same notation (octal, decimal, + // or hexadecimal escape sequence) + static const char xdigits[] = "0123456789ABCDEF"; - if (str == 0 || *str == 0) - return s; + char byte_str [5]; + char *pdig = byte_str; - for (i = 0; *str && i < 64; i++) { + switch (encoding [pos + 1]) { + case 'd': { // decimal escape + const unsigned hundreds = last_byte / 100; + const unsigned tens = (last_byte - hundreds) / 10; + const unsigned units = last_byte % 10; - *ps++ = scanner_.convert_escape (str, (const char**)&str); + *pdig++ = 'd'; + + if (hundreds) + *pdig++ = xdigits [hundreds]; + + *pdig++ = xdigits [tens]; + *pdig++ = xdigits [units]; + *pdig = '\0'; + break; } - *ps = 0; + case 'x': { // hex escape + const unsigned hi = last_byte >> 4; + const unsigned lo = last_byte & 0xfU; - // now attempt to increment the last character in the string if the - // character gets incremented above /xff then we increment the next char - if (ps == s) { - return s; + *pdig++ = 'x'; + *pdig++ = xdigits [hi]; + *pdig++ = xdigits [lo]; + *pdig = '\0'; + break; } + default: { // octal escape + const unsigned hi = last_byte >> 6; + const unsigned mid = (last_byte >> 3) & 07U; + const unsigned lo = last_byte & 07U; - for (ps--; ps >= s; ps--) - if ((unsigned char)(*ps) == 0xff) { - *ps = 0; - continue; - } else { - (*ps)++; - break; - } + if (hi) + *pdig++ = xdigits [hi]; - const char fmt [] = {scanner_.escape_char (), - 'x', '%', '0', '2', 'x', '\0'}; - ps = 
s; - for (i = 0; *ps; i += 4, ps++) - std::sprintf (&sd [i], fmt, *(unsigned char*)ps); + *pdig++ = xdigits [mid]; + *pdig++ = xdigits [lo]; + *pdig = '\0'; + } + } // switch + + // replace the last escape sequence with the new one + encoding.replace (pos + 1, std::string::npos, byte_str); - return sd; + return true; } -std::string Charmap::parse_ext_strval (const std::string &strval) const +std::string Charmap:: +encoding_to_mbchar (const std::string &encoding) const { - std::string ext_enc; + std::string mbchar; - for (const char *str = strval.c_str (); str && *str; ) - ext_enc += char (scanner_.convert_escape (str, &str)); + for (const char *pbyte = encoding.c_str (); pbyte && *pbyte; ) + mbchar += char (scanner_.convert_escape (pbyte, &pbyte)); - return ext_enc; + return mbchar; } // convert the locale's encoded character to UCS4 wchar_t -wchar_t Charmap::convert_sym_to_ucs (const std::string& s) const +wchar_t Charmap:: +convert_sym_to_ucs (const std::string &sym) const { - std::string::const_iterator it (s.begin ()); + std::string::const_iterator it (sym.begin ()); - if (s.size () < 4 || *it != '<' || *++it != 'U') { - issue_diag (E_UCS, true, 0, - "Attempt to convert symbolic name to UCS value failed. " - "Name %s not in form.\n", s.c_str ()); + if ( sym.size () < 4 || *it != '<' || *++it != 'U' + || !isxdigit (*++it)) { + issue_diag (E_UCS, true, 0, + "Unable to convert symbolic name %s to UCS.\n", + sym.c_str ()); } - long w = std::strtol (&*++it, (char**)0, 16); - if (w == _RWSTD_LONG_MIN || w == _RWSTD_LONG_MAX || - w > _RWSTD_WCHAR_T_MAX ) + const unsigned long val = std::strtoul (&*++it, (char**)0, 16); + + if (_RWSTD_WCHAR_T_MAX <= val) issue_diag (E_UCS, true, 0, - "Attempt to convert symbolic name to UCS value failed. 
" - "Value of %s out of range.\n", s.c_str ()); + "UCS value %lu of symbolic character %s out of range.\n", + val, sym.c_str ()); - return wchar_t (w); + return wchar_t (val); } + // convert the locale's encoded character to UCS4/UCS2 wchar_t bool Charmap::convert_to_ucs (const std::string &sym_name, - const std::string &ext_enc, wchar_t& wc) + const std::string &encoding, wchar_t& wc) { #ifndef _MSC_VER if (in_utf8_) { - wc = utf8_decode (ext_enc.c_str (), &*(ext_enc.end () - 1)); + wc = utf8_decode (encoding.c_str (), &*(encoding.end () - 1)); return true; } char utf8_enc [_RWSTD_MB_LEN_MAX + 1]; const char* const ch_end = - convert_to_utf8 (ext_enc.c_str (), ext_enc.size (), + convert_to_utf8 (encoding.c_str (), encoding.size (), utf8_enc, sizeof utf8_enc); if (ch_end) // only if conversion to utf8 succeeded @@ -538,7 +679,7 @@ if (0 != codepage_) { wchar_t ret[2] = {0}; - MultiByteToWideChar (codepage_, 0, ext_enc.c_str(), -1, ret, 2); + MultiByteToWideChar (codepage_, 0, encoding.c_str(), -1, ret, 2); if (ret[1] != 0) return false; @@ -556,16 +697,27 @@ void Charmap::add_to_cmaps (const std::string &sym_name, - const std::string &strval) + const std::string &encoding, + bool is_mbchar /* = false */) { - strval_map_.insert (std::make_pair (strval, sym_name)); + // compute the external (multibyte) encoding of the character + // if necessary (i.e., unless already done by the caller) + const std::string mbchar = + is_mbchar ? 
encoding : encoding_to_mbchar (encoding); + + symnames_list_.push_back (sym_name); - if (mbcharlen (strval) == 1) { + if (1 == mbchar.size ()) { + // strval is a single-byte character - const unsigned char ch = scanner_.convert_escape (strval.c_str ()); + const unsigned char ch = mbchar [0]; - if (forward_maps) + // add the wide character and its symbolic name to the narrow + // character maps + if (forward_maps) { + // the locale utility doesn't need reverse maps n_cmap_.insert (std::make_pair (sym_name, ch)); + } if (reverse_maps) rn_cmap_.insert (std::make_pair (ch, sym_name)); @@ -574,176 +726,204 @@ largest_nchar_ = ch; } - const std::string ext_enc = parse_ext_strval (strval); - - wchar_t ch; + // (try to) compute the wide character value of the character + wchar_t wch; - if (convert_to_wc (sym_name, ext_enc, ch)) { + if (convert_to_wc (sym_name, mbchar, wch)) { - if (forward_maps) - w_cmap_.insert (std::make_pair (sym_name, ch)); + // add the wide character and its symbolic name to the wide + // character maps + if (forward_maps) { + // the locale utility doesn't need forward maps + w_cmap_.insert (std::make_pair (sym_name, wch)); + } if (reverse_maps) - rw_cmap_.insert (std::make_pair (ch, sym_name)); - - std::string n_strval = parse_ext_strval (strval); - - n_cmap2_.insert (std::make_pair (n_strval, ch)); - rn_cmap2_.insert (std::make_pair (ch, n_strval)); - - assert (n_strval.size () != 0); + rw_cmap_.insert (std::make_pair (wch, sym_name)); - for (std::string::size_type i = n_strval.size (); --i; ) - valid_mb_set_.insert (n_strval.substr (0, i)); + // add the corresponding multibyte character to the multibyte + // character maps + mb_cmap_.insert (std::make_pair (mbchar, wch)); + rmb_cmap_.insert (std::make_pair (wch, mbchar)); } - wchar_t ucs_val; + // compute the UCS value of the character + wchar_t uch; - if (convert_to_ucs (sym_name, ext_enc, ucs_val)) { - ucs4_cmap_.insert (std::make_pair (sym_name, ucs_val)); - rucs4_cmap_.insert (std::make_pair 
(ucs_val, sym_name)); + if (convert_to_ucs (sym_name, mbchar, uch)) { + + // add UCS character and its symbolic name to the UCS + // character maps + ucs4_cmap_.insert (std::make_pair (sym_name, uch)); + rucs4_cmap_.insert (std::make_pair (uch, sym_name)); } } // process the characters implicitly defined by using ellipsis between // two explicitly defined characters -void Charmap:: +std::size_t Charmap:: process_ellipsis (const Scanner::token_t &beg_tok, int num_ellipsis) { + // get the upper end of the range denoted by the ellipsis const Scanner::token_t end_tok = scanner_.next_token (); - std::string strval = scanner_.next_token ().name; - // first add the beg_tok symbol name to the maps - add_to_cmaps (beg_tok.name, strval); + // get the human readabale encoding of the character + // denoted by the lower end of the ellipsis + const std::string encoding = scanner_.next_token ().name; + + // convert the encoding to a multibyte character + std::string mbchar = encoding_to_mbchar (encoding); + + // add the beg_tok symbol name to the maps + add_to_cmaps (beg_tok.name, mbchar, true); - // seperate the numeric portion of the symbolic name from the - // character portion in order to dynamically create symbolic - // names with increasing numeric values - std::string begin; - size_t idx = 0; + // extract the numeric portion of the symbolic character name + // denoted by the lower end of the ellipsis + std::size_t idx = 0; int base; // numeric base const char *fmat; // sprintf() format specifier + const std::size_t beg_len = beg_tok.name.size (); + // determine the value of the beginning of the range // denoted by the ellipsis - if (num_ellipsis == 2) { + if (2 == num_ellipsis) { base = 16; - fmat = "%s%0*lX>"; + fmat = "%.*s%0*lX>"; // advance to the first hex digit - while ( idx < beg_tok.name.size () - && !(std::isxdigit)(beg_tok.name [idx])) - begin += beg_tok.name [idx++]; + while (idx < beg_len && !(std::isxdigit)(beg_tok.name [idx])) + ++idx; } else { base = 10; - 
fmat = "%s%0*ld>"; + fmat = "%.*s%0*ld>"; // advance to the first decimal digit - while ( idx < beg_tok.name.size () - && !(std::isdigit)(beg_tok.name [idx])) - begin += beg_tok.name [idx++]; + while (idx < beg_len && !(std::isdigit)(beg_tok.name [idx])) + ++idx; } - std::string num_str; // the numeric portion of the sym name - - // get the numeric portion of the sym_name, this is the portion - // that will be different for each sym_name within the ellipsis - while (idx < beg_tok.name.size () && beg_tok.name [idx] != '>') - num_str += beg_tok.name [idx++]; - - const int num_size = int (num_str.size ()); - - // convert the num_str to a long - unsigned long start_num = std::strtoul (num_str.c_str(), (char**)0, base); + // length of non-numeric prefix of the symbolic character name + const std::size_t pfx_len = idx; - // increment the start_num once because we already added the - // beg_tok symbol name to the cmaps - ++start_num; + // get the character value plus one (since the first value + // has already been added to the map earlier) + char *num_end; + const unsigned long beg_val = + 1 + std::strtoul (beg_tok.name.c_str () + pfx_len, &num_end, base); + + // the length of the numeric portion + const std::size_t num_size = + num_end - (beg_tok.name.c_str () + pfx_len); // find the end of the range denoted by the ellipsis idx = 0; - if (num_ellipsis == 2) { + const std::size_t end_len = end_tok.name.size (); + + if (2 == num_ellipsis) { // advance to the next hex digit - while ( idx < end_tok.name.size () - && !(std::isxdigit)(end_tok.name [idx])) + while (idx < end_len && !(std::isxdigit)(end_tok.name [idx])) ++idx; } else { // advance to the next dec digit - while ( idx < end_tok.name.size () - && (std::isdigit)(end_tok.name [idx])) + while (idx < end_len && (std::isdigit)(end_tok.name [idx])) ++idx; } - num_str.clear (); - - // advance to the closing bracket ('>') - while (idx < end_tok.name.size() && end_tok.name[idx] != '>') - num_str += end_tok.name [idx++]; - - 
unsigned long end_num = std::strtoul (num_str.c_str(), (char**)0, base); + const unsigned long end_val = + std::strtoul (end_tok.name.c_str () + idx, (char**)0, base); - // the ending numeric value should be greater then the start numeric value - if (end_num < start_num) + // the ending numeric value must be greater than or equal + // to the beginning numeric value + if (end_val < beg_val) issue_diag (E_RANGE, true, &end_tok, "invalid range found in character map file\n"); char next_name [MAX_SYM_NAME_LEN]; - for (; start_num <= end_num; ++start_num) { + std::size_t nchars = 0; - std::sprintf (next_name, fmat, begin.c_str (), num_size, start_num); - - // increment the string value to the next encoded character value - strval = increment_strval (strval.c_str ()); + const char* const pfx = beg_tok.name.c_str (); + + for (unsigned long val = beg_val; val <= end_val; ++val, ++nchars) { - // and finally add the generated name and string value to the maps - add_to_cmaps (next_name, strval); + std::sprintf (next_name, fmat, pfx_len, pfx, num_size, val); + + // increment the last byte of the multibyte character + // and if the result is valid (i.e., doesn't contain + // an embedded NUL) add the generated name and the + // multibyte character to the maps + const unsigned char last_byte = mbchar [mbchar.size () - 1]; + if (last_byte < UCHAR_MAX) { + mbchar [mbchar.size () - 1] = last_byte + 1; + add_to_cmaps (next_name, mbchar, true); + } + else { + // an ellipsis must not specify a range that includes + // an encoding with an embedded NUL + issue_diag (E_RANGE, true, &beg_tok, + "encoding of an element in range contains NUL\n"); + } } + + // return the number of characters denoted by the ellipsis + return nchars; } // process all the characters in the character map file. 
void Charmap::process_chars() { + issue_diag (I_STAGE, false, 0, "processing CHARMAP section\n"); + + std::size_t ntokens = 0; + std::size_t nellips = 0; + std::size_t nchars = 0; + next = scanner_.next_token(); Scanner::token_t nextnext; // loop until we find the closing charmap token - while (next.token != Scanner::tok_charmap) { + for ( ; next.token != Scanner::tok_charmap; ++ntokens) { - switch (next.token){ + switch (next.token) { case Scanner::tok_nl: case Scanner::tok_end: break; case Scanner::tok_sym_name: - // the next token may be either ellipsis if this line of the - // charmap is in the form: + // the next token may be either ellipsis if this line + // of the charmap is in the form: // "%s...%s %s\n", , , // or an encoding if this line is in the format: // "%s %s\n", , - nextnext = scanner_.next_token(); + nextnext = scanner_.next_token (); + ntokens += 3; switch (nextnext.token) { - case Scanner::tok_ellipsis: - process_ellipsis (next, 3); + case Scanner::tok_abs_ellipsis: + // absolute ellipsis (see ISO/IEC TR 14652) + nchars += process_ellipsis (next, 3); + ++nellips; break; - case Scanner::tok_dellipsis: - process_ellipsis (next, 2); + case Scanner::tok_hex_ellipsis: + // hexadecimal symbolic ellipsis (see ISO/IEC TR 14652) + nchars += process_ellipsis (next, 2); + ++nellips; break; - case Scanner::tok_decimal_value: - case Scanner::tok_hex_value: - case Scanner::tok_octal_value: + case Scanner::tok_char_value: + // character represented as a numeric constant add_to_cmaps (next.name, nextnext.name); + ++nchars; break; default: @@ -752,7 +932,7 @@ "name in character map file\n"); } - scanner_.ignore_line(); + scanner_.ignore_line (); break; default: @@ -762,9 +942,13 @@ } next = scanner_.next_token(); - } - + + issue_diag (I_STAGE, false, 0, + "done processing CHARMAP section (%lu tokens, " + "%lu ellipses, %lu characters)\n", + ntokens, nellips, nchars); + // make sure that all characters in the portable character set // are in the charmap if 
(forward_maps) @@ -772,37 +956,20 @@ } -void Charmap::verify_portable_charset() +void Charmap::verify_portable_charset () const { - static const char* const charset[] = { - "","", "", "", "", - "", "", "", "", - "", "", "", - "", "", "", "", - "", "", "", - "", "", "", "", "", - "", "", "", "", "", "", - "", "", "", "", "", "", - "", "", "", "", - "", "", "", - "", "", "", "", "", "", "", "", - "", "", "", "", "","", "", "", "

","", - "", "", "", "", "", "", "", "", "", - "", "", "", - "", "", "", - "", "", "", - "", "", "", "", "", "", "", - "", "", "", "", "","", "", "", "

","", - "", "", "", "", "", "", "", "", "", - "", "", "", - "", "", "" - }; + const std::size_t nchars = + sizeof portable_charset / sizeof *portable_charset; + + for (std::size_t i = 0; i < nchars; ++i) { + if (0 == portable_charset [i]) + continue; - for (std::size_t i = 0; i < sizeof charset / sizeof (char*); ++i) - if (n_cmap_.find (charset [i]) == n_cmap_.end ()) + if (n_cmap_.find (portable_charset [i]) == n_cmap_.end ()) issue_diag (W_NOPCS, false, 0, "member of portable character set %s not found " - "in the character map\n", charset[i]); + "in the character map\n", portable_charset [i]); + } } Modified: incubator/stdcxx/trunk/util/charmap.h URL: http://svn.apache.org/viewvc/incubator/stdcxx/trunk/util/charmap.h?view=diff&rev=448754&r1=448753&r2=448754 ============================================================================== --- incubator/stdcxx/trunk/util/charmap.h (original) +++ incubator/stdcxx/trunk/util/charmap.h Thu Sep 21 17:42:16 2006 @@ -2,20 +2,27 @@ * * charmap.h * - * $Id: //stdlib/dev/source/stdlib/util/charmap.h#41 $ + * $Id$ * *************************************************************************** * - * Copyright (c) 1994-2005 Quovadx, Inc., acting through its Rogue Wave - * Software division. Licensed under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance with the - * License. You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0. Unless required by - * applicable law or agreed to in writing, software distributed under - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License - * for the specific language governing permissions and limitations under - * the License. + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * Copyright 2001-2006 Rogue Wave Software. * **************************************************************************/ @@ -24,6 +31,7 @@ #define _RWSTD_LOC_CHARMAP_H_INCLUDED #include +#include #include #include @@ -36,6 +44,8 @@ class Charmap { public: + static const char* const portable_charset []; + Charmap(const char* /*corresponding C library locale*/, const char* /*filename*/, bool /*is utf8 encoding?*/, @@ -43,37 +53,45 @@ bool /*create_reverse_charmaps*/, bool /*use UCS4 internally*/); - // get the narrow character map - const std::map& get_n_cmap() const { + // returns the narrow character map which maps a symbolic character + // name to its narrow character value + const std::map& get_n_cmap() const { return n_cmap_; } - // get the reverse narrow character map - const std::map& get_rn_cmap() const { + // returns the reverse narrow character map which maps a narrow + // character value to its symbolic name + const std::map& get_rn_cmap() const { return rn_cmap_; } - // get the wide character map - const std::map& get_w_cmap() const { + // returns the wide character map which maps a symbolic character + // name to its wide character value + const std::map& get_w_cmap() const { return w_cmap_; } - // get the reverse wide character map - const std::map& 
get_rw_cmap() const { + // returns the reverse wide character map which maps a wide + // character value to its symbolic name + const std::map& get_rw_cmap() const { return rw_cmap_; } - const std::map& get_n_cmap2() const { - return n_cmap2_; + // returns the multibyte character map which maps a multibyte + // character to its corresponding wide character value + const std::map& get_mb_cmap() const { + return mb_cmap_; } - const std::map& get_rn_cmap2() const { - return rn_cmap2_; + // returns the reverse multibyte character map which maps a wide + // character value to its corresponding multibyte character + const std::map& get_rmb_cmap() const { + return rmb_cmap_; } // get the string value map - const std::map& get_strval_map() const { - return strval_map_; + const std::list& get_symnames_list() const { + return symnames_list_; } const std::map & get_ucs4_cmap () const { @@ -84,14 +102,6 @@ return rucs4_cmap_; } - const std::set& get_valid_mb_set() const { - return valid_mb_set_; - } - - const std::set& get_valid_wchar_set() const { - return valid_wchar_set_; - } - // return the value of mb_cur_max int get_mb_cur_max() const { return mb_cur_max_; @@ -128,32 +138,36 @@ unsigned char get_largest_nchar () const; - // increment the wide character value to the next encoded character in - // this codeset - wchar_t increment_val (const wchar_t) const; + // increments the wide character value to the next encoded character + // in the current codeset; returns the incremented value or -1 on + // error + wchar_t increment_wchar (wchar_t) const; private: - // process the characters implicitly defined by using ellipsis between - // two explicitly defined characters - void process_ellipsis (const Scanner::token_t&, int); + // processes characters implicitly defined by an ellipsis denoted + // by two explicitly defined characters; returns the number of + // characters in the range, -1 on error + std::size_t process_ellipsis (const Scanner::token_t&, int); // process the 
charmap file making the necessary mappings in the cmaps void process_chars(); - // increment the multi-byte string by 1. - const char* increment_strval (const char*); + // increment the encoded multi byte character argument + bool increment_encoding (std::string&); - // make sure that all the characters in the portable character set are - // defined in the character map - void verify_portable_charset (); + // verify that all the characters in the portable character set + // are defined in the character map + void verify_portable_charset () const; #ifndef _MSC_VER // open the iconv descriptor to convert to utf8 iconv_t open_iconv_to_utf8 () const; #endif // _MSC_VER - std::string parse_ext_strval (const std::string&) const; + // convert a human-readable encoding of a character + // to its raw multibyte character representation + std::string encoding_to_mbchar (const std::string&) const; // convert a multi-byte string to a utf8 multi-byte string char* convert_to_utf8 (const char *inbuf, std::size_t inbuf_s, @@ -167,8 +181,11 @@ # endif // _RWSTD_NO_ISO_10646_WCHAR_T #endif // _MSC_VER - // add the sym_name and multi-byte character to the character maps - void add_to_cmaps (const std::string&, const std::string&); + // add the symbolic name of a character and the raw multibyte + // character corresponding to it to the character maps + void add_to_cmaps (const std::string&, + const std::string&, + bool = false); // the scanner used to process the charmap file Scanner scanner_; @@ -181,16 +198,18 @@ #endif // _MSC_VER // n_cmap maps the symbolic name to a narrow character value - // rn_cmap does exactly the opposite + // rn_cmap does the opposite std::map n_cmap_; std::map rn_cmap_; - std::map n_cmap2_; - std::map rn_cmap2_; - typedef std::map ::const_iterator rn_cmap2_iter; - typedef std::map ::const_iterator n_cmap2_iter; - std::set valid_mb_set_; - std::set valid_wchar_set_; + // mb_cmap maps a multibyte character representation to its + // corresponding wide 
character value + // rmb_cmap does the opposite + std::map mb_cmap_; + std::map rmb_cmap_; + + typedef std::map ::const_iterator rmb_cmap_iter; + typedef std::map ::const_iterator mb_cmap_iter; // w_cmap maps the symbolic name to a wide character value // rw_cmap does exactly the opposite @@ -232,11 +251,8 @@ // should we use UCS4 as the internal representation bool UCS4_internal_; - // maps the string value to the symbolic name - // this map is required for the UNDEFINED keyword - // in localedef where the elements must be added in - // increasing encoded order. - std::map strval_map_; + // list of all known symbolic character names + std::list symnames_list_; Scanner::token_t next; };

", + /* 0x51 Q */ "", + /* 0x52 R */ "", + /* 0x53 S */ "", + /* 0x54 T */ "", + /* 0x55 U */ "", + /* 0x56 V */ "", + /* 0x57 W */ "", + /* 0x58 X */ "", + /* 0x59 Y */ "", + /* 0x5a Z */ "", + /* 0x5b [ */ "", + /* 0x5c \ */ "", // "", + /* 0x5d ] */ "", + /* 0x5e ^ */ "", // "", + /* 0x5f _ */ "", // "", + /* 0x60 ` */ "", + /* 0x61 a */ "", + /* 0x62 b */ "", + /* 0x63 c */ "", + /* 0x64 d */ "", + /* 0x65 e */ "", + /* 0x66 f */ "", + /* 0x67 g */ "", + /* 0x68 h */ "", + /* 0x69 i */ "", + /* 0x6a j */ "", + /* 0x6b k */ "", + /* 0x6c l */ "", + /* 0x6d m */ "", + /* 0x6e n */ "", + /* 0x6f o */ "", + /* 0x70 p */ "