From 22ddada2916ac8b3d24518b1a37072044f35ff6e Mon Sep 17 00:00:00 2001 From: Daniel Scharrer Date: Sun, 8 Jun 2014 07:48:51 +0200 Subject: [PATCH] Move to_utf8() and helpers to their own source file --- CMakeLists.txt | 2 + src/setup/header.cpp | 1 + src/setup/message.cpp | 1 + src/util/encoding.cpp | 269 ++++++++++++++++++++++++++++++++++++++++++ src/util/encoding.hpp | 49 ++++++++ src/util/load.cpp | 241 ------------------------------------- src/util/load.hpp | 13 +- 7 files changed, 323 insertions(+), 253 deletions(-) create mode 100644 src/util/encoding.cpp create mode 100644 src/util/encoding.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 135d9d1..4f95b7d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,6 +288,8 @@ set(INNOEXTRACT_SOURCES src/util/boostfs_compat.hpp src/util/console.hpp src/util/console.cpp + src/util/encoding.hpp + src/util/encoding.cpp src/util/endian.hpp src/util/enum.hpp src/util/flags.hpp diff --git a/src/setup/header.cpp b/src/setup/header.cpp index d2daa53..b16f00f 100644 --- a/src/setup/header.cpp +++ b/src/setup/header.cpp @@ -26,6 +26,7 @@ #include #include "setup/version.hpp" +#include "util/encoding.hpp" #include "util/load.hpp" #include "util/storedenum.hpp" diff --git a/src/setup/message.cpp b/src/setup/message.cpp index 109d7d6..8f1b692 100644 --- a/src/setup/message.cpp +++ b/src/setup/message.cpp @@ -24,6 +24,7 @@ #include "setup/language.hpp" #include "setup/version.hpp" +#include "util/encoding.hpp" #include "util/load.hpp" namespace setup { diff --git a/src/util/encoding.cpp b/src/util/encoding.cpp new file mode 100644 index 0000000..05aaf52 --- /dev/null +++ b/src/util/encoding.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2011-2013 Daniel Scharrer + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author(s) be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#include "util/encoding.hpp" + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "util/log.hpp" +#include "util/math.hpp" + +namespace util { + +namespace { + +static const codepage_id cp_utf8 = 65001; +static const codepage_id cp_ascii = 20127; +static const char replacement_char = '_'; + +typedef boost::unordered_map converter_map; +converter_map converters; + +static size_t get_encoding_size(codepage_id codepage) { + switch(codepage) { + case 1200: return 2u; // UTF-16LE + case 1201: return 2u; // UTF-16BE + case 12000: return 4u; // UTF-32LE + case 12001: return 4u; // UTF-32BE + default: return 1u; + } +} + +//! Get names for encodings where iconv doesn't have the codepage alias +static const char * get_encoding_name(codepage_id codepage) { + switch(codepage) { + case 708: return "ISO-8859-6"; + case 936: return "GBK"; + case 949: return "UHC"; + case 950: return "BIG5"; + // iconv's behavior for "UTF-16" is platform-dependent if there is no BOM. + // There never is any BOM in Inno Setup files and it's always little-endian, + // so we specify the exact encoding. 
+ case 1200: return "UTF-16LE"; + case 1201: return "UTF-16BE"; + case 1252: return "MS-ANSI"; + case 1361: return "JOHAB"; + case 10000: return "MACINTOSH"; + case 10002: return "BIG5"; + case 10008: return "GB2312"; + case 12000: return "UTF-32LE"; + case 12001: return "UTF-32BE"; + case 20003: return "IBM5550"; + case 20127: return "US-ASCII"; + case 20261: return "T.61"; + case 20269: return "ISO_6937"; + case 20273: return "IBM273"; + case 20277: return "IBM277"; + case 20278: return "IBM278"; + case 20280: return "IBM280"; + case 20284: return "IBM284"; + case 20285: return "IBM285"; + case 20290: return "IBM290"; + case 20297: return "IBM297"; + case 20420: return "IBM420"; + case 20423: return "IBM423"; + case 20424: return "IBM424"; + case 20866: return "KOI8-R"; + case 20871: return "IBM871"; + case 20880: return "IBM880"; + case 20905: return "IBM905"; + case 20924: return "IBM1047"; + case 20932: return "EUC-JP-MS"; + case 20936: return "EUC-CN"; + case 21025: return "IBM1025"; + case 21866: return "KOI8-U"; + case 28591: return "ISO-8859-1"; + case 28592: return "ISO-8859-2"; + case 28593: return "ISO-8859-3"; + case 28594: return "ISO-8859-4"; + case 28595: return "ISO-8859-5"; + case 28596: return "ISO-8859-6"; + case 28597: return "ISO-8859-7"; + case 28598: return "ISO-8859-8"; + case 28599: return "ISO-8859-9"; + case 28603: return "ISO-8859-13"; + case 28605: return "ISO-8859-15"; + case 38598: return "ISO-8859-8"; + case 50220: return "ISO-2022-JP"; + case 50221: return "ISO-2022-JP-2"; + case 50222: return "ISO-2022-JP-3"; + case 50225: return "ISO-2022-KR"; + case 50227: return "ISO-2022-CN"; + case 50229: return "ISO-2022-CN-EXT"; + case 50930: return "EBCDIC-JP-E"; + case 51932: return "EUC-JP"; + case 51936: return "EUC-CN"; + case 51949: return "EUC-KR"; + case 51950: return "EUC-CN"; + case 54936: return "GB18030"; + case 65000: return "UTF-7"; + case 65001: return "UTF-8"; + default: return NULL; + } +} + +static iconv_t 
get_converter(codepage_id codepage) { + + // Try to reuse an existing converter if possible + converter_map::const_iterator i = converters.find(codepage); + if(i != converters.end()) { + return i->second; + } + + iconv_t handle = iconv_t(-1); + + const char * encoding = get_encoding_name(codepage); + if(encoding) { + handle = iconv_open("UTF-8", encoding); + } + + // Otherwise, try a few different codepage name prefixes + if(handle == iconv_t(-1)) { + const char * prefixes[] = { "MSCP", "CP", "WINDOWS-", "MS", "IBM", "IBM-", "" }; + BOOST_FOREACH(const char * prefix, prefixes) { + std::ostringstream oss; + oss << prefix << std::setfill('0') << std::setw(3) << codepage; + handle = iconv_open("UTF-8", oss.str().c_str()); + if(handle != iconv_t(-1)) { + break; + } + } + } + + if(handle == iconv_t(-1)) { + log_warning << "could not get codepage " << codepage << " -> UTF-8 converter"; + } + + return converters[codepage] = handle; +} + +//! Fallback conversion that will at least work for ASCII characters +static void to_utf8_fallback(const std::string & from, std::string & to, + codepage_id codepage) { + + size_t skip = get_encoding_size(codepage); + + to.clear(); + to.reserve(ceildiv(from.size(), skip)); + + for(size_t i = 0; i < from.size(); i += skip) { + if((unsigned char)from[i] <= 127) { + // copy ASCII characters + to.push_back(from[i]); + } else { + // replace everything else with underscores + to.push_back(replacement_char); + } + } +} + +} // anonymous namespace + +void to_utf8(const std::string & from, std::string & to, codepage_id codepage) { + + if(codepage == cp_utf8 || codepage == cp_ascii) { + // copy UTF-8 directly + to = from; + return; + } + + iconv_t converter = get_converter(codepage); + if(converter == iconv_t(-1)) { + to_utf8_fallback(from, to, codepage); + return; + } + + + /* + * Some iconv implementations declare the second parameter of iconv() as + * const char **, others as char **. + * Use this little hack to compile with both variants. 
+ */ + struct inbuf_ { + const char * buf; + explicit inbuf_(const char * data) : buf(data) { } + operator const char **() { return &buf; } + operator char **() { return const_cast(&buf); } + } inbuf(from.data()); + + size_t insize = from.size(); + + size_t outbase = 0; + + if(!insize) { + to.clear(); + return; + } + + iconv(converter, NULL, NULL, NULL, NULL); + + size_t skip = get_encoding_size(codepage); + + bool warn = false; + + while(insize) { + + to.resize(outbase + ceildiv(insize, skip) + 4); + + char * outbuf = &to[0] + outbase; + size_t outsize = to.size() - outbase; + + size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize); + if(ret == size_t(-1)) { + if(errno == E2BIG) { + // not enough output space - we'll allocate more in the next loop + } else if(/*errno == EILSEQ &&*/ insize >= 2 && insize >= skip) { + // invalid byte (sequence) - add a replacement char and skip one code unit; at least skip bytes must remain so that insize -= skip cannot wrap around + if(outsize == 0) { + to.push_back(replacement_char); + } else { + *outbuf = replacement_char; + outsize--; + } + inbuf.buf += skip; + insize -= skip; + warn = true; + } else { + // something else went wrong - return what we have so far + insize = 0; + warn = true; + } + } + + outbase = to.size() - outsize; + } + + if(warn) { + log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8"; + } + + to.resize(outbase); +} + +} // namespace util diff --git a/src/util/encoding.hpp b/src/util/encoding.hpp new file mode 100644 index 0000000..8ede097 --- /dev/null +++ b/src/util/encoding.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2011-2013 Daniel Scharrer + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author(s) be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. 
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/*! + * \file + * + * Utility function to convert strings to UTF-8. + */ +#ifndef INNOEXTRACT_UTIL_ENCODING_HPP +#define INNOEXTRACT_UTIL_ENCODING_HPP + +#include + +#include + +namespace util { + +typedef boost::uint32_t codepage_id; + +/*! + * Convert a string to UTF-8 from a specified encoding. + * \param from The input string to convert. + * \param to The output for the converted string. + * \param codepage The Windows codepage number for the input string encoding. + * + * \note This function is not thread-safe. + */ +void to_utf8(const std::string & from, std::string & to, codepage_id codepage = 1252); + +} // namespace util + +#endif // INNOEXTRACT_UTIL_ENCODING_HPP diff --git a/src/util/load.cpp b/src/util/load.cpp index 1548f6e..b787b2e 100644 --- a/src/util/load.cpp +++ b/src/util/load.cpp @@ -20,172 +20,12 @@ #include "util/load.hpp" -#include -#include #include -#include -#include -#include - -#include #include -#include - -#include "util/log.hpp" -#include "util/math.hpp" namespace util { -namespace { - -static const codepage_id cp_utf8 = 65001; -static const codepage_id cp_ascii = 20127; -static const char replacement_char = '_'; - -typedef boost::unordered_map converter_map; -converter_map converters; - -static size_t get_encoding_size(codepage_id codepage) { - switch(codepage) { - case 1200: return 2u; // UTF-16LE - case 1201: return 2u; // UTF-16BE - case 12000: return 4u; // UTF-32LE - case 12001: return 4u; // UTF-32BE - default: return 1u; - } -} - -//! 
Get names for encodings where iconv doesn't have the codepage alias -static const char * get_encoding_name(codepage_id codepage) { - switch(codepage) { - case 708: return "ISO-8859-6"; - case 936: return "GBK"; - case 949: return "UHC"; - case 950: return "BIG5"; - // iconv's behavior for "UTF-16" is platform-dependent if there is no BOM. - // There never is any BOM in Inno Setup files and it's always little-endian, - // so we specify the exact encoding. - case 1200: return "UTF-16LE"; - case 1201: return "UTF-16BE"; - case 1252: return "MS-ANSI"; - case 1361: return "JOHAB"; - case 10000: return "MACINTOSH"; - case 10002: return "BIG5"; - case 10008: return "GB2312"; - case 12000: return "UTF-32LE"; - case 12001: return "UTF-32BE"; - case 20003: return "IBM5550"; - case 20127: return "US-ASCII"; - case 20261: return "T.61"; - case 20269: return "ISO_6937"; - case 20273: return "IBM273"; - case 20277: return "IBM277"; - case 20278: return "IBM278"; - case 20280: return "IBM280"; - case 20284: return "IBM284"; - case 20285: return "IBM285"; - case 20290: return "IBM290"; - case 20297: return "IBM297"; - case 20420: return "IBM420"; - case 20423: return "IBM423"; - case 20424: return "IBM424"; - case 20866: return "KOI8-R"; - case 20871: return "IBM871"; - case 20880: return "IBM880"; - case 20905: return "IBM905"; - case 20924: return "IBM1047"; - case 20932: return "EUC-JP-MS"; - case 20936: return "EUC-CN"; - case 21025: return "IBM1025"; - case 21866: return "KOI8-U"; - case 28591: return "ISO-8859-1"; - case 28592: return "ISO-8859-2"; - case 28593: return "ISO-8859-3"; - case 28594: return "ISO-8859-4"; - case 28595: return "ISO-8859-5"; - case 28596: return "ISO-8859-6"; - case 28597: return "ISO-8859-7"; - case 28598: return "ISO-8859-8"; - case 28599: return "ISO-8859-9"; - case 28603: return "ISO-8859-13"; - case 28605: return "ISO-8859-15"; - case 38598: return "ISO-8859-8"; - case 50220: return "ISO-2022-JP"; - case 50221: return "ISO-2022-JP-2"; - case 
50222: return "ISO-2022-JP-3"; - case 50225: return "ISO-2022-KR"; - case 50227: return "ISO-2022-CN"; - case 50229: return "ISO-2022-CN-EXT"; - case 50930: return "EBCDIC-JP-E"; - case 51932: return "EUC-JP"; - case 51936: return "EUC-CN"; - case 51949: return "EUC-KR"; - case 51950: return "EUC-CN"; - case 54936: return "GB18030"; - case 65000: return "UTF-7"; - case 65001: return "UTF-8"; - default: return NULL; - } -} - -static iconv_t get_converter(codepage_id codepage) { - - // Try to reuse an existing converter if possible - converter_map::const_iterator i = converters.find(codepage); - if(i != converters.end()) { - return i->second; - } - - iconv_t handle = iconv_t(-1); - - const char * encoding = get_encoding_name(codepage); - if(encoding) { - handle = iconv_open("UTF-8", encoding); - } - - // Otherwise, try a few different codepage name prefixes - if(handle == iconv_t(-1)) { - const char * prefixes[] = { "MSCP", "CP", "WINDOWS-", "MS", "IBM", "IBM-", "" }; - BOOST_FOREACH(const char * prefix, prefixes) { - std::ostringstream oss; - oss << prefix << std::setfill('0') << std::setw(3) << codepage; - handle = iconv_open("UTF-8", oss.str().c_str()); - if(handle != iconv_t(-1)) { - break; - } - } - } - - if(handle == iconv_t(-1)) { - log_warning << "could not get codepage " << codepage << " -> UTF-8 converter"; - } - - return converters[codepage] = handle; -} - -//! 
Fallback conversion that will at least work for ASCII characters -static void to_utf8_fallback(const std::string & from, std::string & to, - codepage_id codepage) { - - size_t skip = get_encoding_size(codepage); - - to.clear(); - to.reserve(ceildiv(from.size(), skip)); - - for(size_t i = 0; i < from.size(); i += skip) { - if((unsigned char)from[i] <= 127) { - // copy ASCII characters - to.push_back(from[i]); - } else { - // replace everything else with underscores - to.push_back(replacement_char); - } - } -} - -} // anonymous namespace - void binary_string::load(std::istream & is, std::string & target) { boost::uint32_t length = util::load(is); @@ -218,87 +58,6 @@ void encoded_string::load(std::istream & is, std::string & target, codepage_id c to_utf8(binary_string::load(is), target, codepage); } -void to_utf8(const std::string & from, std::string & to, codepage_id codepage) { - - if(codepage == cp_utf8 || codepage == cp_ascii) { - // copy UTF-8 directly - to = from; - return; - } - - iconv_t converter = get_converter(codepage); - if(converter == iconv_t(-1)) { - to_utf8_fallback(from, to, codepage); - return; - } - - - /* - * Some iconv implementations declare the second parameter of iconv() as - * const char **, others as char **. - * Use this little hack to compile with both variants. 
- */ - struct inbuf_ { - const char * buf; - explicit inbuf_(const char * data) : buf(data) { } - operator const char **() { return &buf; } - operator char **() { return const_cast(&buf); } - } inbuf(from.data()); - - size_t insize = from.size(); - - size_t outbase = 0; - - if(!insize) { - to.clear(); - return; - } - - iconv(converter, NULL, NULL, NULL, NULL); - - size_t skip = get_encoding_size(codepage); - - bool warn = false; - - while(insize) { - - to.resize(outbase + ceildiv(insize, skip) + 4); - - char * outbuf = &to[0] + outbase; - size_t outsize = to.size() - outbase; - - size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize); - if(ret == size_t(-1)) { - if(errno == E2BIG) { - // not enough output space - we'll allocate more in the next loop - } else if(/*errno == EILSEQ &&*/ insize >= 2) { - // invalid byte (sequence) - add a replacement char and try the next byte - if(outsize == 0) { - to.push_back(replacement_char); - } else { - *outbuf = replacement_char; - outsize--; - } - inbuf.buf += skip; - insize -= skip; - warn = true; - } else { - // something else went wrong - return what we have so far - insize = 0; - warn = true; - } - } - - outbase = to.size() - outsize; - } - - if(warn) { - log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8"; - } - - to.resize(outbase); -} - unsigned to_unsigned(const char * chars, size_t count) { #if BOOST_VERSION < 105200 return boost::lexical_cast(std::string(chars, count)); diff --git a/src/util/load.hpp b/src/util/load.hpp index 2829170..7fc1265 100644 --- a/src/util/load.hpp +++ b/src/util/load.hpp @@ -33,23 +33,12 @@ #include #include +#include "util/encoding.hpp" #include "util/endian.hpp" #include "util/types.hpp" namespace util { -typedef boost::uint32_t codepage_id; - -/*! - * Convert a string to UTF-8 from a specified encoding. - * \param from The input string to convert. - * \param to The output for the converted string. 
- * \param codepage The Windows codepage number for the input string encoding. - * - * \note This function is not thread-safe. - */ -void to_utf8(const std::string & from, std::string & to, codepage_id codepage = 1252); - /*! * Wrapper to load a length-prefixed string from an input stream into a std::string. * The string length is stored as 32-bit integer.