diff --git a/src/util/load.cpp b/src/util/load.cpp index 245cbf1..b7e95ac 100644 --- a/src/util/load.cpp +++ b/src/util/load.cpp @@ -32,6 +32,10 @@ namespace { +static const boost::uint32_t cp_utf8 = 65001; +static const boost::uint32_t cp_utf16 = 1200; +static const char replacement_char = '_'; + std::map converters; iconv_t get_converter(boost::uint32_t codepage) { @@ -43,7 +47,7 @@ iconv_t get_converter(boost::uint32_t codepage) { } std::ostringstream oss; - if(codepage == 1200) { + if(codepage == cp_utf16) { // iconv's behavior for "UTF-16" is platform-dependant if there is no BOM. // There never is any BOM in Inno Setup files and it's always little-endian, // so we specify the exact encoding. @@ -52,7 +56,13 @@ iconv_t get_converter(boost::uint32_t codepage) { oss << "CP" << codepage; } - return converters[codepage] = iconv_open("UTF-8", oss.str().c_str()); + iconv_t handle = iconv_open("UTF-8", oss.str().c_str()); + + if(handle == iconv_t(-1)) { + log_warning << "could not get " << oss.str() << " -> UTF-8 converter"; + } + + return converters[codepage] = handle; } }; @@ -83,9 +93,40 @@ void encoded_string::load(std::istream & is, std::string & target, boost::uint32 to_utf8(temp, target, codepage); } +//! Fallback conversion that will at least work for ASCII characters. NOTE(review): for cp_utf16 only the first byte of each 2-byte unit is checked, so a unit with a non-zero second byte may still be copied as ASCII - confirm this is acceptable +static void to_utf8_fallback(const std::string & from, std::string & to, + boost::uint32_t codepage) { + + size_t skip = ((codepage == cp_utf16) ? 
2 : 1); + + to.clear(); + to.reserve(ceildiv(from.size(), skip)); + + for(size_t i = 0; i < from.size(); i += skip) { + if((unsigned char)from[i] <= 127) { + // copy ASCII characters + to.push_back(from[i]); + } else { + // replace everything else with underscores + to.push_back(replacement_char); + } + } +} + void to_utf8(const std::string & from, std::string & to, boost::uint32_t codepage) { + if(codepage == cp_utf8) { + // copy UTF-8 directly + to = from; + return; + } + iconv_t converter = get_converter(codepage); + if(converter == iconv_t(-1)) { + to_utf8_fallback(from, to, codepage); + return; + } + /* * Some iconv implementations declare the second parameter of iconv() as @@ -110,22 +151,45 @@ void to_utf8(const std::string & from, std::string & to, boost::uint32_t codepag iconv(converter, NULL, NULL, NULL, NULL); + size_t skip = ((codepage == cp_utf16) ? 2 : 1); + + bool warn = false; + while(insize) { - to.resize(outbase + insize + 4); + to.resize(outbase + ceildiv(insize, skip) + 4); char * outbuf = &to[0] + outbase; size_t outsize = to.size() - outbase; size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize); - if(ret == size_t(-1) && errno != E2BIG) { - log_error << "iconv error while converting from CP" << codepage << ": " << errno; - to.clear(); - return; + if(ret == size_t(-1)) { + if(errno == E2BIG) { + // not enough output space - we'll allocate more in the next loop + } else if(/*errno == EILSEQ &&*/ insize >= 2) { + // invalid byte (sequence) - add a replacement char and try the next byte + if(outsize == 0) { + to.push_back(replacement_char); + } else { + *outbuf = replacement_char; + outsize--; + } + inbuf.buf += skip; + insize -= skip; + warn = true; + } else { + // something else went wrong - return what we have so far + insize = 0; + warn = true; + } } outbase = to.size() - outsize; } + if(warn) { + log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8"; + } + to.resize(outbase); }