You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
596 lines
16 KiB
596 lines
16 KiB
/* |
|
* Copyright (C) 2011-2013 Daniel Scharrer |
|
* |
|
* This software is provided 'as-is', without any express or implied |
|
* warranty. In no event will the author(s) be held liable for any damages |
|
* arising from the use of this software. |
|
* |
|
* Permission is granted to anyone to use this software for any purpose, |
|
* including commercial applications, and to alter it and redistribute it |
|
* freely, subject to the following restrictions: |
|
* |
|
* 1. The origin of this software must not be misrepresented; you must not |
|
* claim that you wrote the original software. If you use this software |
|
* in a product, an acknowledgment in the product documentation would be |
|
* appreciated but is not required. |
|
* 2. Altered source versions must be plainly marked as such, and must not be |
|
* misrepresented as being the original software. |
|
* 3. This notice may not be removed or altered from any source distribution. |
|
*/ |
|
// Parts based on: |
|
//////////////////////////////////////////////////////////// |
|
// |
|
// SFML - Simple and Fast Multimedia Library |
|
// Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com) |
|
// |
|
// This software is provided 'as-is', without any express or implied warranty. |
|
// In no event will the authors be held liable for any damages arising from the |
|
// use of this software. |
|
// |
|
// Permission is granted to anyone to use this software for any purpose, |
|
// including commercial applications, and to alter it and redistribute it freely, |
|
// subject to the following restrictions: |
|
// |
|
// 1. The origin of this software must not be misrepresented; |
|
// you must not claim that you wrote the original software. |
|
// If you use this software in a product, an acknowledgment |
|
// in the product documentation would be appreciated but is not required. |
|
// |
|
// 2. Altered source versions must be plainly marked as such, |
|
// and must not be misrepresented as being the original software. |
|
// |
|
// 3. This notice may not be removed or altered from any source distribution. |
|
// |
|
//////////////////////////////////////////////////////////// |
|
// |
|
// This code has been taken from SFML and altered to fit the project's needs. |
|
// |
|
//////////////////////////////////////////////////////////// |
|
|
|
#include "util/encoding.hpp" |
|
|
|
#include <stddef.h> |
|
|
|
#include <algorithm> |
|
#include <iomanip> |
|
#include <iterator> |
|
#include <sstream> |
|
#include <vector> |
|
|
|
#include "configure.hpp" |
|
|
|
#if INNOEXTRACT_HAVE_ICONV |
|
#include <iconv.h> |
|
#include <errno.h> |
|
#endif |
|
|
|
#if INNOEXTRACT_HAVE_WIN32_CONV |
|
#include <windows.h> |
|
#endif |
|
|
|
#include <boost/foreach.hpp> |
|
#include <boost/static_assert.hpp> |
|
#include <boost/unordered_map.hpp> |
|
|
|
#include "util/log.hpp" |
|
#include "util/math.hpp" |
|
|
|
namespace util { |
|
|
|
//! Windows codepage identifiers that the conversion code refers to by name.
enum known_codepages {
	cp_utf16le     = 1200,  //!< UTF-16, little-endian
	cp_windows1252 = 1252,  //!< Windows-1252
	cp_ascii       = 20127, //!< US-ASCII
	cp_iso_8859_1  = 28591, //!< ISO-8859-1
	cp_utf8        = 65001  //!< UTF-8 (no trailing comma: not allowed before C++11)
};
|
|
|
namespace { |
|
|
|
static const char replacement_char = '_'; |
|
|
|
typedef boost::uint32_t unicode_char; |
|
|
|
//! \return the size in bytes of one code unit of \a codepage:
//!         2 for UTF-16 (LE/BE), 4 for UTF-32 (LE/BE) and 1 for everything else
static size_t get_encoding_size(codepage_id codepage) {
	
	const bool is_utf16 = (codepage == 1200 || codepage == 1201);   // UTF-16LE / UTF-16BE
	const bool is_utf32 = (codepage == 12000 || codepage == 12001); // UTF-32LE / UTF-32BE
	
	if(is_utf32) {
		return 4u;
	}
	if(is_utf16) {
		return 2u;
	}
	return 1u;
}
|
|
|
//! Fallback conversion that will at least work for ASCII characters |
|
static void to_utf8_fallback(const std::string & from, std::string & to, codepage_id cp) { |
|
|
|
size_t skip = get_encoding_size(cp); |
|
|
|
size_t shift = 0; |
|
switch(cp) { |
|
case 1201: shift = 1u * 8u; break; // UTF-16BE |
|
case 12001: shift = 3u * 8u; break; // UTF-32BE |
|
} |
|
|
|
to.clear(); |
|
to.reserve(ceildiv(from.size(), skip)); |
|
|
|
bool warn = false; |
|
|
|
for(std::string::const_iterator it = from.begin(); it != from.end();) { |
|
|
|
unicode_char unicode = 0; |
|
for(size_t i = 0; i < skip; i++) { |
|
unicode |= unicode_char(boost::uint8_t(*it++)) << (i * 8); |
|
} |
|
|
|
char ascii = char((unicode >> shift) & 0x7f); |
|
|
|
// replace non-ASCII characters with underscores |
|
if((unicode_char(ascii) << shift) != unicode) { |
|
warn = true; |
|
ascii = replacement_char; |
|
} |
|
|
|
to.push_back(ascii); |
|
} |
|
|
|
if(warn) { |
|
static bool warned = false; |
|
log_warning << "unknown data while converting from CP" << cp << " to UTF-8"; |
|
if(!warned && (cp == cp_windows1252 || cp == cp_utf16le)) { |
|
#if INNOEXTRACT_HAVE_ICONV |
|
log_warning << "make sure your iconv installation supports Windows-1252 and UTF-16LE"; |
|
#elif !INNOEXTRACT_HAVE_BUILTIN_CONV && !INNOEXTRACT_HAVE_WIN32_CONV |
|
log_warning << "build innoextract with charset conversion routines enabled!"; |
|
#endif |
|
warned = true; |
|
} |
|
} |
|
|
|
} |
|
|
|
#if INNOEXTRACT_HAVE_BUILTIN_CONV |
|
|
|
//! \return the number of bytes used to encode \a chr in UTF-8 (1 to 4);
//!         values above 0x10ffff are not valid code points and yield 1
static size_t utf8_length(unicode_char chr) {
	
	if(chr > 0x0010ffff) {
		return 1;
	}
	
	return (chr < 0x80) ? 1 : (chr < 0x800) ? 2 : (chr < 0x10000) ? 3 : 4;
}
|
|
|
static void utf8_write(std::string & to, unicode_char chr) { |
|
|
|
static const boost::uint8_t first_bytes[7] = { |
|
0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc |
|
}; |
|
|
|
// Get number of bytes to write |
|
size_t length = utf8_length(chr); |
|
|
|
// Extract bytes to write |
|
boost::uint8_t bytes[4]; |
|
switch(length) { |
|
case 4: bytes[3] = static_cast<boost::uint8_t>((chr | 0x80) & 0xBF), chr >>= 6; |
|
case 3: bytes[2] = static_cast<boost::uint8_t>((chr | 0x80) & 0xBF), chr >>= 6; |
|
case 2: bytes[1] = static_cast<boost::uint8_t>((chr | 0x80) & 0xBF), chr >>= 6; |
|
case 1: bytes[0] = static_cast<boost::uint8_t>(chr | first_bytes[length]); |
|
} |
|
|
|
// Add them to the output |
|
const boost::uint8_t * cur_byte = bytes; |
|
switch(length) { |
|
case 4: to.push_back(char(*cur_byte++)); |
|
case 3: to.push_back(char(*cur_byte++)); |
|
case 2: to.push_back(char(*cur_byte++)); |
|
case 1: to.push_back(char(*cur_byte++)); |
|
} |
|
|
|
} |
|
|
|
//! \return true c is is the first part of an UTF-16 surrogate pair |
|
static bool is_utf16_high_surrogate(unicode_char chr) { |
|
return chr >= 0xd800 && chr <= 0xdbff; |
|
} |
|
|
|
//! \return true c is is the second part of an UTF-16 surrogate pair |
|
static bool is_utf16_low_surrogate(unicode_char chr) { |
|
return chr >= 0xdc00 && chr <= 0xdfff; |
|
} |
|
|
|
static void utf16le_to_utf8(const std::string & from, std::string & to) { |
|
|
|
if(from.size() % 2 != 0) { |
|
log_warning << "unexpected trailing byte in UTF-16 string"; |
|
} |
|
|
|
to.clear(); |
|
to.reserve(from.size() / 2); // optimistically, most strings only have ASCII characters |
|
|
|
bool warn = false; |
|
|
|
std::string::const_iterator it = from.begin(); |
|
std::string::const_iterator end = from.end(); |
|
while(it != end) { |
|
|
|
unicode_char chr = boost::uint8_t(*it++); |
|
if(it == end) { |
|
warn = true; |
|
utf8_write(to, replacement_char); |
|
break; |
|
} |
|
chr |= unicode_char(boost::uint8_t(*it++)) << 8; |
|
|
|
// If it's a surrogate pair, convert to a single UTF-32 character |
|
if(is_utf16_high_surrogate(chr)) { |
|
if(it == end) { |
|
warn = true; |
|
utf8_write(to, replacement_char); |
|
break; |
|
} |
|
unicode_char d = boost::uint8_t(*it++); |
|
if(it == end) { |
|
warn = true; |
|
utf8_write(to, replacement_char); |
|
break; |
|
} |
|
d |= unicode_char(boost::uint8_t(*it++)) << 8; |
|
if(is_utf16_low_surrogate(d)) { |
|
chr = ((chr - 0xd800) << 10) + (d - 0xdc00) + 0x0010000; |
|
} else { |
|
warn = true; |
|
utf8_write(to, replacement_char); |
|
continue; |
|
} |
|
} |
|
|
|
// Replace invalid characters |
|
if(chr > 0x0010FFFF) { |
|
warn = true; |
|
// Invalid character (greater than the maximum unicode value) |
|
utf8_write(to, replacement_char); |
|
continue; |
|
} |
|
|
|
utf8_write(to, chr); |
|
} |
|
|
|
if(warn) { |
|
log_warning << "unexpected data while converting from UTF-16LE to UTF-8"; |
|
} |
|
|
|
} |
|
|
|
static void windows1252_to_utf8(const std::string & from, std::string & to) { |
|
|
|
static unicode_char replacements[] = { |
|
0x20ac, replacement_char, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021, 0x2c6, |
|
0x2030, 0x160, 0x2039, 0x152, replacement_char, 0x17d, replacement_char, |
|
replacement_char, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x2dc, |
|
0x2122, 0x161, 0x203a, 0x153, replacement_char, 0x17e, 0x178 |
|
}; |
|
|
|
BOOST_STATIC_ASSERT(sizeof(replacements) == (160 - 128) * sizeof(*replacements)); |
|
|
|
to.clear(); |
|
to.reserve(from.size()); // optimistically, most strings only have ASCII characters |
|
|
|
bool warn = false; |
|
|
|
BOOST_FOREACH(char c, from) { |
|
|
|
// Windows-1252 maps almost directly to Unicode - yay! |
|
unicode_char chr = boost::uint8_t(c); |
|
if(chr >= 128 && chr < 160) { |
|
chr = replacements[chr - 128]; |
|
warn = warn || (chr == unicode_char(replacement_char)); |
|
} |
|
|
|
utf8_write(to, chr); |
|
} |
|
|
|
if(warn) { |
|
log_warning << "unexpected data while converting from Windows-1252 to UTF-8"; |
|
} |
|
|
|
} |
|
|
|
static bool to_utf8_builtin(const std::string & from, std::string & to, codepage_id cp) { |
|
|
|
switch(cp) { |
|
case cp_utf16le: utf16le_to_utf8(from, to); return true; |
|
case cp_windows1252: windows1252_to_utf8(from, to); return true; |
|
case cp_iso_8859_1: windows1252_to_utf8(from, to); return true; |
|
default: return false; |
|
} |
|
|
|
} |
|
|
|
#endif // INNOEXTRACT_HAVE_BUILTIN_CONV |
|
|
|
#if INNOEXTRACT_HAVE_ICONV |
|
|
|
typedef boost::unordered_map<codepage_id, iconv_t> converter_map; |
|
static converter_map converters; |
|
|
|
//! Get names for encodings where iconv doesn't have the codepage alias |
|
static const char * get_encoding_name(codepage_id codepage) { |
|
switch(codepage) { |
|
case 708: return "ISO-8859-6"; |
|
case 936: return "GBK"; |
|
case 949: return "UHC"; |
|
case 950: return "BIG5"; |
|
// iconv's behavior for "UTF-16" is platform-dependent if there is no BOM. |
|
// There never is any BOM in Inno Setup files and it's always little-endian, |
|
// so we specify the exact encoding. |
|
case 1200: return "UTF-16LE"; |
|
case 1201: return "UTF-16BE"; |
|
case 1252: return "MS-ANSI"; |
|
case 1361: return "JOHAB"; |
|
case 10000: return "MACINTOSH"; |
|
case 10002: return "BIG5"; |
|
case 10008: return "GB2312"; |
|
case 12000: return "UTF-32LE"; |
|
case 12001: return "UTF-32BE"; |
|
case 20003: return "IBM5550"; |
|
case 20127: return "US-ASCII"; |
|
case 20261: return "T.61"; |
|
case 20269: return "ISO_6937"; |
|
case 20273: return "IBM273"; |
|
case 20277: return "IBM277"; |
|
case 20278: return "IBM278"; |
|
case 20280: return "IBM280"; |
|
case 20284: return "IBM284"; |
|
case 20285: return "IBM285"; |
|
case 20290: return "IBM290"; |
|
case 20297: return "IBM297"; |
|
case 20420: return "IBM420"; |
|
case 20423: return "IBM423"; |
|
case 20424: return "IBM424"; |
|
case 20866: return "KOI8-R"; |
|
case 20871: return "IBM871"; |
|
case 20880: return "IBM880"; |
|
case 20905: return "IBM905"; |
|
case 20924: return "IBM1047"; |
|
case 20932: return "EUC-JP-MS"; |
|
case 20936: return "EUC-CN"; |
|
case 21025: return "IBM1025"; |
|
case 21866: return "KOI8-U"; |
|
case 28591: return "ISO-8859-1"; |
|
case 28592: return "ISO-8859-2"; |
|
case 28593: return "ISO-8859-3"; |
|
case 28594: return "ISO-8859-4"; |
|
case 28595: return "ISO-8859-5"; |
|
case 28596: return "ISO-8859-6"; |
|
case 28597: return "ISO-8859-7"; |
|
case 28598: return "ISO-8859-8"; |
|
case 28599: return "ISO-8859-9"; |
|
case 28603: return "ISO-8859-13"; |
|
case 28605: return "ISO-8859-15"; |
|
case 38598: return "ISO-8859-8"; |
|
case 50220: return "ISO-2022-JP"; |
|
case 50221: return "ISO-2022-JP-2"; |
|
case 50222: return "ISO-2022-JP-3"; |
|
case 50225: return "ISO-2022-KR"; |
|
case 50227: return "ISO-2022-CN"; |
|
case 50229: return "ISO-2022-CN-EXT"; |
|
case 50930: return "EBCDIC-JP-E"; |
|
case 51932: return "EUC-JP"; |
|
case 51936: return "EUC-CN"; |
|
case 51949: return "EUC-KR"; |
|
case 51950: return "EUC-CN"; |
|
case 54936: return "GB18030"; |
|
case 65000: return "UTF-7"; |
|
case 65001: return "UTF-8"; |
|
default: return NULL; |
|
} |
|
} |
|
|
|
//! Get an iconv handle that converts from \a codepage to UTF-8.
//!
//! Handles are stored in the converters map and reused by later calls; failed lookups
//! are stored as well. If get_encoding_name() has no usable name for the codepage, the
//! codepage number is tried with a list of common prefixes.
//!
//! \return the iconv handle, or iconv_t(-1) if no converter could be opened
static iconv_t get_converter(codepage_id codepage) {
	
	const iconv_t invalid = iconv_t(-1);
	
	// Try to reuse an existing converter if possible
	converter_map::const_iterator cached = converters.find(codepage);
	if(cached != converters.end()) {
		return cached->second;
	}
	
	iconv_t handle = invalid;
	
	// First choice: an explicitly mapped encoding name
	if(const char * encoding = get_encoding_name(codepage)) {
		handle = iconv_open("UTF-8", encoding);
	}
	
	// Otherwise, try a few different codepage name prefixes
	if(handle == invalid) {
		const char * prefixes[] = { "MSCP", "CP", "WINDOWS-", "MS", "IBM", "IBM-", "" };
		const size_t count = sizeof(prefixes) / sizeof(*prefixes);
		for(size_t i = 0; i < count && handle == invalid; i++) {
			// The number is zero-padded to at least three digits
			std::ostringstream oss;
			oss << prefixes[i] << std::setfill('0') << std::setw(3) << codepage;
			handle = iconv_open("UTF-8", oss.str().c_str());
		}
	}
	
	if(handle == invalid) {
		log_warning << "could not get codepage " << codepage << " -> UTF-8 converter";
	}
	
	// Remember the result, even if it is a failure
	converters[codepage] = handle;
	
	return handle;
}
|
|
|
static bool to_utf8_iconv(const std::string & from, std::string & to, codepage_id cp) { |
|
|
|
iconv_t converter = get_converter(cp); |
|
if(converter == iconv_t(-1)) { |
|
return false; |
|
} |
|
|
|
/* |
|
* Some iconv implementations declare the second parameter of iconv() as |
|
* const char **, others as char **. |
|
* Use this little hack to compile with both variants. |
|
*/ |
|
struct inbuf_ { |
|
const char * buf; |
|
explicit inbuf_(const char * data) : buf(data) { } |
|
operator const char **() { return &buf; } |
|
operator char **() { return const_cast<char **>(&buf); } |
|
} inbuf(from.data()); |
|
|
|
size_t insize = from.size(); |
|
|
|
size_t outbase = 0; |
|
|
|
iconv(converter, NULL, NULL, NULL, NULL); |
|
|
|
size_t skip = get_encoding_size(cp); |
|
|
|
bool warn = false; |
|
|
|
while(insize) { |
|
|
|
to.resize(outbase + ceildiv(insize, skip) + 4); |
|
|
|
char * outbuf = &to[0] + outbase; |
|
size_t outsize = to.size() - outbase; |
|
|
|
size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize); |
|
if(ret == size_t(-1)) { |
|
if(errno == E2BIG) { |
|
// not enough output space - we'll allocate more in the next loop |
|
} else if(/*errno == EILSEQ &&*/ insize >= 2) { |
|
// invalid byte (sequence) - add a replacement char and try the next byte |
|
if(outsize == 0) { |
|
to.push_back(replacement_char); |
|
} else { |
|
*outbuf = replacement_char; |
|
outsize--; |
|
} |
|
inbuf.buf += skip; |
|
insize -= skip; |
|
warn = true; |
|
} else { |
|
// something else went wrong - return what we have so far |
|
insize = 0; |
|
warn = true; |
|
} |
|
} |
|
|
|
outbase = to.size() - outsize; |
|
} |
|
|
|
if(warn) { |
|
log_warning << "unexpected data while converting from CP" << cp << " to UTF-8"; |
|
} |
|
|
|
to.resize(outbase); |
|
|
|
return true; |
|
} |
|
|
|
#endif // INNOEXTRACT_HAVE_ICONV |
|
|
|
#if INNOEXTRACT_HAVE_WIN32_CONV |
|
|
|
//! Get the system message text for a Windows error code.
//!
//! \param code an error code as returned by GetLastError()
//!
//! \return the message without its trailing line break, or "unknown" if the system
//!         has no message for \a code
static std::string windows_error_string(DWORD code) {
	
	char * error = NULL;
	
	// With FORMAT_MESSAGE_ALLOCATE_BUFFER the buffer argument receives a pointer to a
	// buffer allocated by the system, hence the cast of &error.
	// FORMAT_MESSAGE_IGNORE_INSERTS is required because no insert arguments are passed:
	// without it, a message containing inserts like %1 would read from the NULL list.
	const DWORD flags = FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_ALLOCATE_BUFFER
	                    | FORMAT_MESSAGE_IGNORE_INSERTS;
	DWORD n = FormatMessageA(flags, NULL, code, 0, reinterpret_cast<char *>(&error), 0, NULL);
	if(n == 0 || !error) {
		return "unknown";
	}
	
	std::string ret(error, size_t(n));
	LocalFree(error);
	
	// System messages end with "\r\n" - strip both characters, not just the '\n'
	while(!ret.empty() && (ret[ret.size() - 1] == '\n' || ret[ret.size() - 1] == '\r')) {
		ret.resize(ret.size() - 1);
	}
	
	return ret;
}
|
|
|
static bool to_utf8_win32(const std::string & from, std::string & to, codepage_id cp) { |
|
|
|
int ret = 0; |
|
|
|
// Convert from the source codepage to UTF-16LE |
|
const WCHAR * utf16; |
|
int utf16_size; |
|
std::vector<WCHAR> buffer; |
|
if(cp == cp_utf16le) { |
|
utf16 = reinterpret_cast<const WCHAR *>(from.data()); |
|
utf16_size = int(from.size()) / 2; |
|
} else { |
|
utf16_size = MultiByteToWideChar(cp, 0, from.data(), int(from.length()), NULL, 0); |
|
if(utf16_size > 0) { |
|
buffer.resize(size_t(utf16_size)); |
|
ret = MultiByteToWideChar(cp, 0, from.data(), int(from.length()), |
|
&buffer.front(), utf16_size); |
|
} |
|
if(utf16_size <= 0 || ret <= 0) { |
|
log_warning << "error while converting from CP" << cp << " to UTF-16: " |
|
<< windows_error_string(GetLastError()); |
|
return false; |
|
} |
|
utf16 = &buffer.front(); |
|
} |
|
|
|
// Convert from UTF-16LE to UTF-8 |
|
int utf8_size = WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_size, NULL, 0, NULL, NULL); |
|
if(utf8_size > 0) { |
|
to.resize(size_t(utf8_size)); |
|
ret = WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_size, |
|
&to[0], utf8_size, NULL, NULL); |
|
} |
|
if(utf8_size <= 0 || ret <= 0) { |
|
log_warning << "error while converting from UTF-16 to UTF-8: " |
|
<< windows_error_string(GetLastError()); |
|
return false; |
|
} |
|
|
|
return true; |
|
} |
|
|
|
#endif // INNOEXTRACT_HAVE_WIN32_CONV |
|
|
|
} // anonymous namespace |
|
|
|
void to_utf8(const std::string & from, std::string & to, codepage_id cp) { |
|
|
|
if(from.empty()) { |
|
to.clear(); |
|
return; |
|
} |
|
|
|
if(cp == cp_utf8 || cp == cp_ascii) { |
|
to = from; |
|
return; |
|
} |
|
|
|
#if INNOEXTRACT_HAVE_BUILTIN_CONV |
|
if(to_utf8_builtin(from, to, cp)) { |
|
return; |
|
} |
|
#endif |
|
|
|
#if INNOEXTRACT_HAVE_ICONV |
|
if(to_utf8_iconv(from, to, cp)) { |
|
return; |
|
} |
|
#endif |
|
|
|
#if INNOEXTRACT_HAVE_WIN32_CONV |
|
if(to_utf8_win32(from, to, cp)) { |
|
return; |
|
} |
|
#endif |
|
|
|
to_utf8_fallback(from, to, cp); |
|
|
|
} |
|
|
|
} // namespace util
|
|
|