diff --git a/CHANGELOG b/CHANGELOG index 871837d..4c082fc 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -17,6 +17,7 @@ innoextract 1.8 (WIP) - Fixed output directory being created even when not extracting files - Fixed a hang when using the --language option - Changed header parsing to select the first version without warnings and failing that the first without errors + - Changed filesystem and output encoding to WTF-8 (extended UTF-8) to represent broken UTF-16 data innoextract 1.7 (2018-06-12) - Added support for Inno Setup 5.6.0 installers diff --git a/src/util/encoding.cpp b/src/util/encoding.cpp index 4c992b8..9c9f97d 100644 --- a/src/util/encoding.cpp +++ b/src/util/encoding.cpp @@ -422,7 +422,9 @@ bool is_utf16_low_surrogate(unicode_char chr) { return chr >= 0xdc00 && chr <= 0xdfff; } -void utf16le_to_utf8(const std::string & from, std::string & to) { +} // anonymous namespace + +void utf16le_to_wtf8(const std::string & from, std::string & to) { if(from.size() % 2 != 0) { log_warning << "Unexpected trailing byte in UTF-16 string."; @@ -435,49 +437,30 @@ void utf16le_to_utf8(const std::string & from, std::string & to) { std::string::const_iterator it = from.begin(); std::string::const_iterator end = from.end(); + if(from.size() % 2 != 0) { + end--; + } while(it != end) { unicode_char chr = boost::uint8_t(*it++); - if(it == end) { - warn = true; - utf8_write(to, replacement_char); - break; - } chr |= unicode_char(boost::uint8_t(*it++)) << 8; // If it's a surrogate pair, convert to a single UTF-32 character - if(is_utf16_high_surrogate(chr)) { - if(it == end) { - warn = true; - utf8_write(to, replacement_char); - break; - } - unicode_char d = boost::uint8_t(*it++); - if(it == end) { - warn = true; - utf8_write(to, replacement_char); - break; - } - d |= unicode_char(boost::uint8_t(*it++)) << 8; + if(is_utf16_high_surrogate(chr) && it != end) { + unicode_char d = boost::uint8_t(*it); + d |= unicode_char(boost::uint8_t(*(it + 1))) << 8; if(is_utf16_low_surrogate(d)) { 
chr = ((chr - 0xd800) << 10) + (d - 0xdc00) + 0x0010000; - } else { - warn = true; - utf8_write(to, replacement_char); - continue; + it += 2; } } - // Replace invalid characters - if(chr > 0x0010FFFF) { - warn = true; - // Invalid character (greater than the maximum unicode value) - utf8_write(to, replacement_char); - continue; - } - utf8_write(to, chr); } + if(end != from.end()) { + warn = true; + utf8_write(to, replacement_char); + } if(warn) { log_warning << "Unexpected data while converting from UTF-16LE to UTF-8."; @@ -485,21 +468,16 @@ void utf16le_to_utf8(const std::string & from, std::string & to) { } -void utf8_to_utf16le(const std::string & from, std::string & to) { +void wtf8_to_utf16le(const std::string & from, std::string & to) { to.clear(); to.reserve(from.size() * 2); // optimistically, most strings only have ASCII characters - bool warn = false; - for(std::string::const_iterator i = from.begin(); i != from.end(); ) { unicode_char chr = utf8_read(i, from.end()); - if((chr >= 0xd800 && chr <= 0xdfff) || chr > 0x10ffff) { - chr = replacement_char; - warn = true; - } else if(chr >= 0x10000) { + if(chr >= 0x10000) { chr -= 0x10000; unicode_char high_surrogate = 0xd800 + (chr >> 10); to.push_back(char(boost::uint8_t(high_surrogate))); @@ -511,12 +489,10 @@ void utf8_to_utf16le(const std::string & from, std::string & to) { to.push_back(char(boost::uint8_t(chr >> 8))); } - if(warn) { - log_warning << "Unexpected data while converting from UTF-8 to UTF-16LE."; - } - } +namespace { + unicode_char windows1252_replacements[] = { 0x20ac, replacement_char, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021, 0x2c6, 0x2030, 0x160, 0x2039, 0x152, replacement_char, 0x17d, replacement_char, @@ -732,7 +708,7 @@ std::string windows_error_string(DWORD code) { } bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codepage) { - + // Convert from the source codepage to UTF-16LE std::string buffer; int ret = MultiByteToWideChar(codepage, 0, from.data(), 
int(from.length()), NULL, 0); @@ -747,7 +723,7 @@ bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codep return false; } - utf16le_to_utf8(buffer, to); + utf16le_to_wtf8(buffer, to); return true; } @@ -755,7 +731,7 @@ bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codep bool from_utf8_win32(const std::string & from, std::string & to, codepage_id codepage) { std::string buffer; - utf8_to_utf16le(from, buffer); + wtf8_to_utf16le(from, buffer); // Convert from UTF-16LE to the target codepage LPCWSTR data = reinterpret_cast<LPCWSTR>(buffer.c_str()); @@ -779,7 +755,7 @@ bool from_utf8_win32(const std::string & from, std::string & to, codepage_id cod void to_utf8(const std::string & from, std::string & to, codepage_id codepage) { switch(codepage) { - case cp_utf16le: utf16le_to_utf8(from, to); return; + case cp_utf16le: utf16le_to_wtf8(from, to); return; case cp_windows1252: windows1252_to_utf8(from, to); return; case cp_iso_8859_1: windows1252_to_utf8(from, to); return; default: break; @@ -828,7 +804,7 @@ void from_utf8(const std::string & from, std::string & to, codepage_id codepage) } switch(codepage) { - case cp_utf16le: utf8_to_utf16le(from, to); return; + case cp_utf16le: wtf8_to_utf16le(from, to); return; case cp_windows1252: utf8_to_windows1252(from, to); return; default: break; } diff --git a/src/util/encoding.hpp b/src/util/encoding.hpp index f2a7378..ee67876 100644 --- a/src/util/encoding.hpp +++ b/src/util/encoding.hpp @@ -145,6 +145,21 @@ enum known_codepages { typedef boost::uint32_t codepage_id; +/*! + * Convert a possibly broken UTF-16LE string to WTF-8, an extension of UTF-8. + * Unpaired surrogates are written out as-is instead of being replaced; an odd trailing byte becomes the replacement character. + * \param from The UTF-16LE encoded input bytes. + * \param to Receives the WTF-8 encoded result. + */ +void utf16le_to_wtf8(const std::string & from, std::string & to); + +/*! + * Convert WTF-8 to UTF-16LE while preserving unpaired surrogates. + * \param from The WTF-8 encoded input string. + * \param to Cleared and then filled with the UTF-16LE encoded bytes. + */ +void wtf8_to_utf16le(const std::string & from, std::string & to); + /*! * Convert a string in place to UTF-8 from a specified encoding. * \param data The input string to convert. 
diff --git a/src/util/process.cpp b/src/util/process.cpp index 5203029..e13ae2d 100644 --- a/src/util/process.cpp +++ b/src/util/process.cpp @@ -60,6 +60,8 @@ extern char ** environ; #endif +#include "util/encoding.hpp" + namespace util { #if defined(_WIN32) || !(INNOEXTRACT_HAVE_POSIX_SPAWNP \ @@ -87,15 +89,6 @@ static std::string format_command_line(const char * const args[]) { } #endif -#if defined(_WIN32) -static WCHAR * utf8_to_wchar(const char * string) { - int n = MultiByteToWideChar(CP_UTF8, 0, string, -1, NULL, 0); - WCHAR * wstr = new WCHAR[n]; - MultiByteToWideChar(CP_UTF8, 0, string, -1, wstr, n); - return wstr; -} -#endif - int run(const char * const args[]) { std::cout.flush(); @@ -104,8 +97,12 @@ int run(const char * const args[]) { #if defined(_WIN32) // Format the command line arguments - WCHAR * exe = utf8_to_wchar(args[0]); - WCHAR * cmdline = utf8_to_wchar(format_command_line(args + 1).c_str()); + std::string exe; + wtf8_to_utf16le(args[0], exe); + exe.push_back('\0'); // one NUL byte here plus the string's own terminator form the 16-bit NUL + std::string cmdline; + wtf8_to_utf16le(format_command_line(args + 1), cmdline); + cmdline.push_back('\0'); // same two-byte terminator; CreateProcessW needs a writable, NUL-terminated buffer STARTUPINFO si; memset(&si, 0, sizeof(STARTUPINFO)); @@ -114,10 +111,8 @@ int run(const char * const args[]) { PROCESS_INFORMATION pi; memset(&pi, 0, sizeof(PROCESS_INFORMATION)); - bool success = (CreateProcessW(exe, cmdline, 0, 0, 0, 0, 0, 0, &si, &pi) != 0); - - delete[] cmdline; - delete[] exe; + bool success = (CreateProcessW(reinterpret_cast<LPCWSTR>(exe.c_str()), + reinterpret_cast<LPWSTR>(&cmdline[0]), 0, 0, 0, 0, 0, 0, &si, &pi) != 0); if(!success) { return -1; // Could not start process diff --git a/src/util/windows.cpp b/src/util/windows.cpp index 8269666..e99846b 100644 --- a/src/util/windows.cpp +++ b/src/util/windows.cpp @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ namespace { typedef boost::filesystem::detail::utf8_codecvt_facet utf8_codecvt; #endif #include "util/ansi.hpp" +#include "util/encoding.hpp" // Disable telemetry added in Visual 
Studio 2015 #if defined(_MSC_VER) && _MSC_VER >= 1900 @@ -505,17 +507,16 @@ int main() { // Convert the UTF-16 command-line parameters to UTF-8 int argc = 0; char ** argv = NULL; + std::vector<std::string> args; { wchar_t ** wargv = CommandLineToArgvW(GetCommandLineW(), &argc); - + args.resize(size_t(argc)); argv = new char *[argc + 1]; argv[argc] = NULL; - for(int i = 0; i < argc; i++) { - int n = WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, NULL, 0, NULL, NULL); - argv[i] = new char[n]; - WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, argv[i], n, NULL, NULL); + for(size_t i = 0; i < args.size(); i++) { + util::utf16le_to_wtf8(std::string(reinterpret_cast<const char *>(wargv[i]), wcslen(wargv[i]) * 2), args[i]); + argv[i] = &args[i][0]; } - LocalFree(wargv); }