Browse Source

encoding: Use WTF-8 to represent broken UTF-16 data

pull/108/head
Daniel Scharrer 7 years ago
parent
commit
5f91e0ed48
  1. 1
      CHANGELOG
  2. 70
      src/util/encoding.cpp
  3. 10
      src/util/encoding.hpp
  4. 25
      src/util/process.cpp
  5. 13
      src/util/windows.cpp

1
CHANGELOG

@ -17,6 +17,7 @@ innoextract 1.8 (WIP)
- Fixed output directory being created even when not extracting files
- Fixed a hang when using the --language option
- Changed header parsing to select the first version without warnings and, failing that, the first without errors
- Changed filesystem and output encoding to WTF-8 (extended UTF-8) to represent broken UTF-16 data
innoextract 1.7 (2018-06-12)
- Added support for Inno Setup 5.6.0 installers

70
src/util/encoding.cpp

@ -422,7 +422,9 @@ bool is_utf16_low_surrogate(unicode_char chr) {
return chr >= 0xdc00 && chr <= 0xdfff;
}
void utf16le_to_utf8(const std::string & from, std::string & to) {
} // anonymous namespace
void utf16le_to_wtf8(const std::string & from, std::string & to) {
if(from.size() % 2 != 0) {
log_warning << "Unexpected trailing byte in UTF-16 string.";
@ -435,49 +437,30 @@ void utf16le_to_utf8(const std::string & from, std::string & to) {
std::string::const_iterator it = from.begin();
std::string::const_iterator end = from.end();
if(from.size() % 2 != 0) {
end--;
}
while(it != end) {
unicode_char chr = boost::uint8_t(*it++);
if(it == end) {
warn = true;
utf8_write(to, replacement_char);
break;
}
chr |= unicode_char(boost::uint8_t(*it++)) << 8;
// If it's a surrogate pair, convert to a single UTF-32 character
if(is_utf16_high_surrogate(chr)) {
if(it == end) {
warn = true;
utf8_write(to, replacement_char);
break;
}
unicode_char d = boost::uint8_t(*it++);
if(it == end) {
warn = true;
utf8_write(to, replacement_char);
break;
}
d |= unicode_char(boost::uint8_t(*it++)) << 8;
if(is_utf16_high_surrogate(chr) && it != end) {
unicode_char d = boost::uint8_t(*it);
d |= unicode_char(boost::uint8_t(*(it + 1))) << 8;
if(is_utf16_low_surrogate(d)) {
chr = ((chr - 0xd800) << 10) + (d - 0xdc00) + 0x0010000;
} else {
warn = true;
utf8_write(to, replacement_char);
continue;
it += 2;
}
}
// Replace invalid characters
if(chr > 0x0010FFFF) {
warn = true;
// Invalid character (greater than the maximum unicode value)
utf8_write(to, replacement_char);
continue;
}
utf8_write(to, chr);
}
if(end != from.end()) {
warn = true;
utf8_write(to, replacement_char);
}
if(warn) {
log_warning << "Unexpected data while converting from UTF-16LE to UTF-8.";
@ -485,21 +468,16 @@ void utf16le_to_utf8(const std::string & from, std::string & to) {
}
void utf8_to_utf16le(const std::string & from, std::string & to) {
void wtf8_to_utf16le(const std::string & from, std::string & to) {
to.clear();
to.reserve(from.size() * 2); // optimistically, most strings only have ASCII characters
bool warn = false;
for(std::string::const_iterator i = from.begin(); i != from.end(); ) {
unicode_char chr = utf8_read(i, from.end());
if((chr >= 0xd800 && chr <= 0xdfff) || chr > 0x10ffff) {
chr = replacement_char;
warn = true;
} else if(chr >= 0x10000) {
if(chr >= 0x10000) {
chr -= 0x10000;
unicode_char high_surrogate = 0xd800 + (chr >> 10);
to.push_back(char(boost::uint8_t(high_surrogate)));
@ -511,12 +489,10 @@ void utf8_to_utf16le(const std::string & from, std::string & to) {
to.push_back(char(boost::uint8_t(chr >> 8)));
}
if(warn) {
log_warning << "Unexpected data while converting from UTF-8 to UTF-16LE.";
}
}
namespace {
unicode_char windows1252_replacements[] = {
0x20ac, replacement_char, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021, 0x2c6,
0x2030, 0x160, 0x2039, 0x152, replacement_char, 0x17d, replacement_char,
@ -732,7 +708,7 @@ std::string windows_error_string(DWORD code) {
}
bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codepage) {
// Convert from the source codepage to UTF-16LE
std::string buffer;
int ret = MultiByteToWideChar(codepage, 0, from.data(), int(from.length()), NULL, 0);
@ -747,7 +723,7 @@ bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codep
return false;
}
utf16le_to_utf8(buffer, to);
utf16le_to_wtf8(buffer, to);
return true;
}
@ -755,7 +731,7 @@ bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codep
bool from_utf8_win32(const std::string & from, std::string & to, codepage_id codepage) {
std::string buffer;
utf8_to_utf16le(from, buffer);
wtf8_to_utf16le(from, buffer);
// Convert from UTF-16LE to the target codepage
LPCWSTR data = reinterpret_cast<LPCWSTR>(buffer.c_str());
@ -779,7 +755,7 @@ bool from_utf8_win32(const std::string & from, std::string & to, codepage_id cod
void to_utf8(const std::string & from, std::string & to, codepage_id codepage) {
switch(codepage) {
case cp_utf16le: utf16le_to_utf8(from, to); return;
case cp_utf16le: utf16le_to_wtf8(from, to); return;
case cp_windows1252: windows1252_to_utf8(from, to); return;
case cp_iso_8859_1: windows1252_to_utf8(from, to); return;
default: break;
@ -828,7 +804,7 @@ void from_utf8(const std::string & from, std::string & to, codepage_id codepage)
}
switch(codepage) {
case cp_utf16le: utf8_to_utf16le(from, to); return;
case cp_utf16le: wtf8_to_utf16le(from, to); return;
case cp_windows1252: utf8_to_windows1252(from, to); return;
default: break;
}

10
src/util/encoding.hpp

@ -145,6 +145,16 @@ enum known_codepages {
typedef boost::uint32_t codepage_id;
/*!
 * Convert a possibly broken UTF-16 string to WTF-8, an extension of UTF-8.
 *
 * Unlike strict UTF-8, WTF-8 can also represent surrogate code points that are not part
 * of a valid pair, which is what allows broken UTF-16 input to be carried through.
 *
 * \param from The input bytes, interpreted as little-endian UTF-16 code units.
 * \param to   The string that receives the WTF-8 encoded result.
 */
void utf16le_to_wtf8(const std::string & from, std::string & to);
/*!
 * Convert WTF-8 to UTF-16 while preserving unpaired surrogates.
 *
 * \param from The WTF-8 (or plain UTF-8) encoded input string.
 * \param to   The string that receives the result as little-endian UTF-16 bytes;
 *             any previous contents are discarded.
 */
void wtf8_to_utf16le(const std::string & from, std::string & to);
/*!
* Convert a string in place to UTF-8 from a specified encoding.
* \param data The input string to convert.

25
src/util/process.cpp

@ -60,6 +60,8 @@ extern char ** environ;
#endif
#include "util/encoding.hpp"
namespace util {
#if defined(_WIN32) || !(INNOEXTRACT_HAVE_POSIX_SPAWNP \
@ -87,15 +89,6 @@ static std::string format_command_line(const char * const args[]) {
}
#endif
#if defined(_WIN32)
static WCHAR * utf8_to_wchar(const char * string) {
int n = MultiByteToWideChar(CP_UTF8, 0, string, -1, NULL, 0);
WCHAR * wstr = new WCHAR[n];
MultiByteToWideChar(CP_UTF8, 0, string, -1, wstr, n);
return wstr;
}
#endif
int run(const char * const args[]) {
std::cout.flush();
@ -104,8 +97,12 @@ int run(const char * const args[]) {
#if defined(_WIN32)
// Format the command line arguments
WCHAR * exe = utf8_to_wchar(args[0]);
WCHAR * cmdline = utf8_to_wchar(format_command_line(args + 1).c_str());
// Convert the executable path to UTF-16LE for CreateProcessW(). The zero byte pushed here
// plus the one supplied by std::string::c_str() form the 16-bit string terminator.
std::string exe;
wtf8_to_utf16le(args[0], exe);
exe.push_back('\0');
// The command line needs its own buffer: wtf8_to_utf16le() replaces the contents of its
// output, so converting into exe would overwrite the executable path and leave cmdline empty.
std::string cmdline;
wtf8_to_utf16le(format_command_line(args + 1), cmdline);
// This buffer is handed over as a mutable pointer rather than through c_str(), so store
// the complete two-byte terminator in the string itself.
cmdline.append(2, '\0');
STARTUPINFO si;
memset(&si, 0, sizeof(STARTUPINFO));
@ -114,10 +111,8 @@ int run(const char * const args[]) {
PROCESS_INFORMATION pi;
memset(&pi, 0, sizeof(PROCESS_INFORMATION));
bool success = (CreateProcessW(exe, cmdline, 0, 0, 0, 0, 0, 0, &si, &pi) != 0);
delete[] cmdline;
delete[] exe;
bool success = (CreateProcessW(reinterpret_cast<LPCWSTR>(exe.c_str()),
reinterpret_cast<LPWSTR>(&cmdline[0]), 0, 0, 0, 0, 0, 0, &si, &pi) != 0);
if(!success) {
return -1; // Could not start process

13
src/util/windows.cpp

@ -34,6 +34,7 @@
#include <stdexcept>
#include <vector>
#include <wchar.h>
#include <windows.h>
#include <shellapi.h>
@ -53,6 +54,7 @@ namespace { typedef boost::filesystem::detail::utf8_codecvt_facet utf8_codecvt;
#endif
#include "util/ansi.hpp"
#include "util/encoding.hpp"
// Disable telemetry added in Visual Studio 2015
#if defined(_MSC_VER) && _MSC_VER >= 1900
@ -505,17 +507,16 @@ int main() {
// Convert the UTF-16 command-line parameters to WTF-8 (UTF-8 extended to keep unpaired surrogates)
int argc = 0;
char ** argv = NULL;
std::vector<std::string> args;
{
wchar_t ** wargv = CommandLineToArgvW(GetCommandLineW(), &argc);
args.resize(size_t(argc));
argv = new char *[argc + 1];
argv[argc] = NULL;
for(int i = 0; i < argc; i++) {
int n = WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, NULL, 0, NULL, NULL);
argv[i] = new char[n];
WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, argv[i], n, NULL, NULL);
for(size_t i = 0; i < args.size(); i++) {
util::utf16le_to_wtf8(std::string(reinterpret_cast<char *>(wargv[i]), wcslen(wargv[i]) * 2), args[i]);
argv[i] = &args[i][0];
}
LocalFree(wargv);
}

Loading…
Cancel
Save