encoding: Use WTF-8 to represent broken UTF-16 data

7 years ago · 5f91e0ed48
5 changed files with 51 additions and 68 deletions
--- a/1
+++ b/1
@ -17,6 +17,7 @@ innoextract 1.8 (WIP)
 - Fixed output directory being created even when not extracting files
 - Fixed a hang when using the --language option
 - Changed header parsing to select the first version without warnings and failing that the first without errors
+ - Changed filesystem and output encoding to WTF-8 (extended UTF-8) to represent broken UTF-16 data

 innoextract 1.7 (2018-06-12)
 - Added support for Inno Setup 5.6.0 installers
--- a/src/util/encoding.cpp
+++ b/src/util/encoding.cpp
@ -422,7 +422,9 @@ bool is_utf16_low_surrogate(unicode_char chr) {
 	return chr >= 0xdc00 && chr <= 0xdfff;
 }

-void utf16le_to_utf8(const std::string & from, std::string & to) {
+} // anonymous namespace
+
+void utf16le_to_wtf8(const std::string & from, std::string & to) {
 	
 	if(from.size() % 2 != 0) {
 		log_warning << "Unexpected trailing byte in UTF-16 string.";
@ -435,49 +437,30 @@ void utf16le_to_utf8(const std::string & from, std::string & to) {
 	
 	std::string::const_iterator it = from.begin();
 	std::string::const_iterator end = from.end();
+	if(from.size() % 2 != 0) {
+		end--;
+	}
 	while(it != end) {
 		
 		unicode_char chr = boost::uint8_t(*it++);
-		if(it == end) {
-			warn = true;
-			utf8_write(to, replacement_char);
-			break;
-		}
 		chr |= unicode_char(boost::uint8_t(*it++)) << 8;
 		
 		// If it's a surrogate pair, convert to a single UTF-32 character
-		if(is_utf16_high_surrogate(chr)) {
-			if(it == end) {
-				warn = true;
-				utf8_write(to, replacement_char);
-				break;
-			}
-			unicode_char d = boost::uint8_t(*it++);
-			if(it == end) {
-				warn = true;
-				utf8_write(to, replacement_char);
-				break;
-			}
-			d |= unicode_char(boost::uint8_t(*it++)) << 8;
+		if(is_utf16_high_surrogate(chr) && it != end) {
+			unicode_char d = boost::uint8_t(*it);
+			d |= unicode_char(boost::uint8_t(*(it + 1))) << 8;
 			if(is_utf16_low_surrogate(d)) {
 				chr = ((chr - 0xd800) << 10) + (d - 0xdc00) + 0x0010000;
-			} else {
-				warn = true;
-				utf8_write(to, replacement_char);
-				continue;
+				it += 2;
 			}
 		}
 		
-		// Replace invalid characters
-		if(chr > 0x0010FFFF) {
-			warn = true;
-			// Invalid character (greater than the maximum unicode value)
-			utf8_write(to, replacement_char);
-			continue;
-		}
-		
 		utf8_write(to, chr);
 	}
+	if(end != from.end()) {
+		warn = true;
+		utf8_write(to, replacement_char);
+	}
 	
 	if(warn) {
 		log_warning << "Unexpected data while converting from UTF-16LE to UTF-8.";
@ -485,21 +468,16 @@ void utf16le_to_utf8(const std::string & from, std::string & to) {
 	
 }

-void utf8_to_utf16le(const std::string & from, std::string & to) {
+void wtf8_to_utf16le(const std::string & from, std::string & to) {
 	
 	to.clear();
 	to.reserve(from.size() * 2); // optimistically, most strings only have ASCII characters
 	
-	bool warn = false;
-	
 	for(std::string::const_iterator i = from.begin(); i != from.end(); ) {
 		
 		unicode_char chr = utf8_read(i, from.end());
 		
-		if((chr >= 0xd800 && chr <= 0xdfff) || chr > 0x10ffff) {
-			chr = replacement_char;
-			warn = true;
-		} else if(chr >= 0x10000) {
+		if(chr >= 0x10000) {
 			chr -= 0x10000;
 			unicode_char high_surrogate = 0xd800 + (chr >> 10);
 			to.push_back(char(boost::uint8_t(high_surrogate)));
@ -511,12 +489,10 @@ void utf8_to_utf16le(const std::string & from, std::string & to) {
 		to.push_back(char(boost::uint8_t(chr >> 8)));
 	}
 	
-	if(warn) {
-		log_warning << "Unexpected data while converting from UTF-8 to UTF-16LE.";
-	}
-	
 }

+namespace {
+
 unicode_char windows1252_replacements[] = {
 	0x20ac, replacement_char, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021, 0x2c6,
 	0x2030, 0x160, 0x2039, 0x152, replacement_char, 0x17d, replacement_char,
@ -732,7 +708,7 @@ std::string windows_error_string(DWORD code) {
 }

 bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codepage) {
-	
+	 
 	// Convert from the source codepage to UTF-16LE
 	std::string buffer;
 	int ret = MultiByteToWideChar(codepage, 0, from.data(), int(from.length()), NULL, 0);
@ -747,7 +723,7 @@ bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codep
 		return false;
 	}
 	
-	utf16le_to_utf8(buffer, to);
+	utf16le_to_wtf8(buffer, to);
 	
 	return true;
 }
@ -755,7 +731,7 @@ bool to_utf8_win32(const std::string & from, std::string & to, codepage_id codep
 bool from_utf8_win32(const std::string & from, std::string & to, codepage_id codepage) {
 	
 	std::string buffer;
-	utf8_to_utf16le(from, buffer);
+	wtf8_to_utf16le(from, buffer);
 	
 	// Convert from UTF-16LE to the target codepage
 	LPCWSTR data = reinterpret_cast<LPCWSTR>(buffer.c_str());
@ -779,7 +755,7 @@ bool from_utf8_win32(const std::string & from, std::string & to, codepage_id cod
 void to_utf8(const std::string & from, std::string & to, codepage_id codepage) {
 	
 	switch(codepage) {
-		case cp_utf16le:     utf16le_to_utf8(from, to); return;
+		case cp_utf16le:     utf16le_to_wtf8(from, to); return;
 		case cp_windows1252: windows1252_to_utf8(from, to); return;
 		case cp_iso_8859_1:  windows1252_to_utf8(from, to); return;
 		default: break;
@ -828,7 +804,7 @@ void from_utf8(const std::string & from, std::string & to, codepage_id codepage)
 	}
 	
 	switch(codepage) {
-		case cp_utf16le:     utf8_to_utf16le(from, to); return;
+		case cp_utf16le:     wtf8_to_utf16le(from, to); return;
 		case cp_windows1252: utf8_to_windows1252(from, to); return;
 		default: break;
 	}
--- a/src/util/encoding.hpp
+++ b/src/util/encoding.hpp
@ -145,6 +145,16 @@ enum known_codepages {

 typedef boost::uint32_t codepage_id;

+/*!
+ * Convert a possibly broken UTF-16 string to WTF-8, an extension of UTF-8.
+ */
+void utf16le_to_wtf8(const std::string & from, std::string & to);
+
+/*!
+ * Convert WTF-8 to UTF-16 while preserving unpaired surrogates.
+ */
+void wtf8_to_utf16le(const std::string & from, std::string & to);
+
 /*!
 * Convert a string in place to UTF-8 from a specified encoding.
 * \param data     The input string to convert.
--- a/src/util/process.cpp
+++ b/src/util/process.cpp
@ -60,6 +60,8 @@ extern char ** environ;

 #endif

+#include "util/encoding.hpp"
+
 namespace util {

 #if defined(_WIN32) || !(INNOEXTRACT_HAVE_POSIX_SPAWNP \
@ -87,15 +89,6 @@ static std::string format_command_line(const char * const args[]) {
 }
 #endif

-#if defined(_WIN32)
-static WCHAR * utf8_to_wchar(const char * string) {
-	int n = MultiByteToWideChar(CP_UTF8, 0, string, -1, NULL, 0);
-	WCHAR * wstr = new WCHAR[n];
-	MultiByteToWideChar(CP_UTF8, 0, string, -1, wstr, n);
-	return wstr;
-}
-#endif
-
 int run(const char * const args[]) {
 	
 	std::cout.flush();
@ -104,8 +97,12 @@ int run(const char * const args[]) {
 #if defined(_WIN32)
 	
 	// Format the command line arguments
-	WCHAR * exe = utf8_to_wchar(args[0]);
-	WCHAR * cmdline = utf8_to_wchar(format_command_line(args + 1).c_str());
+	std::string exe; // UTF-16LE bytes of the executable path
+	wtf8_to_utf16le(args[0], exe);
+	exe.push_back('\0'); // together with std::string's own terminator this forms one UTF-16 NUL
+	std::string cmdline; // UTF-16LE bytes of the formatted argument list
+	wtf8_to_utf16le(format_command_line(args + 1), cmdline); // must target cmdline: the converter clears its output
+	cmdline.push_back('\0'); // second byte of the UTF-16 NUL; buffer is passed as a mutable LPWSTR
 	
 	STARTUPINFO si;
 	memset(&si, 0, sizeof(STARTUPINFO));
@ -114,10 +111,8 @@ int run(const char * const args[]) {
 	PROCESS_INFORMATION pi;
 	memset(&pi, 0, sizeof(PROCESS_INFORMATION));
 	
-	bool success = (CreateProcessW(exe, cmdline, 0, 0, 0, 0, 0, 0, &si, &pi) != 0);
-	
-	delete[] cmdline;
-	delete[] exe;
+	bool success = (CreateProcessW(reinterpret_cast<LPCWSTR>(exe.c_str()),
+	                               reinterpret_cast<LPWSTR>(&cmdline[0]), 0, 0, 0, 0, 0, 0, &si, &pi) != 0);
 	
 	if(!success) {
 		return -1; // Could not start process
--- a/src/util/windows.cpp
+++ b/src/util/windows.cpp
@ -34,6 +34,7 @@
 #include <stdexcept>
 #include <vector>

+#include <wchar.h>
 #include <windows.h>
 #include <shellapi.h>

@ -53,6 +54,7 @@ namespace { typedef boost::filesystem::detail::utf8_codecvt_facet utf8_codecvt;
 #endif

 #include "util/ansi.hpp"
+#include "util/encoding.hpp"

 // Disable telemetry added in Visual Studio 2015
 #if defined(_MSC_VER) && _MSC_VER >= 1900
@ -505,17 +507,16 @@ int main() {
 	// Convert the UTF-16 command-line parameters to UTF-8
 	int argc = 0;
 	char ** argv = NULL;
+	std::vector<std::string> args;
 	{
 		wchar_t ** wargv = CommandLineToArgvW(GetCommandLineW(), &argc);
-		
+		args.resize(size_t(argc));
 		argv = new char *[argc + 1];
 		argv[argc] = NULL;
-		for(int i = 0; i < argc; i++) {
-			int n = WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, NULL, 0,  NULL, NULL);
-			argv[i] = new char[n];
-			WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, argv[i], n, NULL, NULL);
+		for(size_t i = 0; i < args.size(); i++) {
+			util::utf16le_to_wtf8(std::string(reinterpret_cast<char *>(wargv[i]), wcslen(wargv[i]) * 2), args[i]);
+			argv[i] = &args[i][0];
 		}
-		
 		LocalFree(wargv);
 	}