From 89258fdd64074a4f2f459afcc1c32f9c79a5641e Mon Sep 17 00:00:00 2001 From: Daniel Scharrer Date: Mon, 9 Jun 2014 11:08:58 +0200 Subject: [PATCH] Add support for building without iconv anywhere Most of the time only Windows-1252 or UTF-16LE conversions are needed, so just bundle conversion routines for those. --- CMakeLists.txt | 47 ++++-- README.md | 5 +- src/configure.hpp.in | 2 + src/util/encoding.cpp | 381 ++++++++++++++++++++++++++++++++++-------- 4 files changed, 346 insertions(+), 89 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bee31b..bac5c70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,9 @@ cmake_minimum_required(VERSION 2.8) # Define configuration options -option(USE_LZMA "Build lzma decompression support." ON) -option(USE_ICONV "Build against libiconv instead of native OS functions." OFF) +option(USE_LZMA "Build lzma decompression support" ON) +set(WITH_CONV CACHE STRING "The library to use for charset conversions") +option(ENABLE_BUILTIN_CONV "Build internal charset conversion routines" ON) option(DEBUG_EXTRA "Expensive debug options" OFF) option(SET_WARNING_FLAGS "Adjust compiler warning flags" ON) option(SET_OPTIMIZATION_FLAGS "Adjust compiler optimization flags" ON) @@ -138,15 +139,32 @@ if(Boost_USE_STATIC_LIBS) endif() -if(NOT WIN32 OR USE_ICONV) - find_package(iconv REQUIRED) - check_link_library(iconv iconv_LIBRARIES) - list(APPEND LIBRARIES ${iconv_LIBRARIES}) - include_directories(SYSTEM ${iconv_INCLUDE_DIR}) - add_definitions(${iconv_DEFINITIONS}) - set(INNOEXTRACT_HAVE_ICONV 1) -else() - set(INNOEXTRACT_HAVE_ICONV 0) +set(INNOEXTRACT_HAVE_ICONV 0) +set(INNOEXTRACT_HAVE_WIN32_CONV 0) +set(INNOEXTRACT_HAVE_BUILTIN_CONV ${ENABLE_BUILTIN_CONV}) +if(WIN32 AND (NOT WITH_CONV OR WITH_CONV STREQUAL "win32")) + set(INNOEXTRACT_HAVE_WIN32_CONV 1) +elseif(NOT WITH_CONV OR WITH_CONV STREQUAL "iconv") + if(STRICT_USE) + set(ICONV_REQUIRED REQUIRED) + else() + set(ICONV_REQUIRED) + endif() + find_package(iconv 
${ICONV_REQUIRED}) + if(ICONV_FOUND) + check_link_library(iconv iconv_LIBRARIES) + list(APPEND LIBRARIES ${iconv_LIBRARIES}) + include_directories(SYSTEM ${iconv_INCLUDE_DIR}) + add_definitions(${iconv_DEFINITIONS}) + set(INNOEXTRACT_HAVE_ICONV 1) + endif() +elseif(WITH_CONV AND NOT WITH_CONV STREQUAL "builtin") + message(FATAL_ERROR "Invalid WITH_CONV option: ${WITH_CONV}") +endif() +if(NOT INNOEXTRACT_HAVE_ICONV AND NOT INNOEXTRACT_HAVE_WIN32_CONV + AND NOT INNOEXTRACT_HAVE_BUILTIN_CONV) + message(WARNING "\nBuilding without any charset conversion support.\n" + "Any non-ASCII characters in extracted filenames will be missing.") endif() @@ -371,8 +389,9 @@ print_configuration("File time precision" FIRST INNOEXTRACT_HAVE_UTIMES "microseconds" 1 "seconds" ) -print_configuration("Charset conversion" FIRST - USE_ICONV "iconv" - WIN32 "Win32" +print_configuration("Charset conversion" + INNOEXTRACT_HAVE_ICONV "iconv" + INNOEXTRACT_HAVE_WIN32_CONV "Win32" + INNOEXTRACT_HAVE_BUILTIN_CONV "bultin" ) message("") diff --git a/README.md b/README.md index d406a5f..a64a84d 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,8 @@ Build options: | Option | Default | Description | |:------------------------ |:---------:|:----------- | | `USE_LZMA` | `ON` | Use `liblzma` if available. -| `USE_ICONV` | `OFF`^1 | Use `libiconv` instead of native OS function +| `WITH_CONV` | *not set* | The charset conversion library to use. Valid values are `iconv`, `win32` and `builtin`^1. If not set, a library appropriate for the target platform will be chosen. +| `ENABLE_BUILTIN_CONV` | `ON` | Build internal Windows-1252 and UTF-16LE to UTF-8 charset conversion routines. These might be used even if `WITH_CONV` is not set to `builtin`. | `CMAKE_BUILD_TYPE` | `Release` | Set to `Debug` to enable debug output. | `DEBUG` | `OFF`^2 | Enable debug output and runtime checks. | `DEBUG_EXTRA` | `OFF` | Expensive debug options. 
@@ -56,7 +57,7 @@ Build options: | `ZLIB_USE_STATIC_LIBS` | `OFF`^4 | Statically link `libz`. (used via Boost) | `BZip2_USE_STATIC_LIBS` | `OFF`^4 | Statically link `libbz2`. (used via Boost) | `iconv_USE_STATIC_LIBS` | `OFF`^4 | Statically link `libiconv`. -1. This is only meaningful for Windows +1. The builtin charset conversion only supports Windows-1252 and UTF-16LE. This is normally enough for filenames, but custom message strings (which can be included in filenames) may use arbitrary encodings. 2. Enabled automatically if `CMAKE_BUILD_TYPE` is set to `Debug`. 3. Under Windows, the default is `ON`. 4. Default is `ON` if `USE_STATIC_LIBS` is enabled. diff --git a/src/configure.hpp.in b/src/configure.hpp.in index 54dc9e5..2452836 100644 --- a/src/configure.hpp.in +++ b/src/configure.hpp.in @@ -28,5 +28,7 @@ // Optional dependencies #cmakedefine01 INNOEXTRACT_HAVE_LZMA #cmakedefine01 INNOEXTRACT_HAVE_ICONV +#cmakedefine01 INNOEXTRACT_HAVE_WIN32_CONV +#cmakedefine01 INNOEXTRACT_HAVE_BUILTIN_CONV #endif // INNOEXTRACT_CONFIGURE_HPP diff --git a/src/util/encoding.cpp b/src/util/encoding.cpp index ff1b483..3db1a3a 100644 --- a/src/util/encoding.cpp +++ b/src/util/encoding.cpp @@ -17,6 +17,35 @@ * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. */ +// Parts based on: +//////////////////////////////////////////////////////////// +// +// SFML - Simple and Fast Multimedia Library +// Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com) +// +// This software is provided 'as-is', without any express or implied warranty. +// In no event will the authors be held liable for any damages arising from the +// use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it freely, +// subject to the following restrictions: +// +// 1. 
The origin of this software must not be misrepresented; +// you must not claim that you wrote the original software. +// If you use this software in a product, an acknowledgment +// in the product documentation would be appreciated but is not required. +// +// 2. Altered source versions must be plainly marked as such, +// and must not be misrepresented as being the original software. +// +// 3. This notice may not be removed or altered from any source distribution. +// +//////////////////////////////////////////////////////////// +// +// This code has been taken from SFML and altered to fit the project's needs. +// +//////////////////////////////////////////////////////////// #include "util/encoding.hpp" @@ -33,13 +62,14 @@ #if INNOEXTRACT_HAVE_ICONV #include #include -#elif defined(_WIN32) +#endif + +#if INNOEXTRACT_HAVE_WIN32_CONV #include -#else -#error No charset conversion library available! #endif #include +#include #include #include "util/log.hpp" @@ -47,17 +77,19 @@ namespace util { -static const codepage_id cp_utf8 = 65001; -static const codepage_id cp_ascii = 20127; +enum known_codepages { + cp_utf16le = 1200, + cp_windows1252 = 1252, + cp_ascii = 20127, + cp_iso_8859_1 = 28591, + cp_utf8 = 65001, +}; -#if INNOEXTRACT_HAVE_ICONV +namespace { static const char replacement_char = '_'; -namespace { - -typedef boost::unordered_map converter_map; -converter_map converters; +typedef boost::uint32_t unicode_char; static size_t get_encoding_size(codepage_id codepage) { switch(codepage) { @@ -69,6 +101,221 @@ static size_t get_encoding_size(codepage_id codepage) { } } +//! 
Fallback conversion that will at least work for ASCII characters +static void to_utf8_fallback(const std::string & from, std::string & to, codepage_id cp) { + + size_t skip = get_encoding_size(cp); + + size_t shift = 0; + switch(cp) { + case 1201: shift = 1u * 8u; break; // UTF-16BE + case 12001: shift = 3u * 8u; break; // UTF-32BE + } + + to.clear(); + to.reserve(ceildiv(from.size(), skip)); + + bool warn = false; + + for(std::string::const_iterator it = from.begin(); it != from.end();) { + + unicode_char unicode = 0; + for(size_t i = 0; i < skip; i++) { + unicode |= unicode_char(boost::uint8_t(*it++)) << (i * 8); + } + + char ascii = (unicode >> shift) & 0x7f; + + // replace non-ASCII characters with underscores + if((unicode_char(ascii) << shift) != unicode) { + warn = true; + ascii = replacement_char; + } + + to.push_back(ascii); + } + + static bool warned = false; + if(warn) { + log_warning << "unknown data while converting from CP" << cp << " to UTF-8"; + if(!warned && (cp == cp_windows1252 || cp == cp_utf16le)) { + #if INNOEXTRACT_HAVE_ICONV + log_warning << "make sure your iconv installation supports Windows-1252 and UTF-16LE"; + #elif !INNOEXTRACT_HAVE_BUILTIN_CONV && !INNOEXTRACT_HAVE_WIN32_CONV + log_warning << "build innoextract with charset conversion routines enabled!"; + #endif + warned = true; + } + } + +} + +#if INNOEXTRACT_HAVE_BUILTIN_CONV + +static size_t utf8_length(unicode_char chr) { + if (chr < 0x80) return 1; + else if(chr < 0x800) return 2; + else if(chr < 0x10000) return 3; + else if(chr <= 0x0010ffff) return 4; + return 1; +} + +static void utf8_write(std::string & to, unicode_char chr) { + + static const boost::uint8_t first_bytes[7] = { + 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc + }; + + // Get number of bytes to write + size_t length = utf8_length(chr); + + // Extract bytes to write + boost::uint8_t bytes[4]; + switch(length) { + case 4 : bytes[3] = static_cast((chr | 0x80) & 0xBF); chr >>= 6; + case 3 : bytes[2] = static_cast((chr | 
0x80) & 0xBF); chr >>= 6; + case 2 : bytes[1] = static_cast((chr | 0x80) & 0xBF); chr >>= 6; + case 1 : bytes[0] = static_cast( chr | first_bytes[length]); + } + + // Add them to the output + const boost::uint8_t * cur_byte = bytes; + switch(length) { + case 4 : to.push_back(char(*cur_byte++)); + case 3 : to.push_back(char(*cur_byte++)); + case 2 : to.push_back(char(*cur_byte++)); + case 1 : to.push_back(char(*cur_byte++)); + } + +} + +//! \return true if chr is the first part of a UTF-16 surrogate pair +static bool is_utf16_high_surrogate(unicode_char chr) { + return chr >= 0xd800 && chr <= 0xdbff; +} + +//! \return true if chr is the second part of a UTF-16 surrogate pair +static bool is_utf16_low_surrogate(unicode_char chr) { + return chr >= 0xdc00 && chr <= 0xdfff; +} + +static void utf16le_to_utf8(const std::string & from, std::string & to) { + + if(from.size() % 2 != 0) { + log_warning << "unexpected trailing byte in UTF-16 string"; + } + + to.clear(); + to.reserve(from.size() / 2); // optimistically, most strings only have ASCII characters + + bool warn = false; + + std::string::const_iterator it = from.begin(); + std::string::const_iterator end = from.end(); + while(it != end) { + + unicode_char chr = boost::uint8_t(*it++); + if(it == end) { + warn = true; + utf8_write(to, replacement_char); + break; + } + chr |= unicode_char(boost::uint8_t(*it++)) << 8; + + // If it's a surrogate pair, convert to a single UTF-32 character + if(is_utf16_high_surrogate(chr)) { + if(it != end) { + unicode_char d = boost::uint8_t(*it++); + if(it == end) { + warn = true; + utf8_write(to, replacement_char); + break; + } + d |= unicode_char(boost::uint8_t(*it++)) << 8; + if(is_utf16_low_surrogate(d)) { + chr = ((chr - 0xd800) << 10) + (d - 0xdc00) + 0x0010000; + } else { + warn = true; + utf8_write(to, replacement_char); + continue; + } + } else { + warn = true; + // Invalid second element + utf8_write(to, replacement_char); + continue; + } + } + + // Replace invalid characters + 
if(chr > 0x0010FFFF) { + warn = true; + // Invalid character (greater than the maximum unicode value) + utf8_write(to, replacement_char); + continue; + } + + utf8_write(to, chr); + } + + if(warn) { + log_warning << "unexpected data while converting from UTF-16LE to UTF-8"; + } + +} + +static void windows1252_to_utf8(const std::string & from, std::string & to) { + + static unicode_char replacements[] = { + 0x20ac, replacement_char, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021, 0x2c6, + 0x2030, 0x160, 0x2039, 0x152, replacement_char, 0x17d, replacement_char, + replacement_char, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x2dc, + 0x2122, 0x161, 0x203a, 0x153, replacement_char, 0x17e, 0x178 + }; + + BOOST_STATIC_ASSERT(sizeof(replacements) == (160 - 128) * sizeof(*replacements)); + + to.clear(); + to.reserve(from.size()); // optimistically, most strings only have ASCII characters + + bool warn = false; + + BOOST_FOREACH(char c, from) { + + // windows1252 maps almost directly to Unicode - yay! + unicode_char chr = boost::uint8_t(c); + if(chr >= 128 && chr < 160) { + chr = replacements[chr - 128]; + warn = warn || (chr == unicode_char(replacement_char)); + } + + utf8_write(to, chr); + } + + if(warn) { + log_warning << "unexpected data while converting from Windows-1252 to UTF-8"; + } + +} + +static bool to_utf8_builtin(const std::string & from, std::string & to, codepage_id cp) { + + switch(cp) { + case cp_utf16le: utf16le_to_utf8(from, to); return true; + case cp_windows1252: windows1252_to_utf8(from, to); return true; + case cp_iso_8859_1: windows1252_to_utf8(from, to); return true; + default: return false; + } + +} + +#endif // INNOEXTRACT_HAVE_BUILTIN_CONV + +#if INNOEXTRACT_HAVE_ICONV + +typedef boost::unordered_map converter_map; +static converter_map converters; + //! 
Get names for encodings where iconv doesn't have the codepage alias static const char * get_encoding_name(codepage_id codepage) { switch(codepage) { @@ -177,43 +424,13 @@ static iconv_t get_converter(codepage_id codepage) { return converters[codepage] = handle; } -//! Fallback conversion that will at least work for ASCII characters -static void to_utf8_fallback(const std::string & from, std::string & to, - codepage_id codepage) { - - size_t skip = get_encoding_size(codepage); - - to.clear(); - to.reserve(ceildiv(from.size(), skip)); - - for(size_t i = 0; i < from.size(); i += skip) { - if((unsigned char)from[i] <= 127) { - // copy ASCII characters - to.push_back(from[i]); - } else { - // replace everything else with underscores - to.push_back(replacement_char); - } - } -} - -} // anonymous namespace - -void to_utf8(const std::string & from, std::string & to, codepage_id codepage) { +static bool to_utf8_iconv(const std::string & from, std::string & to, codepage_id cp) { - if(codepage == cp_utf8 || codepage == cp_ascii) { - // copy UTF-8 directly - to = from; - return; - } - - iconv_t converter = get_converter(codepage); + iconv_t converter = get_converter(cp); if(converter == iconv_t(-1)) { - to_utf8_fallback(from, to, codepage); - return; + return false; } - /* * Some iconv implementations declare the second parameter of iconv() as * const char **, others as char **. 
@@ -230,14 +447,9 @@ void to_utf8(const std::string & from, std::string & to, codepage_id codepage) { size_t outbase = 0; - if(!insize) { - to.clear(); - return; - } - iconv(converter, NULL, NULL, NULL, NULL); - size_t skip = get_encoding_size(codepage); + size_t skip = get_encoding_size(cp); bool warn = false; @@ -274,19 +486,19 @@ void to_utf8(const std::string & from, std::string & to, codepage_id codepage) { } if(warn) { - log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8"; + log_warning << "unexpected data while converting from CP" << cp << " to UTF-8"; } to.resize(outbase); + + return true; } -#elif defined(_WIN32) +#endif // INNOEXTRACT_HAVE_ICONV -static const codepage_id cp_utf16le = 1200; - -namespace { +#if INNOEXTRACT_HAVE_WIN32_CONV -std::string windows_error_string(DWORD code) { +static std::string windows_error_string(DWORD code) { char * error; DWORD n = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_ALLOCATE_BUFFER, NULL, code, 0, reinterpret_cast(&error), 0, @@ -303,22 +515,8 @@ std::string windows_error_string(DWORD code) { } } -} // anonymous namespace - -void to_utf8(const std::string & from, std::string & to, codepage_id cp) { +static bool to_utf8_win32(const std::string & from, std::string & to, codepage_id cp) { - if(from.empty()) { - to.clear(); - return; - } - - if(cp == cp_utf8 || cp == cp_ascii) { - // copy UTF-8 directly - to = from; - return; - } - - int ret = 0; // Convert from the source codepage to UTF-16LE @@ -338,7 +536,7 @@ void to_utf8(const std::string & from, std::string & to, codepage_id cp) { if(utf16_size <= 0 || ret <= 0) { log_warning << "error while converting from CP" << cp << " to UTF-16: " << windows_error_string(GetLastError()); - return; + return false; } utf16 = &buffer.front(); } @@ -353,11 +551,48 @@ void to_utf8(const std::string & from, std::string & to, codepage_id cp) { if(utf8_size <= 0 || ret <= 0) { log_warning << "error while converting from UTF-16 to UTF-8: " 
<< windows_error_string(GetLastError()); - return; + return false; } + return true; } -#endif +#endif // INNOEXTRACT_HAVE_WIN32_CONV + +} // anonymous namespace + +void to_utf8(const std::string & from, std::string & to, codepage_id cp) { + + if(from.empty()) { + to.clear(); + return; + } + + if(cp == cp_utf8 || cp == cp_ascii) { + to = from; + return; + } + + #if INNOEXTRACT_HAVE_BUILTIN_CONV + if(to_utf8_builtin(from, to, cp)) { + return; + } + #endif + + #if INNOEXTRACT_HAVE_ICONV + if(to_utf8_iconv(from, to, cp)) { + return; + } + #endif + + #if INNOEXTRACT_HAVE_WIN32_CONV + if(to_utf8_win32(from, to, cp)) { + return; + } + #endif + + to_utf8_fallback(from, to, cp); + +} } // namespace util