Browse Source

Add support for building without iconv anywhere

Most of the time only Windows-1252 or UTF-16LE conversions are needed,
so just bundle conversion routines for those.
coverity_scan
Daniel Scharrer 12 years ago
parent
commit
89258fdd64
  1. 47
      CMakeLists.txt
  2. 5
      README.md
  3. 2
      src/configure.hpp.in
  4. 381
      src/util/encoding.cpp

47
CMakeLists.txt

@ -5,8 +5,9 @@ cmake_minimum_required(VERSION 2.8)
# Define configuration options
option(USE_LZMA "Build lzma decompression support." ON)
option(USE_ICONV "Build against libiconv instead of native OS functions." OFF)
option(USE_LZMA "Build lzma decompression support" ON)
set(WITH_CONV CACHE STRING "The library to use for charset conversions")
option(ENABLE_BUILTIN_CONV "Build internal charset conversion routines" ON)
option(DEBUG_EXTRA "Expensive debug options" OFF)
option(SET_WARNING_FLAGS "Adjust compiler warning flags" ON)
option(SET_OPTIMIZATION_FLAGS "Adjust compiler optimization flags" ON)
@ -138,15 +139,32 @@ if(Boost_USE_STATIC_LIBS)
endif()
if(NOT WIN32 OR USE_ICONV)
find_package(iconv REQUIRED)
check_link_library(iconv iconv_LIBRARIES)
list(APPEND LIBRARIES ${iconv_LIBRARIES})
include_directories(SYSTEM ${iconv_INCLUDE_DIR})
add_definitions(${iconv_DEFINITIONS})
set(INNOEXTRACT_HAVE_ICONV 1)
else()
set(INNOEXTRACT_HAVE_ICONV 0)
set(INNOEXTRACT_HAVE_ICONV 0)
set(INNOEXTRACT_HAVE_WIN32_CONV 0)
set(INNOEXTRACT_HAVE_BUILTIN_CONV ${ENABLE_BUILTIN_CONV})
if(WIN32 AND (NOT WITH_CONV OR WITH_CONV STREQUAL "win32"))
set(INNOEXTRACT_HAVE_WIN32_CONV 1)
elseif(NOT WITH_CONV OR WITH_CONV STREQUAL "iconv")
if(STRICT_USE)
set(ICONV_REQUIRED REQUIRED)
else()
set(ICONV_REQUIRED)
endif()
find_package(iconv ${ICONV_REQUIRED})
if(ICONV_FOUND)
check_link_library(iconv iconv_LIBRARIES)
list(APPEND LIBRARIES ${iconv_LIBRARIES})
include_directories(SYSTEM ${iconv_INCLUDE_DIR})
add_definitions(${iconv_DEFINITIONS})
set(INNOEXTRACT_HAVE_ICONV 1)
endif()
elseif(WITH_CONV AND NOT WITH_CONV STREQUAL "builtin")
message(FATAL_ERROR "Invalid WITH_CONV option: ${WITH_CONV}")
endif()
if(NOT INNOEXTRACT_HAVE_ICONV AND NOT INNOEXTRACT_HAVE_WIN32_CONV
AND NOT INNOEXTRACT_HAVE_BUILTIN_CONV)
message(WARNING "\nBuilding without any charset conversion support.\n"
"Any non-ASCII characters in extracted filenames will be missing.")
endif()
@ -371,8 +389,9 @@ print_configuration("File time precision" FIRST
INNOEXTRACT_HAVE_UTIMES "microseconds"
1 "seconds"
)
print_configuration("Charset conversion" FIRST
USE_ICONV "iconv"
WIN32 "Win32"
print_configuration("Charset conversion"
INNOEXTRACT_HAVE_ICONV "iconv"
INNOEXTRACT_HAVE_WIN32_CONV "Win32"
INNOEXTRACT_HAVE_BUILTIN_CONV "bultin"
)
message("")

5
README.md

@ -43,7 +43,8 @@ Build options:
| Option | Default | Description |
|:------------------------ |:---------:|:----------- |
| `USE_LZMA` | `ON` | Use `liblzma` if available.
| `USE_ICONV` | `OFF`^1 | Use `libiconv` instead of native OS function
| `WITH_CONV` | *not set* | The charset conversion library to use. Valid values are `iconv`, `win32` and `builtin`^1. If not set, a library appropriate for the target platform will be chosen.
| `ENABLE_BUILTIN_CONV` | `ON` | Build internal Windows-1252 and UTF-16LE to UTF-18 charset conversion routines. These might be used even if `WITH_CONV` is not set to `builtin`.
| `CMAKE_BUILD_TYPE` | `Release` | Set to `Debug` to enable debug output.
| `DEBUG` | `OFF`^2 | Enable debug output and runtime checks.
| `DEBUG_EXTRA` | `OFF` | Expensive debug options.
@ -56,7 +57,7 @@ Build options:
| `ZLIB_USE_STATIC_LIBS` | `OFF`^4 | Statically link `libz`. (used via Boost)
| `BZip2_USE_STATIC_LIBS` | `OFF`^4 | Statically link `libbz2`. (used via Boost)
| `iconv_USE_STATIC_LIBS` | `OFF`^4 | Statically link `libiconv`.
1. This is only meaningful for Windows
1. The builtin charset conversion only supports Windows-1252 and UTF-16LE. This is normally enough for filenames, but custom message strings (which can be included in filenames) may use arbitrary encodings.
2. Enabled automatically if `CMAKE_BUILD_TYPE` is set to `Debug`.
3. Under Windows, the default is `ON`.
4. Default is `ON` if `USE_STATIC_LIBS` is enabled.

2
src/configure.hpp.in

@ -28,5 +28,7 @@
// Optional dependencies
#cmakedefine01 INNOEXTRACT_HAVE_LZMA
#cmakedefine01 INNOEXTRACT_HAVE_ICONV
#cmakedefine01 INNOEXTRACT_HAVE_WIN32_CONV
#cmakedefine01 INNOEXTRACT_HAVE_BUILTIN_CONV
#endif // INNOEXTRACT_CONFIGURE_HPP

381
src/util/encoding.cpp

@ -17,6 +17,35 @@
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
// Parts based on:
////////////////////////////////////////////////////////////
//
// SFML - Simple and Fast Multimedia Library
// Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
//
// This software is provided 'as-is', without any express or implied warranty.
// In no event will the authors be held liable for any damages arising from the
// use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it freely,
// subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented;
// you must not claim that you wrote the original software.
// If you use this software in a product, an acknowledgment
// in the product documentation would be appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such,
// and must not be misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source distribution.
//
////////////////////////////////////////////////////////////
//
// This code has been taken from SFML and altered to fit the project's needs.
//
////////////////////////////////////////////////////////////
#include "util/encoding.hpp"
@ -33,13 +62,14 @@
#if INNOEXTRACT_HAVE_ICONV
#include <iconv.h>
#include <errno.h>
#elif defined(_WIN32)
#endif
#if INNOEXTRACT_HAVE_WIN32_CONV
#include <windows.h>
#else
#error No charset conversion library available!
#endif
#include <boost/foreach.hpp>
#include <boost/static_assert.hpp>
#include <boost/unordered_map.hpp>
#include "util/log.hpp"
@ -47,17 +77,19 @@
namespace util {
static const codepage_id cp_utf8 = 65001;
static const codepage_id cp_ascii = 20127;
enum known_codepages {
cp_utf16le = 1200,
cp_windows1252 = 1252,
cp_ascii = 20127,
cp_iso_8859_1 = 28591,
cp_utf8 = 65001,
};
#if INNOEXTRACT_HAVE_ICONV
namespace {
static const char replacement_char = '_';
namespace {
typedef boost::unordered_map<codepage_id, iconv_t> converter_map;
converter_map converters;
typedef boost::uint32_t unicode_char;
static size_t get_encoding_size(codepage_id codepage) {
switch(codepage) {
@ -69,6 +101,221 @@ static size_t get_encoding_size(codepage_id codepage) {
}
}
//! Fallback conversion that will at least work for ASCII characters
static void to_utf8_fallback(const std::string & from, std::string & to, codepage_id cp) {
size_t skip = get_encoding_size(cp);
size_t shift = 0;
switch(cp) {
case 1201: shift = 1u * 8u; break; // UTF-16BE
case 12001: shift = 3u * 8u; break; // UTF-32BE
}
to.clear();
to.reserve(ceildiv(from.size(), skip));
bool warn = false;
for(std::string::const_iterator it = from.begin(); it != from.end();) {
unicode_char unicode = 0;
for(size_t i = 0; i < skip; i++) {
unicode |= unicode_char(boost::uint8_t(*it++)) << (i * 8);
}
char ascii = (unicode >> shift) & 0x7f;
// replace non-ASCII characters with underscores
if((unicode_char(ascii) << shift) != unicode) {
warn = true;
ascii = replacement_char;
}
to.push_back(ascii);
}
static bool warned = false;
if(warn) {
log_warning << "unknown data while converting from CP" << cp << " to UTF-8";
if(!warned && (cp == cp_windows1252 || cp == cp_utf16le)) {
#if INNOEXTRACT_HAVE_ICONV
log_warning << "make sure your iconv installation supports Windows-1252 and UTF-16LE";
#elif !INNOEXTRACT_HAVE_BUILTIN_CONV && !INNOEXTRACT_HAVE_WIN32_CONV
log_warning << "build innoextract with charset conversion routines enabled!";
#endif
warned = true;
}
}
}
#if INNOEXTRACT_HAVE_BUILTIN_CONV
static size_t utf8_length(unicode_char chr) {
if (chr < 0x80) return 1;
else if(chr < 0x800) return 2;
else if(chr < 0x10000) return 3;
else if(chr <= 0x0010ffff) return 4;
return 1;
}
static void utf8_write(std::string & to, unicode_char chr) {
static const boost::uint8_t first_bytes[7] = {
0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
// Get number of bytes to write
size_t length = utf8_length(chr);
// Extract bytes to write
boost::uint8_t bytes[4];
switch(length) {
case 4 : bytes[3] = static_cast<boost::uint8_t>((chr | 0x80) & 0xBF); chr >>= 6;
case 3 : bytes[2] = static_cast<boost::uint8_t>((chr | 0x80) & 0xBF); chr >>= 6;
case 2 : bytes[1] = static_cast<boost::uint8_t>((chr | 0x80) & 0xBF); chr >>= 6;
case 1 : bytes[0] = static_cast<boost::uint8_t>( chr | first_bytes[length]);
}
// Add them to the output
const boost::uint8_t * cur_byte = bytes;
switch(length) {
case 4 : to.push_back(char(*cur_byte++));
case 3 : to.push_back(char(*cur_byte++));
case 2 : to.push_back(char(*cur_byte++));
case 1 : to.push_back(char(*cur_byte++));
}
}
//! \return true c is is the first part of an UTF-16 surrogate pair
static bool is_utf16_high_surrogate(unicode_char chr) {
return chr >= 0xd800 && chr <= 0xdbff;
}
//! \return true c is is the second part of an UTF-16 surrogate pair
static bool is_utf16_low_surrogate(unicode_char chr) {
return chr >= 0xdc00 && chr <= 0xdfff;
}
static void utf16le_to_utf8(const std::string & from, std::string & to) {
if(from.size() % 2 != 0) {
log_warning << "unexpected trailing byte in UTF-16 string";
}
to.clear();
to.reserve(from.size() / 2); // optimistically, most strings only have ASCII characters
bool warn = false;
std::string::const_iterator it = from.begin();
std::string::const_iterator end = from.end();
while(it != end) {
unicode_char chr = boost::uint8_t(*it++);
if(it == end) {
warn = true;
utf8_write(to, replacement_char);
break;
}
chr |= unicode_char(boost::uint8_t(*it++)) << 8;
// If it's a surrogate pair, convert to a single UTF-32 character
if(is_utf16_high_surrogate(chr)) {
if(it != end) {
unicode_char d = boost::uint8_t(*it++);
if(it == end) {
warn = true;
utf8_write(to, replacement_char);
break;
}
d |= unicode_char(boost::uint8_t(*it++)) << 8;
if(is_utf16_low_surrogate(d)) {
chr = ((chr - 0xd800) << 10) + (d - 0xdc00) + 0x0010000;
} else {
warn = true;
utf8_write(to, replacement_char);
continue;
}
} else {
warn = true;
// Invalid second element
utf8_write(to, replacement_char);
continue;
}
}
// Replace invalid characters
if(chr > 0x0010FFFF) {
warn = true;
// Invalid character (greater than the maximum unicode value)
utf8_write(to, replacement_char);
continue;
}
utf8_write(to, chr);
}
if(warn) {
log_warning << "unexpected data while converting from UTF-16LE to UTF-8";
}
}
static void windows1252_to_utf8(const std::string & from, std::string & to) {
static unicode_char replacements[] = {
0x20ac, replacement_char, 0x201a, 0x192, 0x201e, 0x2026, 0x2020, 0x2021, 0x2c6,
0x2030, 0x160, 0x2039, 0x152, replacement_char, 0x17d, replacement_char,
replacement_char, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x2dc,
0x2122, 0x161, 0x203a, 0x153, replacement_char, 0x17e, 0x178
};
BOOST_STATIC_ASSERT(sizeof(replacements) == (160 - 128) * sizeof(*replacements));
to.clear();
to.reserve(from.size()); // optimistically, most strings only have ASCII characters
bool warn = false;
BOOST_FOREACH(char c, from) {
// windows1252 maps almost directly to Unicode - yay!
unicode_char chr = boost::uint8_t(c);
if(chr >= 128 && chr < 160) {
chr = replacements[chr - 128];
warn = warn || (chr == unicode_char(replacement_char));
}
utf8_write(to, chr);
}
if(warn) {
log_warning << "unexpected data while converting from Windows-1252 to UTF-8";
}
}
static bool to_utf8_builtin(const std::string & from, std::string & to, codepage_id cp) {
switch(cp) {
case cp_utf16le: utf16le_to_utf8(from, to); return true;
case cp_windows1252: windows1252_to_utf8(from, to); return true;
case cp_iso_8859_1: windows1252_to_utf8(from, to); return true;
default: return false;
}
}
#endif // INNOEXTRACT_HAVE_BUILTIN_CONV
#if INNOEXTRACT_HAVE_ICONV
typedef boost::unordered_map<codepage_id, iconv_t> converter_map;
static converter_map converters;
//! Get names for encodings where iconv doesn't have the codepage alias
static const char * get_encoding_name(codepage_id codepage) {
switch(codepage) {
@ -177,43 +424,13 @@ static iconv_t get_converter(codepage_id codepage) {
return converters[codepage] = handle;
}
//! Fallback conversion that will at least work for ASCII characters
static void to_utf8_fallback(const std::string & from, std::string & to,
codepage_id codepage) {
size_t skip = get_encoding_size(codepage);
to.clear();
to.reserve(ceildiv(from.size(), skip));
for(size_t i = 0; i < from.size(); i += skip) {
if((unsigned char)from[i] <= 127) {
// copy ASCII characters
to.push_back(from[i]);
} else {
// replace everything else with underscores
to.push_back(replacement_char);
}
}
}
} // anonymous namespace
void to_utf8(const std::string & from, std::string & to, codepage_id codepage) {
static bool to_utf8_iconv(const std::string & from, std::string & to, codepage_id cp) {
if(codepage == cp_utf8 || codepage == cp_ascii) {
// copy UTF-8 directly
to = from;
return;
}
iconv_t converter = get_converter(codepage);
iconv_t converter = get_converter(cp);
if(converter == iconv_t(-1)) {
to_utf8_fallback(from, to, codepage);
return;
return false;
}
/*
* Some iconv implementations declare the second parameter of iconv() as
* const char **, others as char **.
@ -230,14 +447,9 @@ void to_utf8(const std::string & from, std::string & to, codepage_id codepage) {
size_t outbase = 0;
if(!insize) {
to.clear();
return;
}
iconv(converter, NULL, NULL, NULL, NULL);
size_t skip = get_encoding_size(codepage);
size_t skip = get_encoding_size(cp);
bool warn = false;
@ -274,19 +486,19 @@ void to_utf8(const std::string & from, std::string & to, codepage_id codepage) {
}
if(warn) {
log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8";
log_warning << "unexpected data while converting from CP" << cp << " to UTF-8";
}
to.resize(outbase);
return true;
}
#elif defined(_WIN32)
#endif // INNOEXTRACT_HAVE_ICONV
static const codepage_id cp_utf16le = 1200;
namespace {
#if INNOEXTRACT_HAVE_WIN32_CONV
std::string windows_error_string(DWORD code) {
static std::string windows_error_string(DWORD code) {
char * error;
DWORD n = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_ALLOCATE_BUFFER,
NULL, code, 0, reinterpret_cast<char *>(&error), 0,
@ -303,22 +515,8 @@ std::string windows_error_string(DWORD code) {
}
}
} // anonymous namespace
void to_utf8(const std::string & from, std::string & to, codepage_id cp) {
static bool to_utf8_win32(const std::string & from, std::string & to, codepage_id cp) {
if(from.empty()) {
to.clear();
return;
}
if(cp == cp_utf8 || cp == cp_ascii) {
// copy UTF-8 directly
to = from;
return;
}
int ret = 0;
// Convert from the source codepage to UTF-16LE
@ -338,7 +536,7 @@ void to_utf8(const std::string & from, std::string & to, codepage_id cp) {
if(utf16_size <= 0 || ret <= 0) {
log_warning << "error while converting from CP" << cp << " to UTF-16: "
<< windows_error_string(GetLastError());
return;
return false;
}
utf16 = &buffer.front();
}
@ -353,11 +551,48 @@ void to_utf8(const std::string & from, std::string & to, codepage_id cp) {
if(utf8_size <= 0 || ret <= 0) {
log_warning << "error while converting from UTF-16 to UTF-8: "
<< windows_error_string(GetLastError());
return;
return false;
}
return true;
}
#endif
#endif // INNOEXTRACT_HAVE_WIN32_CONV
} // anonymous namespace
void to_utf8(const std::string & from, std::string & to, codepage_id cp) {
if(from.empty()) {
to.clear();
return;
}
if(cp == cp_utf8 || cp == cp_ascii) {
to = from;
return;
}
#if INNOEXTRACT_HAVE_BUILTIN_CONV
if(to_utf8_builtin(from, to, cp)) {
return;
}
#endif
#if INNOEXTRACT_HAVE_ICONV
if(to_utf8_iconv(from, to, cp)) {
return;
}
#endif
#if INNOEXTRACT_HAVE_WIN32_CONV
if(to_utf8_win32(from, to, cp)) {
return;
}
#endif
to_utf8_fallback(from, to, cp);
}
} // namespace util

Loading…
Cancel
Save