Browse Source

Move to_utf8() and helpers to their own source file

coverity_scan
Daniel Scharrer 12 years ago
parent
commit
22ddada291
  1. 2
      CMakeLists.txt
  2. 1
      src/setup/header.cpp
  3. 1
      src/setup/message.cpp
  4. 269
      src/util/encoding.cpp
  5. 49
      src/util/encoding.hpp
  6. 241
      src/util/load.cpp
  7. 13
      src/util/load.hpp

2
CMakeLists.txt

@ -288,6 +288,8 @@ set(INNOEXTRACT_SOURCES
src/util/boostfs_compat.hpp
src/util/console.hpp
src/util/console.cpp
src/util/encoding.hpp
src/util/encoding.cpp
src/util/endian.hpp
src/util/enum.hpp
src/util/flags.hpp

1
src/setup/header.cpp

@ -26,6 +26,7 @@
#include <boost/static_assert.hpp>
#include "setup/version.hpp"
#include "util/encoding.hpp"
#include "util/load.hpp"
#include "util/storedenum.hpp"

1
src/setup/message.cpp

@ -24,6 +24,7 @@
#include "setup/language.hpp"
#include "setup/version.hpp"
#include "util/encoding.hpp"
#include "util/load.hpp"
namespace setup {

269
src/util/encoding.cpp

@ -0,0 +1,269 @@
/*
* Copyright (C) 2011-2013 Daniel Scharrer
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author(s) be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#include "util/encoding.hpp"
#include <iterator>
#include <sstream>
#include <algorithm>
#include <iomanip>
#include <iconv.h>
#include <errno.h>
#include <boost/foreach.hpp>
#include <boost/unordered_map.hpp>
#include "util/log.hpp"
#include "util/math.hpp"
namespace util {
namespace {
static const codepage_id cp_utf8 = 65001;
static const codepage_id cp_ascii = 20127;
static const char replacement_char = '_';
typedef boost::unordered_map<codepage_id, iconv_t> converter_map;
converter_map converters;
static size_t get_encoding_size(codepage_id codepage) {
switch(codepage) {
case 1200: return 2u; // UTF-16LE
case 1201: return 2u; // UTF-16BE
case 12000: return 4u; // UTF-32LE
case 12001: return 4u; // UTF-32BE
default: return 1u;
}
}
//! Get names for encodings where iconv doesn't have the codepage alias
static const char * get_encoding_name(codepage_id codepage) {
switch(codepage) {
case 708: return "ISO-8859-6";
case 936: return "GBK";
case 949: return "UHC";
case 950: return "BIG5";
// iconv's behavior for "UTF-16" is platform-dependent if there is no BOM.
// There never is any BOM in Inno Setup files and it's always little-endian,
// so we specify the exact encoding.
case 1200: return "UTF-16LE";
case 1201: return "UTF-16BE";
case 1252: return "MS-ANSI";
case 1361: return "JOHAB";
case 10000: return "MACINTOSH";
case 10002: return "BIG5";
case 10008: return "GB2312";
case 12000: return "UTF-32LE";
case 12001: return "UTF-32BE";
case 20003: return "IBM5550";
case 20127: return "US-ASCII";
case 20261: return "T.61";
case 20269: return "ISO_6937";
case 20273: return "IBM273";
case 20277: return "IBM277";
case 20278: return "IBM278";
case 20280: return "IBM280";
case 20284: return "IBM284";
case 20285: return "IBM285";
case 20290: return "IBM290";
case 20297: return "IBM297";
case 20420: return "IBM420";
case 20423: return "IBM423";
case 20424: return "IBM424";
case 20866: return "KOI8-R";
case 20871: return "IBM871";
case 20880: return "IBM880";
case 20905: return "IBM905";
case 20924: return "IBM1047";
case 20932: return "EUC-JP-MS";
case 20936: return "EUC-CN";
case 21025: return "IBM1025";
case 21866: return "KOI8-U";
case 28591: return "ISO-8859-1";
case 28592: return "ISO-8859-2";
case 28593: return "ISO-8859-3";
case 28594: return "ISO-8859-4";
case 28595: return "ISO-8859-5";
case 28596: return "ISO-8859-6";
case 28597: return "ISO-8859-7";
case 28598: return "ISO-8859-8";
case 28599: return "ISO-8859-9";
case 28603: return "ISO-8859-13";
case 28605: return "ISO-8859-15";
case 38598: return "ISO-8859-8";
case 50220: return "ISO-2022-JP";
case 50221: return "ISO-2022-JP-2";
case 50222: return "ISO-2022-JP-3";
case 50225: return "ISO-2022-KR";
case 50227: return "ISO-2022-CN";
case 50229: return "ISO-2022-CN-EXT";
case 50930: return "EBCDIC-JP-E";
case 51932: return "EUC-JP";
case 51936: return "EUC-CN";
case 51949: return "EUC-KR";
case 51950: return "EUC-CN";
case 54936: return "GB18030";
case 65000: return "UTF-7";
case 65001: return "UTF-8";
default: return NULL;
}
}
static iconv_t get_converter(codepage_id codepage) {
// Try to reuse an existing converter if possible
converter_map::const_iterator i = converters.find(codepage);
if(i != converters.end()) {
return i->second;
}
iconv_t handle = iconv_t(-1);
const char * encoding = get_encoding_name(codepage);
if(encoding) {
handle = iconv_open("UTF-8", encoding);
}
// Otherwise, try a few different codepage name prefixes
if(handle == iconv_t(-1)) {
const char * prefixes[] = { "MSCP", "CP", "WINDOWS-", "MS", "IBM", "IBM-", "" };
BOOST_FOREACH(const char * prefix, prefixes) {
std::ostringstream oss;
oss << prefix << std::setfill('0') << std::setw(3) << codepage;
handle = iconv_open("UTF-8", oss.str().c_str());
if(handle != iconv_t(-1)) {
break;
}
}
}
if(handle == iconv_t(-1)) {
log_warning << "could not get codepage " << codepage << " -> UTF-8 converter";
}
return converters[codepage] = handle;
}
//! Fallback conversion that will at least work for ASCII characters
static void to_utf8_fallback(const std::string & from, std::string & to,
codepage_id codepage) {
size_t skip = get_encoding_size(codepage);
to.clear();
to.reserve(ceildiv(from.size(), skip));
for(size_t i = 0; i < from.size(); i += skip) {
if((unsigned char)from[i] <= 127) {
// copy ASCII characters
to.push_back(from[i]);
} else {
// replace everything else with underscores
to.push_back(replacement_char);
}
}
}
} // anonymous namespace
void to_utf8(const std::string & from, std::string & to, codepage_id codepage) {
if(codepage == cp_utf8 || codepage == cp_ascii) {
// copy UTF-8 directly
to = from;
return;
}
iconv_t converter = get_converter(codepage);
if(converter == iconv_t(-1)) {
to_utf8_fallback(from, to, codepage);
return;
}
/*
* Some iconv implementations declare the second parameter of iconv() as
* const char **, others as char **.
* Use this little hack to compile with both variants.
*/
struct inbuf_ {
const char * buf;
explicit inbuf_(const char * data) : buf(data) { }
operator const char **() { return &buf; }
operator char **() { return const_cast<char **>(&buf); }
} inbuf(from.data());
size_t insize = from.size();
size_t outbase = 0;
if(!insize) {
to.clear();
return;
}
iconv(converter, NULL, NULL, NULL, NULL);
size_t skip = get_encoding_size(codepage);
bool warn = false;
while(insize) {
to.resize(outbase + ceildiv(insize, skip) + 4);
char * outbuf = &to[0] + outbase;
size_t outsize = to.size() - outbase;
size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize);
if(ret == size_t(-1)) {
if(errno == E2BIG) {
// not enough output space - we'll allocate more in the next loop
} else if(/*errno == EILSEQ &&*/ insize >= 2) {
// invalid byte (sequence) - add a replacement char and try the next byte
if(outsize == 0) {
to.push_back(replacement_char);
} else {
*outbuf = replacement_char;
outsize--;
}
inbuf.buf += skip;
insize -= skip;
warn = true;
} else {
// something else went wrong - return what we have so far
insize = 0;
warn = true;
}
}
outbase = to.size() - outsize;
}
if(warn) {
log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8";
}
to.resize(outbase);
}
} // namespace util

49
src/util/encoding.hpp

@ -0,0 +1,49 @@
/*
* Copyright (C) 2011-2013 Daniel Scharrer
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author(s) be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/*!
* \file
*
* Utility function to convert strings to UTF-8.
*/
#ifndef INNOEXTRACT_UTIL_ENCODING_HPP
#define INNOEXTRACT_UTIL_ENCODING_HPP
#include <string>
#include <boost/cstdint.hpp>
namespace util {
typedef boost::uint32_t codepage_id;
/*!
* Convert a string to UTF-8 from a specified encoding.
* \param from The input string to convert.
* \param to The output for the converted string.
* \param codepage The Windows codepage number for the input string encoding.
*
* \note This function is not thread-safe.
*/
void to_utf8(const std::string & from, std::string & to, codepage_id codepage = 1252);
} // namespace util
#endif // INNOEXTRACT_UTIL_ENCODING_HPP

241
src/util/load.cpp

@ -20,172 +20,12 @@
#include "util/load.hpp"
#include <iterator>
#include <sstream>
#include <algorithm>
#include <iomanip>
#include <iconv.h>
#include <errno.h>
#include <boost/foreach.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/unordered_map.hpp>
#include "util/log.hpp"
#include "util/math.hpp"
namespace util {
namespace {
static const codepage_id cp_utf8 = 65001;
static const codepage_id cp_ascii = 20127;
static const char replacement_char = '_';
typedef boost::unordered_map<codepage_id, iconv_t> converter_map;
converter_map converters;
static size_t get_encoding_size(codepage_id codepage) {
switch(codepage) {
case 1200: return 2u; // UTF-16LE
case 1201: return 2u; // UTF-16BE
case 12000: return 4u; // UTF-32LE
case 12001: return 4u; // UTF-32BE
default: return 1u;
}
}
//! Get names for encodings where iconv doesn't have the codepage alias
static const char * get_encoding_name(codepage_id codepage) {
switch(codepage) {
case 708: return "ISO-8859-6";
case 936: return "GBK";
case 949: return "UHC";
case 950: return "BIG5";
// iconv's behavior for "UTF-16" is platform-dependent if there is no BOM.
// There never is any BOM in Inno Setup files and it's always little-endian,
// so we specify the exact encoding.
case 1200: return "UTF-16LE";
case 1201: return "UTF-16BE";
case 1252: return "MS-ANSI";
case 1361: return "JOHAB";
case 10000: return "MACINTOSH";
case 10002: return "BIG5";
case 10008: return "GB2312";
case 12000: return "UTF-32LE";
case 12001: return "UTF-32BE";
case 20003: return "IBM5550";
case 20127: return "US-ASCII";
case 20261: return "T.61";
case 20269: return "ISO_6937";
case 20273: return "IBM273";
case 20277: return "IBM277";
case 20278: return "IBM278";
case 20280: return "IBM280";
case 20284: return "IBM284";
case 20285: return "IBM285";
case 20290: return "IBM290";
case 20297: return "IBM297";
case 20420: return "IBM420";
case 20423: return "IBM423";
case 20424: return "IBM424";
case 20866: return "KOI8-R";
case 20871: return "IBM871";
case 20880: return "IBM880";
case 20905: return "IBM905";
case 20924: return "IBM1047";
case 20932: return "EUC-JP-MS";
case 20936: return "EUC-CN";
case 21025: return "IBM1025";
case 21866: return "KOI8-U";
case 28591: return "ISO-8859-1";
case 28592: return "ISO-8859-2";
case 28593: return "ISO-8859-3";
case 28594: return "ISO-8859-4";
case 28595: return "ISO-8859-5";
case 28596: return "ISO-8859-6";
case 28597: return "ISO-8859-7";
case 28598: return "ISO-8859-8";
case 28599: return "ISO-8859-9";
case 28603: return "ISO-8859-13";
case 28605: return "ISO-8859-15";
case 38598: return "ISO-8859-8";
case 50220: return "ISO-2022-JP";
case 50221: return "ISO-2022-JP-2";
case 50222: return "ISO-2022-JP-3";
case 50225: return "ISO-2022-KR";
case 50227: return "ISO-2022-CN";
case 50229: return "ISO-2022-CN-EXT";
case 50930: return "EBCDIC-JP-E";
case 51932: return "EUC-JP";
case 51936: return "EUC-CN";
case 51949: return "EUC-KR";
case 51950: return "EUC-CN";
case 54936: return "GB18030";
case 65000: return "UTF-7";
case 65001: return "UTF-8";
default: return NULL;
}
}
static iconv_t get_converter(codepage_id codepage) {
// Try to reuse an existing converter if possible
converter_map::const_iterator i = converters.find(codepage);
if(i != converters.end()) {
return i->second;
}
iconv_t handle = iconv_t(-1);
const char * encoding = get_encoding_name(codepage);
if(encoding) {
handle = iconv_open("UTF-8", encoding);
}
// Otherwise, try a few different codepage name prefixes
if(handle == iconv_t(-1)) {
const char * prefixes[] = { "MSCP", "CP", "WINDOWS-", "MS", "IBM", "IBM-", "" };
BOOST_FOREACH(const char * prefix, prefixes) {
std::ostringstream oss;
oss << prefix << std::setfill('0') << std::setw(3) << codepage;
handle = iconv_open("UTF-8", oss.str().c_str());
if(handle != iconv_t(-1)) {
break;
}
}
}
if(handle == iconv_t(-1)) {
log_warning << "could not get codepage " << codepage << " -> UTF-8 converter";
}
return converters[codepage] = handle;
}
//! Fallback conversion that will at least work for ASCII characters
static void to_utf8_fallback(const std::string & from, std::string & to,
codepage_id codepage) {
size_t skip = get_encoding_size(codepage);
to.clear();
to.reserve(ceildiv(from.size(), skip));
for(size_t i = 0; i < from.size(); i += skip) {
if((unsigned char)from[i] <= 127) {
// copy ASCII characters
to.push_back(from[i]);
} else {
// replace everything else with underscores
to.push_back(replacement_char);
}
}
}
} // anonymous namespace
void binary_string::load(std::istream & is, std::string & target) {
boost::uint32_t length = util::load<boost::uint32_t>(is);
@ -218,87 +58,6 @@ void encoded_string::load(std::istream & is, std::string & target, codepage_id c
to_utf8(binary_string::load(is), target, codepage);
}
void to_utf8(const std::string & from, std::string & to, codepage_id codepage) {
if(codepage == cp_utf8 || codepage == cp_ascii) {
// copy UTF-8 directly
to = from;
return;
}
iconv_t converter = get_converter(codepage);
if(converter == iconv_t(-1)) {
to_utf8_fallback(from, to, codepage);
return;
}
/*
* Some iconv implementations declare the second parameter of iconv() as
* const char **, others as char **.
* Use this little hack to compile with both variants.
*/
struct inbuf_ {
const char * buf;
explicit inbuf_(const char * data) : buf(data) { }
operator const char **() { return &buf; }
operator char **() { return const_cast<char **>(&buf); }
} inbuf(from.data());
size_t insize = from.size();
size_t outbase = 0;
if(!insize) {
to.clear();
return;
}
iconv(converter, NULL, NULL, NULL, NULL);
size_t skip = get_encoding_size(codepage);
bool warn = false;
while(insize) {
to.resize(outbase + ceildiv(insize, skip) + 4);
char * outbuf = &to[0] + outbase;
size_t outsize = to.size() - outbase;
size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize);
if(ret == size_t(-1)) {
if(errno == E2BIG) {
// not enough output space - we'll allocate more in the next loop
} else if(/*errno == EILSEQ &&*/ insize >= 2) {
// invalid byte (sequence) - add a replacement char and try the next byte
if(outsize == 0) {
to.push_back(replacement_char);
} else {
*outbuf = replacement_char;
outsize--;
}
inbuf.buf += skip;
insize -= skip;
warn = true;
} else {
// something else went wrong - return what we have so far
insize = 0;
warn = true;
}
}
outbase = to.size() - outsize;
}
if(warn) {
log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8";
}
to.resize(outbase);
}
unsigned to_unsigned(const char * chars, size_t count) {
#if BOOST_VERSION < 105200
return boost::lexical_cast<unsigned>(std::string(chars, count));

13
src/util/load.hpp

@ -33,23 +33,12 @@
#include <boost/cstdint.hpp>
#include <boost/range/size.hpp>
#include "util/encoding.hpp"
#include "util/endian.hpp"
#include "util/types.hpp"
namespace util {
typedef boost::uint32_t codepage_id;
/*!
* Convert a string to UTF-8 from a specified encoding.
* \param from The input string to convert.
* \param to The output for the converted string.
* \param codepage The Windows codepage number for the input string encoding.
*
* \note This function is not thread-safe.
*/
void to_utf8(const std::string & from, std::string & to, codepage_id codepage = 1252);
/*!
* Wrapper to load a length-prefixed string from an input stream into a std::string.
* The string length is stored as 32-bit integer.

Loading…
Cancel
Save