innoextract/src/util/load.cpp

/*
 * Copyright (C) 2011-2013 Daniel Scharrer
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author(s) be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include "util/load.hpp"

#include <iterator>
#include <map>
#include <sstream>
#include <algorithm>
#include <iomanip>

#include <iconv.h>
#include <errno.h>

#include <boost/foreach.hpp>

#include "util/log.hpp"

namespace {

static const boost::uint32_t cp_utf8  = 65001;
static const boost::uint32_t cp_ascii = 20127;
static const char replacement_char = '_';

std::map<boost::uint32_t, iconv_t> converters;

static size_t get_encoding_size(boost::uint32_t codepage) {
	switch(codepage) {
		case  1200: return 2u; // UTF-16LE
		case  1201: return 2u; // UTF-16BE
		case 12000: return 4u; // UTF-32LE
		case 12001: return 4u; // UTF-32BE
		default:    return 1u;
	}
}

//! Get names for encodings where iconv doesn't have the codepage alias
static const char * get_encoding_name(boost::uint32_t codepage) {
	switch(codepage) {
		case   708: return "ISO-8859-6";
		case   936: return "GBK";
		case   949: return "UHC";
		case   950: return "BIG5";
		// iconv's behavior for "UTF-16" is platform-dependant if there is no BOM.
		// There never is any BOM in Inno Setup files and it's always little-endian,
		// so we specify the exact encoding.
		case  1200: return "UTF-16LE";
		case  1201: return "UTF-16BE";
		case  1252: return "MS-ANSI";
		case  1361: return "JOHAB";
		case 10000: return "MACINTOSH";
		case 10002: return "BIG5";
		case 10008: return "GB2312";
		case 12000: return "UTF-32LE";
		case 12001: return "UTF-32BE";
		case 20003: return "IBM5550";
		case 20127: return "US-ASCII";
		case 20261: return "T.61";
		case 20269: return "ISO_6937";
		case 20273: return "IBM273";
		case 20277: return "IBM277";
		case 20278: return "IBM278";
		case 20280: return "IBM280";
		case 20284: return "IBM284";
		case 20285: return "IBM285";
		case 20290: return "IBM290";
		case 20297: return "IBM297";
		case 20420: return "IBM420";
		case 20423: return "IBM423";
		case 20424: return "IBM424";
		case 20866: return "KOI8-R";
		case 20871: return "IBM871";
		case 20880: return "IBM880";
		case 20905: return "IBM905";
		case 20924: return "IBM1047";
		case 20932: return "EUC-JP-MS";
		case 20936: return "EUC-CN";
		case 21025: return "IBM1025";
		case 21866: return "KOI8-U";
		case 28591: return "ISO-8859-1";
		case 28592: return "ISO-8859-2";
		case 28593: return "ISO-8859-3";
		case 28594: return "ISO-8859-4";
		case 28595: return "ISO-8859-5";
		case 28596: return "ISO-8859-6";
		case 28597: return "ISO-8859-7";
		case 28598: return "ISO-8859-8";
		case 28599: return "ISO-8859-9";
		case 28603: return "ISO-8859-13";
		case 28605: return "ISO-8859-15";
		case 38598: return "ISO-8859-8";
		case 50220: return "ISO-2022-JP";
		case 50221: return "ISO-2022-JP-2";
		case 50222: return "ISO-2022-JP-3";
		case 50225: return "ISO-2022-KR";
		case 50227: return "ISO-2022-CN";
		case 50229: return "ISO-2022-CN-EXT";
		case 50930: return "EBCDIC-JP-E";
		case 51932: return "EUC-JP";
		case 51936: return "EUC-CN";
		case 51949: return "EUC-KR";
		case 51950: return "EUC-CN";
		case 54936: return "GB18030";
		case 65000: return "UTF-7";
		case 65001: return "UTF-8";
		default: return NULL;
	}
}

static iconv_t get_converter(boost::uint32_t codepage) {
	
	// Try to reuse an existing converter if possible
	std::map<boost::uint32_t, iconv_t>::iterator i = converters.find(codepage);
	if(i != converters.end()) {
		return i->second;
	}
	
	iconv_t handle = iconv_t(-1);
	
	const char * encoding = get_encoding_name(codepage);
	if(encoding) {
		handle = iconv_open("UTF-8", encoding);
	}
	
	// Otherwise, try a few different codepage name prefixes
	if(handle == iconv_t(-1)) {
		const char * prefixes[] = { "MSCP", "CP", "WINDOWS-", "MS", "IBM", "IBM-", "" };
		BOOST_FOREACH(const char * prefix, prefixes) {
			std::ostringstream oss;
			oss << prefix << std::setfill('0') << std::setw(3) << codepage;
			handle = iconv_open("UTF-8", oss.str().c_str());
			if(handle != iconv_t(-1)) {
				break;
			}
		}
	}
	
	if(handle == iconv_t(-1)) {
		log_warning << "could not get codepage " << codepage << " -> UTF-8 converter";
	}
	
	return converters[codepage] = handle;
}

//! Fallback conversion that will at least work for ASCII characters
static void to_utf8_fallback(const std::string & from, std::string & to,
                             boost::uint32_t codepage) {
	
	size_t skip = get_encoding_size(codepage);
	
	to.clear();
	to.reserve(ceildiv(from.size(), skip));
	
	for(size_t i = 0; i < from.size(); i += skip) {
		if((unsigned char)from[i] <= 127) {
			// copy ASCII characters
			to.push_back(from[i]);
		} else {
			// replace everything else with underscores
			to.push_back(replacement_char);
		}
	}
}

} // anonymous namespace

void binary_string::load(std::istream & is, std::string & target) {
	
	boost::int32_t length = load_number<boost::int32_t>(is);
	if(is.fail()) {
		return;
	}
	
	target.clear();
	
	while(length) {
		char buffer[10 * 1024];
		boost::int32_t buf_size = std::min(length, boost::int32_t(sizeof(buffer)));
		is.read(buffer, buf_size);
		target.append(buffer, size_t(buf_size));
		length -= buf_size;
	}
}

void encoded_string::load(std::istream & is, std::string & target, boost::uint32_t codepage) {
	
	std::string temp;
	binary_string::load(is, temp);
	
	to_utf8(temp, target, codepage);
}

void to_utf8(const std::string & from, std::string & to, boost::uint32_t codepage) {
	
	if(codepage == cp_utf8 || codepage == cp_ascii) {
		// copy UTF-8 directly
		to = from;
		return;
	}
	
	iconv_t converter = get_converter(codepage);
	if(converter == iconv_t(-1)) {
		to_utf8_fallback(from, to, codepage);
		return;
	}
	
	
	/*
	 * Some iconv implementations declare the second parameter of iconv() as
	 * const char **, others as char **.
	 * Use this little hack to compile with both variants.
	 */
	struct inbuf_ {
		const char * buf;
		explicit inbuf_(const char * data) : buf(data) { }
		operator const char **() { return &buf; }
		operator char **() { return const_cast<char **>(&buf); }
	} inbuf(from.data());
	
	size_t insize = from.size();
	
	size_t outbase = 0;
	
	if(!insize) {
		to.clear();
		return;
	}
	
	iconv(converter, NULL, NULL, NULL, NULL);
	
	size_t skip = get_encoding_size(codepage);
	
	bool warn = false;
	
	while(insize) {
		
		to.resize(outbase + ceildiv(insize, skip) + 4);
		
		char * outbuf = &to[0] + outbase;
		size_t outsize = to.size() - outbase;
		
		size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize);
		if(ret == size_t(-1)) {
			if(errno == E2BIG) {
				// not enough output space - we'll allocate more in the next loop
			} else if(/*errno == EILSEQ &&*/ insize >= 2) {
				// invalid byte (sequence) - add a replacement char nd try the next byte
				if(outsize == 0) {
					to.push_back(replacement_char);
				} else {
					*outbuf = replacement_char;
					outsize--;
				}
				inbuf.buf += skip;
				insize -= skip;
				warn = true;
			} else {
				// something else went wrong - return what we have so far
				insize = 0;
				warn = true;
			}
		}
		
		outbase = to.size() - outsize;
	}
	
	if(warn) {
		log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8";
	}
	
	to.resize(outbase);
}
Add copyright notices. 14 years ago			`/*`
Update copyright lines for files changed this year 13 years ago			`* Copyright (C) 2011-2013 Daniel Scharrer`
Add copyright notices. 14 years ago			`*`
			`* This software is provided 'as-is', without any express or implied`
			`* warranty. In no event will the author(s) be held liable for any damages`
			`* arising from the use of this software.`
			`*`
			`* Permission is granted to anyone to use this software for any purpose,`
			`* including commercial applications, and to alter it and redistribute it`
			`* freely, subject to the following restrictions:`
			`*`
			`* 1. The origin of this software must not be misrepresented; you must not`
			`* claim that you wrote the original software. If you use this software`
			`* in a product, an acknowledgment in the product documentation would be`
			`* appreciated but is not required.`
			`* 2. Altered source versions must be plainly marked as such, and must not be`
			`* misrepresented as being the original software.`
			`* 3. This notice may not be removed or altered from any source distribution.`
			`*/`
The header is loaded. 15 years ago
Cleanup utils. 15 years ago			`#include "util/load.hpp"`
The header is loaded. 15 years ago
			`#include <iterator>`
Parse custom message and permission entries. 15 years ago			`#include <map>`
			`#include <sstream>`
Allocate strings in chunks when loading. 15 years ago			`#include <algorithm>`
Add support for codepages without aliases in iconv 13 years ago			`#include <iomanip>`
The header is loaded. 15 years ago
			`#include <iconv.h>`
			`#include <errno.h>`

Add support for codepages without aliases in iconv 13 years ago			`#include <boost/foreach.hpp>`

Improve output. 15 years ago			`#include "util/log.hpp"`
The header is loaded. 15 years ago
Parse custom message and permission entries. 15 years ago			`namespace {`

Add support for codepages without aliases in iconv 13 years ago			`static const boost::uint32_t cp_utf8 = 65001;`
			`static const boost::uint32_t cp_ascii = 20127;`
Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago			`static const char replacement_char = '_';`

Fix MSVC 9 build 13 years ago			`std::map<boost::uint32_t, iconv_t> converters;`
Parse custom message and permission entries. 15 years ago
Add support for codepages without aliases in iconv 13 years ago			`static size_t get_encoding_size(boost::uint32_t codepage) {`
			`switch(codepage) {`
			`case 1200: return 2u; // UTF-16LE`
			`case 1201: return 2u; // UTF-16BE`
			`case 12000: return 4u; // UTF-32LE`
			`case 12001: return 4u; // UTF-32BE`
			`default: return 1u;`
			`}`
			`}`

			`//! Get names for encodings where iconv doesn't have the codepage alias`
			`static const char * get_encoding_name(boost::uint32_t codepage) {`
			`switch(codepage) {`
			`case 708: return "ISO-8859-6";`
			`case 936: return "GBK";`
			`case 949: return "UHC";`
			`case 950: return "BIG5";`
			`// iconv's behavior for "UTF-16" is platform-dependant if there is no BOM.`
			`// There never is any BOM in Inno Setup files and it's always little-endian,`
			`// so we specify the exact encoding.`
			`case 1200: return "UTF-16LE";`
			`case 1201: return "UTF-16BE";`
			`case 1252: return "MS-ANSI";`
			`case 1361: return "JOHAB";`
			`case 10000: return "MACINTOSH";`
			`case 10002: return "BIG5";`
			`case 10008: return "GB2312";`
			`case 12000: return "UTF-32LE";`
			`case 12001: return "UTF-32BE";`
			`case 20003: return "IBM5550";`
			`case 20127: return "US-ASCII";`
			`case 20261: return "T.61";`
			`case 20269: return "ISO_6937";`
			`case 20273: return "IBM273";`
			`case 20277: return "IBM277";`
			`case 20278: return "IBM278";`
			`case 20280: return "IBM280";`
			`case 20284: return "IBM284";`
			`case 20285: return "IBM285";`
			`case 20290: return "IBM290";`
			`case 20297: return "IBM297";`
			`case 20420: return "IBM420";`
			`case 20423: return "IBM423";`
			`case 20424: return "IBM424";`
			`case 20866: return "KOI8-R";`
			`case 20871: return "IBM871";`
			`case 20880: return "IBM880";`
			`case 20905: return "IBM905";`
			`case 20924: return "IBM1047";`
			`case 20932: return "EUC-JP-MS";`
			`case 20936: return "EUC-CN";`
			`case 21025: return "IBM1025";`
			`case 21866: return "KOI8-U";`
			`case 28591: return "ISO-8859-1";`
			`case 28592: return "ISO-8859-2";`
			`case 28593: return "ISO-8859-3";`
			`case 28594: return "ISO-8859-4";`
			`case 28595: return "ISO-8859-5";`
			`case 28596: return "ISO-8859-6";`
			`case 28597: return "ISO-8859-7";`
			`case 28598: return "ISO-8859-8";`
			`case 28599: return "ISO-8859-9";`
			`case 28603: return "ISO-8859-13";`
			`case 28605: return "ISO-8859-15";`
			`case 38598: return "ISO-8859-8";`
			`case 50220: return "ISO-2022-JP";`
			`case 50221: return "ISO-2022-JP-2";`
			`case 50222: return "ISO-2022-JP-3";`
			`case 50225: return "ISO-2022-KR";`
			`case 50227: return "ISO-2022-CN";`
			`case 50229: return "ISO-2022-CN-EXT";`
			`case 50930: return "EBCDIC-JP-E";`
			`case 51932: return "EUC-JP";`
			`case 51936: return "EUC-CN";`
			`case 51949: return "EUC-KR";`
			`case 51950: return "EUC-CN";`
			`case 54936: return "GB18030";`
			`case 65000: return "UTF-7";`
			`case 65001: return "UTF-8";`
			`default: return NULL;`
			`}`
			`}`

			`static iconv_t get_converter(boost::uint32_t codepage) {`
Parse custom message and permission entries. 15 years ago
Add support for codepages without aliases in iconv 13 years ago			`// Try to reuse an existing converter if possible`
Fix MSVC 9 build 13 years ago			`std::map<boost::uint32_t, iconv_t>::iterator i = converters.find(codepage);`
Parse custom message and permission entries. 15 years ago			`if(i != converters.end()) {`
			`return i->second;`
			`}`

Add support for codepages without aliases in iconv 13 years ago			`iconv_t handle = iconv_t(-1);`

			`const char * encoding = get_encoding_name(codepage);`
			`if(encoding) {`
			`handle = iconv_open("UTF-8", encoding);`
Parse custom message and permission entries. 15 years ago			`}`

Add support for codepages without aliases in iconv 13 years ago			`// Otherwise, try a few different codepage name prefixes`
			`if(handle == iconv_t(-1)) {`
			`const char * prefixes[] = { "MSCP", "CP", "WINDOWS-", "MS", "IBM", "IBM-", "" };`
			`BOOST_FOREACH(const char * prefix, prefixes) {`
			`std::ostringstream oss;`
			`oss << prefix << std::setfill('0') << std::setw(3) << codepage;`
			`handle = iconv_open("UTF-8", oss.str().c_str());`
			`if(handle != iconv_t(-1)) {`
			`break;`
			`}`
			`}`
			`}`
Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago
			`if(handle == iconv_t(-1)) {`
Add support for codepages without aliases in iconv 13 years ago			`log_warning << "could not get codepage " << codepage << " -> UTF-8 converter";`
Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago			`}`

			`return converters[codepage] = handle;`
Parse custom message and permission entries. 15 years ago			`}`

Add support for codepages without aliases in iconv 13 years ago			`//! Fallback conversion that will at least work for ASCII characters`
			`static void to_utf8_fallback(const std::string & from, std::string & to,`
			`boost::uint32_t codepage) {`

			`size_t skip = get_encoding_size(codepage);`

			`to.clear();`
			`to.reserve(ceildiv(from.size(), skip));`

			`for(size_t i = 0; i < from.size(); i += skip) {`
			`if((unsigned char)from[i] <= 127) {`
			`// copy ASCII characters`
			`to.push_back(from[i]);`
			`} else {`
			`// replace everything else with underscores`
			`to.push_back(replacement_char);`
			`}`
			`}`
			`}`

			`} // anonymous namespace`
Parse custom message and permission entries. 15 years ago
Cleanup utils. 15 years ago			`void binary_string::load(std::istream & is, std::string & target) {`
The header is loaded. 15 years ago
Fix MSVC 9 build 13 years ago			`boost::int32_t length = load_number<boost::int32_t>(is);`
Allocate strings in chunks when loading. 15 years ago			`if(is.fail()) {`
The header is loaded. 15 years ago			`return;`
			`}`

Allocate strings in chunks when loading. 15 years ago			`target.clear();`

			`while(length) {`
			`char buffer[10 * 1024];`
Fix MSVC 9 build 13 years ago			`boost::int32_t buf_size = std::min(length, boost::int32_t(sizeof(buffer)));`
Allocate strings in chunks when loading. 15 years ago			`is.read(buffer, buf_size);`
			`target.append(buffer, size_t(buf_size));`
			`length -= buf_size;`
			`}`
The header is loaded. 15 years ago			`}`

Fix MSVC 9 build 13 years ago			`void encoded_string::load(std::istream & is, std::string & target, boost::uint32_t codepage) {`
Parse custom message and permission entries. 15 years ago
			`std::string temp;`
Cleanup utils. 15 years ago			`binary_string::load(is, temp);`
Parse custom message and permission entries. 15 years ago
Cleanup utils. 15 years ago			`to_utf8(temp, target, codepage);`
Parse custom message and permission entries. 15 years ago			`}`

Fix MSVC 9 build 13 years ago			`void to_utf8(const std::string & from, std::string & to, boost::uint32_t codepage) {`
Parse custom message and permission entries. 15 years ago
Add support for codepages without aliases in iconv 13 years ago			`if(codepage == cp_utf8 \|\| codepage == cp_ascii) {`
Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago			`// copy UTF-8 directly`
			`to = from;`
			`return;`
			`}`

Cleanup utils. 15 years ago			`iconv_t converter = get_converter(codepage);`
Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago			`if(converter == iconv_t(-1)) {`
			`to_utf8_fallback(from, to, codepage);`
			`return;`
			`}`

The header is loaded. 15 years ago
Fix compile error with some iconv implementations Some iconv implementations declare the second parameter of iconv() as const char (FreeBSD), others as char (Linux, OS X). This commit adds a workaround so that innoextract will compile with both variants. 13 years ago			`/*`
			`* Some iconv implementations declare the second parameter of iconv() as`
			`* const char , others as char .`
			`* Use this little hack to compile with both variants.`
			`*/`
			`struct inbuf_ {`
			`const char * buf;`
[style] Remove redundant semicolons 13 years ago			`explicit inbuf_(const char * data) : buf(data) { }`
			`operator const char **() { return &buf; }`
			`operator char () { return const_cast<char >(&buf); }`
Fix compile error with some iconv implementations Some iconv implementations declare the second parameter of iconv() as const char (FreeBSD), others as char (Linux, OS X). This commit adds a workaround so that innoextract will compile with both variants. 13 years ago			`} inbuf(from.data());`

The header is loaded. 15 years ago			`size_t insize = from.size();`

			`size_t outbase = 0;`

			`if(!insize) {`
			`to.clear();`
			`return;`
			`}`

			`iconv(converter, NULL, NULL, NULL, NULL);`

Add support for codepages without aliases in iconv 13 years ago			`size_t skip = get_encoding_size(codepage);`
Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago
			`bool warn = false;`

The header is loaded. 15 years ago			`while(insize) {`

Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago			`to.resize(outbase + ceildiv(insize, skip) + 4);`
The header is loaded. 15 years ago
			`char * outbuf = &to[0] + outbase;`
			`size_t outsize = to.size() - outbase;`

Fix compile error with some iconv implementations Some iconv implementations declare the second parameter of iconv() as const char (FreeBSD), others as char (Linux, OS X). This commit adds a workaround so that innoextract will compile with both variants. 13 years ago			`size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize);`
Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago			`if(ret == size_t(-1)) {`
			`if(errno == E2BIG) {`
			`// not enough output space - we'll allocate more in the next loop`
			`} else if(/errno == EILSEQ &&/ insize >= 2) {`
			`// invalid byte (sequence) - add a replacement char nd try the next byte`
			`if(outsize == 0) {`
			`to.push_back(replacement_char);`
			`} else {`
			`*outbuf = replacement_char;`
			`outsize--;`
			`}`
			`inbuf.buf += skip;`
			`insize -= skip;`
			`warn = true;`
			`} else {`
			`// something else went wrong - return what we have so far`
			`insize = 0;`
			`warn = true;`
			`}`
The header is loaded. 15 years ago			`}`

			`outbase = to.size() - outsize;`
			`}`

Improve iconv error handling - Change conversion errors to warnings - Only print one warning if iconv_open() fails - Add a fall-back converter - Replace bad code sequences and try to resume on the next byte 13 years ago			`if(warn) {`
			`log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8";`
			`}`

Implement reading language entries. 15 years ago			`to.resize(outbase);`
The header is loaded. 15 years ago			`}`