Improve iconv error handling

- Change conversion errors to warnings
- Only print one warning if iconv_open() fails
- Add a fall-back converter
- Replace bad code sequences and try to resume on the next byte
13 years ago · ebb2ec3ac8
1 changed files with 71 additions and 7 deletions
--- a/src/util/load.cpp
+++ b/src/util/load.cpp
@ -32,6 +32,10 @@

 namespace {

+static const boost::uint32_t cp_utf8 = 65001;
+static const boost::uint32_t cp_utf16 = 1200;
+static const char replacement_char = '_';
+
 std::map<boost::uint32_t, iconv_t> converters;

 iconv_t get_converter(boost::uint32_t codepage) {
@ -43,7 +47,7 @@ iconv_t get_converter(boost::uint32_t codepage) {
 	}
 	
 	std::ostringstream oss;
-	if(codepage == 1200) {
+	if(codepage == cp_utf16) {
 		// iconv's behavior for "UTF-16" is platform-dependent if there is no BOM.
 		// There never is any BOM in Inno Setup files and it's always little-endian,
 		// so we specify the exact encoding.
@ -52,7 +56,13 @@ iconv_t get_converter(boost::uint32_t codepage) {
 		oss << "CP" << codepage;
 	}
 	
-	return converters[codepage] = iconv_open("UTF-8", oss.str().c_str());
+	iconv_t handle = iconv_open("UTF-8", oss.str().c_str());
+	
+	if(handle == iconv_t(-1)) {
+		log_warning << "could not get " << oss.str() << " -> UTF-8 converter";
+	}
+	
+	return converters[codepage] = handle;
 }

 };
@ -83,9 +93,40 @@ void encoded_string::load(std::istream & is, std::string & target, boost::uint32
 	to_utf8(temp, target, codepage);
 }

+//! Fallback conversion that will at least work for ASCII characters.
+//!
+//! Each input code unit (two bytes, little-endian, when \a codepage is
+//! cp_utf16; one byte otherwise) produces exactly one output byte: the
+//! character itself if it is ASCII, or replacement_char for anything else.
+//!
+//! \param from     raw bytes in the source encoding
+//! \param to       receives the result; any previous content is discarded
+//! \param codepage Windows codepage number describing the encoding of \a from
+static void to_utf8_fallback(const std::string & from, std::string & to,
+                             boost::uint32_t codepage) {
+	
+	size_t skip = ((codepage == cp_utf16) ? 2 : 1);
+	
+	to.clear();
+	to.reserve(ceildiv(from.size(), skip));
+	
+	for(size_t i = 0; i < from.size(); i += skip) {
+		
+		bool ascii = ((unsigned char)from[i] <= 127);
+		if(skip == 2) {
+			// For UTF-16 the low byte alone is not enough: U+0141 is stored as
+			// 0x41 0x01 and must not be mistaken for 'A'. The high byte has to be
+			// present (no truncated trailing unit) and zero.
+			ascii = ascii && (i + 1 < from.size()) && (from[i + 1] == 0);
+		}
+		
+		if(ascii) {
+			// copy ASCII characters
+			to.push_back(from[i]);
+		} else {
+			// replace everything else with underscores
+			to.push_back(replacement_char);
+		}
+	}
+}
+
 void to_utf8(const std::string & from, std::string & to, boost::uint32_t codepage) {
 	
+	if(codepage == cp_utf8) {
+		// copy UTF-8 directly
+		to = from;
+		return;
+	}
+	
 	iconv_t converter = get_converter(codepage);
+	if(converter == iconv_t(-1)) {
+		to_utf8_fallback(from, to, codepage);
+		return;
+	}
+	
 	
 	/*
 	 * Some iconv implementations declare the second parameter of iconv() as
@ -110,22 +151,45 @@ void to_utf8(const std::string & from, std::string & to, boost::uint32_t codepag
 	
 	iconv(converter, NULL, NULL, NULL, NULL);
 	
+	size_t skip = ((codepage == cp_utf16) ? 2 : 1);
+	
+	bool warn = false;
+	
 	while(insize) {
 		
-		to.resize(outbase + insize + 4);
+		to.resize(outbase + ceildiv(insize, skip) + 4);
 		
 		char * outbuf = &to[0] + outbase;
 		size_t outsize = to.size() - outbase;
 		
 		size_t ret = iconv(converter, inbuf, &insize, &outbuf, &outsize);
-		if(ret == size_t(-1) && errno != E2BIG) {
-			log_error << "iconv error while converting from CP" << codepage << ": " << errno;
-			to.clear();
-			return;
+		if(ret == size_t(-1)) {
+			if(errno == E2BIG) {
+				// not enough output space - we'll allocate more in the next loop
+			} else if(/*errno == EILSEQ &&*/ insize >= 2) {
+				// invalid byte (sequence) - add a replacement char and try the next byte
+				if(outsize == 0) {
+					to.push_back(replacement_char);
+				} else {
+					*outbuf = replacement_char;
+					outsize--;
+				}
+				inbuf.buf += skip;
+				insize -= skip;
+				warn = true;
+			} else {
+				// something else went wrong - return what we have so far
+				insize = 0;
+				warn = true;
+			}
 		}
 		
 		outbase = to.size() - outsize;
 	}
 	
+	if(warn) {
+		log_warning << "unexpected data while converting from CP" << codepage << " to UTF-8";
+	}
+	
 	to.resize(outbase);
 }