You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
181 lines
6.0 KiB
181 lines
6.0 KiB
/*
 * Copyright (C) 2011-2019 Daniel Scharrer
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the author(s) be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
|
|
|
/*!
 * \file
 *
 * Utility functions to convert strings to and from UTF-8.
 */
|
#ifndef INNOEXTRACT_UTIL_ENCODING_HPP |
|
#define INNOEXTRACT_UTIL_ENCODING_HPP |
|
|
|
#include <string> |
|
|
|
#include <boost/cstdint.hpp> |
|
|
|
namespace util { |
|
|
|
/*!
 * Windows codepage numbers known by name.
 *
 * Each enumerator's value is the numeric codepage identifier. The trailing comment, where
 * present, names the script or language family the codepage is used for.
 */
enum known_codepages {
	cp_dos708 = 708, // arabic
	cp_windows874 = 874, // thai
	cp_shift_jis = 932, // japanese
	cp_gbk = 936, // chinese
	cp_uhc = 949, // korean
	cp_big5 = 950, // chinese
	cp_big5_hkscs = 951, // chinese
	cp_utf16le = 1200,
	cp_utf16be = 1201,
	cp_windows1250 = 1250, // latin
	cp_windows1251 = 1251, // cyrillic
	cp_windows1252 = 1252, // latin
	cp_windows1253 = 1253, // greek
	cp_windows1254 = 1254, // turkish
	cp_windows1255 = 1255, // hebrew
	cp_windows1256 = 1256, // arabic
	cp_windows1257 = 1257, // baltic
	cp_windows1258 = 1258, // vietnamese
	cp_windows1270 = 1270, // sami
	cp_johab = 1361, // korean
	cp_macroman = 10000, // latin
	cp_macjapanese = 10001, // japanese
	cp_macchinese1 = 10002, // chinese
	cp_mackorean = 10003, // korean
	cp_macarabic = 10004, // arabic
	cp_machebrew = 10005, // hebrew
	cp_macgreek = 10006, // greek
	cp_maccyrillic = 10007, // cyrillic
	cp_macchinese2 = 10008, // chinese
	cp_macromania = 10010, // latin
	cp_macukraine = 10017, // cyrillic
	cp_macthai = 10021, // thai
	cp_macroman2 = 10029, // latin
	cp_maciceland = 10079, // latin
	cp_macturkish = 10081, // turkish
	cp_maccroatian = 10082, // latin
	cp_utf32le = 12000,
	cp_utf32be = 12001,
	cp_cns = 20000, // chinese
	cp_big5_eten = 20002, // chinese
	cp_ia5 = 20105, // latin
	cp_ia5_de = 20106, // latin
	cp_ia5_se2 = 20107, // latin
	cp_ia5_no2 = 20108, // latin
	cp_ascii = 20127, // latin
	cp_t61 = 20261, // latin
	cp_iso_6937 = 20269, // latin
	cp_ibm273 = 20273, // latin
	cp_ibm277 = 20277, // latin
	cp_ibm278 = 20278, // latin
	cp_ibm280 = 20280, // latin
	cp_ibm284 = 20284, // latin
	cp_ibm285 = 20285, // latin
	cp_ibm290 = 20290, // japanese
	cp_ibm297 = 20297, // latin
	cp_ibm420 = 20420, // arabic
	cp_ibm423 = 20423, // greek
	cp_ibm424 = 20424, // hebrew
	cp_ibm833 = 20833, // korean
	cp_ibm838 = 20838, // thai
	cp_koi8_r = 20866, // cyrillic
	cp_ibm871 = 20871, // latin
	cp_ibm880 = 20880, // cyrillic
	cp_ibm905 = 20905, // turkish
	cp_ibm924 = 20924, // latin
	cp_euc_jp_ms = 20932, // japanese
	cp_gb2312_80 = 20936, // chinese
	cp_wansung = 20949, // korean
	cp_ibm1025 = 21025, // cyrillic
	cp_koi8_u = 21866, // cyrillic
	cp_iso_8859_1 = 28591, // latin
	cp_iso_8859_2 = 28592, // latin
	cp_iso_8859_3 = 28593, // latin
	cp_iso_8859_4 = 28594, // latin
	cp_iso_8859_5 = 28595, // cyrillic
	cp_iso_8859_6 = 28596, // arabic
	cp_iso_8859_7 = 28597, // greek
	cp_iso_8859_8 = 28598, // hebrew
	cp_iso_8859_9 = 28599, // turkish
	cp_iso_8859_10 = 28600, // latin
	cp_iso_8859_11 = 28601, // thai
	cp_iso_8859_13 = 28603, // baltic
	cp_iso_8859_14 = 28604, // celtic
	cp_iso_8859_15 = 28605, // latin
	cp_europa3 = 29001, // latin
	cp_iso_8859_6i = 38596, // arabic
	cp_iso_8859_8i = 38598, // hebrew
	cp_iso_2022_jp = 50220, // japanese
	cp_iso_2022_jp2 = 50221, // japanese
	cp_iso_2022_jp3 = 50222, // japanese
	cp_iso_2022_kr = 50225, // korean
	cp_iso_2022_cn = 50227, // chinese
	cp_iso_2022_cn2 = 50229, // chinese
	cp_ibm930 = 50930, // japanese
	cp_ibm931 = 50931, // japanese
	cp_ibm933 = 50933, // korean
	cp_ibm935 = 50935, // chinese
	cp_ibm936 = 50936, // chinese
	cp_ibm937 = 50937, // chinese
	cp_ibm939 = 50939, // japanese
	cp_euc_jp = 51932, // japanese
	cp_euc_cn = 51936, // chinese
	cp_euc_kr = 51949, // korean
	cp_euc_tw = 51950, // chinese
	cp_gb2312_hz = 52936, // chinese
	cp_gb18030 = 54936, // chinese
	cp_utf7 = 65000,
	cp_utf8 = 65001
};
|
|
|
/*!
 * Integer type used to pass Windows codepage numbers (such as the \ref known_codepages
 * values) to the conversion functions declared below.
 */
typedef boost::uint32_t codepage_id;
|
|
|
/*!
 * Convert a possibly broken UTF-16 string to WTF-8, an extension of UTF-8.
 *
 * \param from The input string to convert.
 * \param to   The output for the converted string.
 */
void utf16le_to_wtf8(const std::string & from, std::string & to);
|
|
|
/*!
 * Convert WTF-8 to UTF-16 while preserving unpaired surrogates.
 *
 * \param from The input string to convert.
 * \param to   The output for the converted string.
 */
void wtf8_to_utf16le(const std::string & from, std::string & to);
|
|
|
/*! |
|
* Convert a string in place to UTF-8 from a specified encoding. |
|
* \param data The input string to convert. |
|
* \param codepage The Windows codepage number for the input string encoding. |
|
* |
|
* \note This function is not thread-safe. |
|
*/ |
|
void to_utf8(std::string & data, codepage_id codepage = cp_windows1252); |
|
|
|
/*! |
|
* Convert a string from UTF-8 to a specified encoding. |
|
* \param from The input string to convert. |
|
* \param to The output for the converted string. |
|
* \param codepage The Windows codepage number for the input string encoding. |
|
* |
|
* \note This function is not thread-safe. |
|
*/ |
|
void from_utf8(const std::string & from, std::string & to, codepage_id codepage = cp_windows1252); |
|
|
|
/*!
 * Get the name of the encoding identified by a codepage number.
 *
 * \param codepage The Windows codepage number to look up.
 *
 * \return The name as a string. NOTE(review): the name format and the result for codepages
 *         without a known name are determined by the implementation, which is not visible
 *         in this header - confirm there before relying on either.
 */
std::string encoding_name(codepage_id codepage);
|
|
|
} // namespace util |
|
|
|
#endif // INNOEXTRACT_UTIL_ENCODING_HPP
|
|
|