diff --git a/3rdParty/hoehrmann_utf8/CMakeLists.txt b/3rdParty/hoehrmann_utf8/CMakeLists.txt
new file mode 100644
index 000000000..b17b98472
--- /dev/null
+++ b/3rdParty/hoehrmann_utf8/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_library(hoehrmann_utf8 INTERFACE)
+
+target_include_directories(hoehrmann_utf8 INTERFACE ${CMAKE_CURRENT_LIST_DIR})
diff --git a/3rdParty/hoehrmann_utf8/hoehrmann_utf8.h b/3rdParty/hoehrmann_utf8/hoehrmann_utf8.h
new file mode 100644
index 000000000..bc914c152
--- /dev/null
+++ b/3rdParty/hoehrmann_utf8/hoehrmann_utf8.h
@@ -0,0 +1,61 @@
+/* Adapted from: https://github.com/hoehrmann/utf-8-misc/blob/449221e7a693a9c7b8938721cd4244eed4ca9320/utf8_branch.h */
+
+/*-
+ * Copyright (c) 2014 Taylor R Campbell
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 0xf
+
+static const uint32_t utf8_classtab[0x10] = {
+  0x88888888UL,0x88888888UL,0x99999999UL,0x99999999UL,
+  0xaaaaaaaaUL,0xaaaaaaaaUL,0xaaaaaaaaUL,0xaaaaaaaaUL,
+  0x222222ffUL,0x22222222UL,0x22222222UL,0x22222222UL,
+  0x3333333bUL,0x33433333UL,0xfff5666cUL,0xffffffffUL,
+};
+
+static const uint32_t utf8_statetab[0x10] = {
+  0xfffffff0UL,0xffffffffUL,0xfffffff1UL,0xfffffff3UL,
+  0xfffffff4UL,0xfffffff7UL,0xfffffff6UL,0xffffffffUL,
+  0x33f11f0fUL,0xf3311f0fUL,0xf33f110fUL,0xfffffff2UL,
+  0xfffffff5UL,0xffffffffUL,0xffffffffUL,0xffffffffUL,
+};
+
+static inline uint8_t
+utf8_decode_step(uint8_t state, uint8_t octet, uint32_t *cpp)
+{
+  const uint8_t reject = (state >> 3), nonascii = (octet >> 7);
+  const uint8_t klass = (!nonascii? 0 :
+      (0xf & (utf8_classtab[(octet >> 3) & 0xf] >> (4 * (octet & 7)))));
+
+  *cpp = (state == UTF8_ACCEPT
+      ? (octet & (0xffU >> klass))
+      : ((octet & 0x3fU) | (*cpp << 6)));
+
+  return (reject? 0xf :
+      (0xf & (utf8_statetab[klass] >> (4 * (state & 7)))));
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 64cb3570c..21b524478 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -355,6 +355,8 @@ add_subdirectory(3rdParty/simpleini)
 
 add_subdirectory(3rdParty/libmpq)
 
+add_subdirectory(3rdParty/hoehrmann_utf8)
+
 add_library(PKWare STATIC
   3rdParty/PKWare/explode.cpp
   3rdParty/PKWare/implode.cpp)
@@ -466,6 +468,7 @@ set(libdevilutionx_SRCS
   Source/utils/sdl_bilinear_scale.cpp
   Source/utils/sdl_rwops_file_wrapper.cpp
   Source/utils/sdl_thread.cpp
+  Source/utils/utf8.cpp
   Source/DiabloUI/art.cpp
   Source/DiabloUI/art_draw.cpp
   Source/DiabloUI/button.cpp
@@ -861,7 +864,8 @@ target_link_libraries(libdevilutionx PUBLIC
   PKWare
   libmpq
   smacker
-  simpleini)
+  simpleini
+  hoehrmann_utf8)
 
 if(WIN32)
   target_link_libraries(libdevilutionx PUBLIC find_steam_game)
diff --git a/Source/DiabloUI/diabloui.cpp b/Source/DiabloUI/diabloui.cpp
index bf35d27d9..5b1237b75 100644
--- a/Source/DiabloUI/diabloui.cpp
+++ b/Source/DiabloUI/diabloui.cpp
@@ -24,7 +24,7 @@
 #include "utils/sdl_wrap.h"
 #include "utils/stubs.h"
 #include "utils/language.h"
-#include "utils/utf8.h"
+#include "utils/utf8.hpp"
 
 #ifdef __SWITCH__
 // for virtual keyboard on Switch
diff --git a/Source/control.cpp b/Source/control.cpp
index 5c5884055..c3dc5326b 100644
--- a/Source/control.cpp
+++ b/Source/control.cpp
@@ -38,7 +38,7 @@
 #include "utils/language.h"
 #include "utils/sdl_geometry.h"
 #include "utils/stdcompat/optional.hpp"
-#include "utils/utf8.h"
+#include "utils/utf8.hpp"
 #include "options.h"
 
 #ifdef _DEBUG
diff --git a/Source/engine/render/text_render.cpp b/Source/engine/render/text_render.cpp
index 85b51e778..aa39edb95 100644
--- a/Source/engine/render/text_render.cpp
+++ b/Source/engine/render/text_render.cpp
@@ -20,7 +20,7 @@
 #include "palette.h"
 #include "utils/display.h"
 #include "utils/sdl_compat.h"
-#include "utils/utf8.h"
+#include "utils/utf8.hpp"
 
 namespace devilution {
 
@@ -196,20 +196,13 @@ int GetLineWidth(string_view text, GameFontTables size, int spacing, int *charac
 {
 	int lineWidth = 0;
 
-	std::string textBuffer;
-	textBuffer.reserve(textBuffer.size() + 3); // Buffer must be padded before calling utf8_decode()
-	textBuffer.append(text.data(), text.size());
-	textBuffer.resize(textBuffer.size() + 3);
-	const char *textData = textBuffer.data();
-
 	uint32_t codepoints = 0;
 	uint32_t currentUnicodeRow = 0;
 	std::array<uint8_t, 256> *kerning = nullptr;
 	char32_t next;
-	int error;
-	while (*textData != '\0') {
-		textData = utf8_decode(textData, &next, &error);
-		if (error)
+	while (!text.empty()) {
+		next = ConsumeFirstUtf8CodePoint(&text);
+		if (next == Utf8DecodeError)
 			break;
 		if (next == ZWSP)
 			continue;
@@ -249,13 +242,11 @@ std::string WordWrapString(string_view text, size_t width, GameFontTables size,
 	int lastBreakableLen;
 	char32_t lastBreakableCodePoint;
 
-	std::string input;
+	std::string input { text };
 	std::string output;
-	input.reserve(input.size() + 3); // Buffer must be padded before calling utf8_decode()
-	input.append(text.data(), text.size());
-	input.resize(input.size() + 3);
 	output.reserve(text.size());
 	const char *begin = input.data();
+	const char *end = input.data() + input.size();
 	const char *cur = begin;
 	const char *processedEnd = cur;
 
@@ -263,10 +254,11 @@ std::string WordWrapString(string_view text, size_t width, GameFontTables size,
 	size_t lineWidth = 0;
 	std::array<uint8_t, 256> *kerning = nullptr;
 	char32_t next;
-	int error;
-	while (*cur != '\0') {
-		cur = utf8_decode(cur, &next, &error);
-		if (error != 0)
+	while (cur != end && *cur != '\0') {
+		uint8_t codepointLen = 0;
+		next = DecodeFirstUtf8CodePoint(string_view(cur, static_cast<size_t>(end - cur)), &codepointLen); // Explicit length: no strlen() per code point
+		cur += codepointLen;
+		if (next == Utf8DecodeError)
 			break;
 
 		if (next == U'\n') { // Existing line break, scan next line
@@ -361,17 +353,13 @@ uint32_t DrawString(const Surface &out, string_view text, const Rectangle &rect,
 	Art *font = nullptr;
 	std::array<uint8_t, 256> *kerning = nullptr;
 
-	std::string textBuffer(text);
-	textBuffer.resize(textBuffer.size() + 4); // Buffer must be padded before calling utf8_decode()
-	const char *textData = textBuffer.data();
-	const char *previousPosition = textData;
-
 	char32_t next;
 	uint32_t currentUnicodeRow = 0;
-	int error;
-	for (; *textData != '\0'; previousPosition = textData) {
-		textData = utf8_decode(textData, &next, &error);
-		if (error)
+	string_view remaining = text;
+	const char *previousPosition = text.data(); // Start of the first code point that was not fully processed
+	for (; !remaining.empty() && remaining[0] != '\0'; previousPosition = remaining.data()) {
+		next = ConsumeFirstUtf8CodePoint(&remaining);
+		if (next == Utf8DecodeError)
 			break;
 		if (next == ZWSP)
 			continue;
@@ -392,8 +380,8 @@ uint32_t DrawString(const Surface &out, string_view text, const Rectangle &rect,
 
 			if (HasAnyOf(flags, (UiFlags::AlignCenter | UiFlags::AlignRight))) {
 				lineWidth = (*kerning)[frame];
-				if (*textData != '\0')
-					lineWidth += spacing + GetLineWidth(textData, size, spacing);
+				if (!remaining.empty() && remaining[0] != '\0')
+					lineWidth += spacing + GetLineWidth(remaining, size, spacing); // Measure only the text after the current code point
 			}
 
 			if (HasAnyOf(flags, UiFlags::AlignCenter))
@@ -415,7 +403,7 @@ uint32_t DrawString(const Surface &out, string_view text, const Rectangle &rect,
 		DrawArt(out, characterPosition, LoadFont(size, color, 0), '|');
 	}
 
-	return previousPosition - textBuffer.data();
+	return static_cast<uint32_t>(previousPosition - text.data()); // Number of bytes of `text` that were processed
 }
 
 uint8_t PentSpn2Spin()
diff --git a/Source/miniwin/misc_msg.cpp b/Source/miniwin/misc_msg.cpp
index 09b829230..422f50c4e 100644
--- a/Source/miniwin/misc_msg.cpp
+++ b/Source/miniwin/misc_msg.cpp
@@ -27,7 +27,7 @@
 #include "utils/log.hpp"
 #include "utils/sdl_compat.h"
 #include "utils/stubs.h"
-#include "utils/utf8.h"
+#include "utils/utf8.hpp"
 
 #ifdef __vita__
 #include "platform/vita/touch.h"
diff --git a/Source/utils/utf8.cpp b/Source/utils/utf8.cpp
new file mode 100644
index 000000000..a1c117865
--- /dev/null
+++ b/Source/utils/utf8.cpp
@@ -0,0 +1,30 @@
+#include "utils/utf8.hpp"
+
+#include <cstddef>
+
+#include <hoehrmann_utf8.h>
+
+namespace devilution {
+
+char32_t DecodeFirstUtf8CodePoint(string_view input, uint8_t *len)
+{
+	uint32_t codepoint = 0;
+	uint8_t state = UTF8_ACCEPT;
+	for (std::size_t i = 0; i < input.size(); ++i) {
+		state = utf8_decode_step(state, static_cast<uint8_t>(input[i]), &codepoint);
+		if (state == UTF8_ACCEPT) {
+			*len = static_cast<uint8_t>(i + 1);
+			return codepoint;
+		}
+		if (state == UTF8_REJECT) {
+			*len = static_cast<uint8_t>(i + 1);
+			return Utf8DecodeError;
+		}
+	}
+	// The input is empty or ends in the middle of a multi-byte sequence.
+	// Always set `len` so that callers never read an indeterminate value.
+	*len = static_cast<uint8_t>(input.size());
+	return Utf8DecodeError;
+}
+
+} // namespace devilution
diff --git a/Source/utils/utf8.h b/Source/utils/utf8.h
deleted file mode 100644
index 2e98153b9..000000000
--- a/Source/utils/utf8.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <string>
-
-/* Branchless UTF-8 decoder
- *
- * This is free and unencumbered software released into the public domain.
- */
-
-/* Decode the next character, C, from BUF, reporting errors in E.
- *
- * Since this is a branchless decoder, four bytes will be read from the
- * buffer regardless of the actual length of the next character. This
- * means the buffer _must_ have at least three bytes of zero padding
- * following the end of the data stream.
- *
- * Errors are reported in E, which will be non-zero if the parsed
- * character was somehow invalid: invalid byte sequence, non-canonical
- * encoding, or a surrogate half.
- *
- * The function returns a pointer to the next character. When an error
- * occurs, this pointer will be a guess that depends on the particular
- * error, but it will always advance at least one byte.
- */
-inline const char *utf8_decode(const char *buf, char32_t *c, int *e)
-{
-	static const char lengths[] = {
-		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
-	};
-	static const int masks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
-	static const uint32_t mins[] = { 4194304, 0, 128, 2048, 65536 };
-	static const int shiftc[] = { 0, 18, 12, 6, 0 };
-	static const int shifte[] = { 0, 6, 4, 2, 0 };
-
-	auto s = reinterpret_cast<const unsigned char *>(buf);
-	int len = lengths[s[0] >> 3];
-
-	/* Compute the pointer to the next character early so that the next
-	 * iteration can start working on the next character. Neither Clang
-	 * nor GCC figure out this reordering on their own.
-	 */
-	const unsigned char *next = s + len + !len;
-
-	/* Assume a four-byte character and load four bytes. Unused bits are
-	 * shifted out.
-	 */
-	*c = static_cast<uint32_t>((s[0] & masks[len]) << 18);
-	*c |= static_cast<uint32_t>((s[1] & 0x3f) << 12);
-	*c |= static_cast<uint32_t>((s[2] & 0x3f) << 6);
-	*c |= static_cast<uint32_t>((s[3] & 0x3f) << 0);
-	*c >>= shiftc[len];
-
-	/* Accumulate the various error conditions. */
-	*e = (*c < mins[len]) << 6;      // non-canonical encoding
-	*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
-	*e |= (*c > 0x10FFFF) << 8;      // out of range?
-	*e |= (s[1] & 0xc0) >> 2;
-	*e |= (s[2] & 0xc0) >> 4;
-	*e |= (s[3]) >> 6;
-	*e ^= 0x2a; // top two bits of each tail byte correct?
-	*e >>= shifte[len];
-
-	return reinterpret_cast<const char *>(next);
-}
-
-inline int FindLastUtf8Symbols(const char *text)
-{
-	std::string textBuffer(text);
-	textBuffer.resize(textBuffer.size() + 4); // Buffer must be padded before calling utf8_decode()
-	const char *textData = textBuffer.data();
-	const char *previousPosition = textData;
-
-	char32_t next;
-	int error;
-	for (; *textData != '\0'; previousPosition = textData) {
-		textData = utf8_decode(textData, &next, &error);
-		if (*textData == '\0')
-			return previousPosition - textBuffer.data();
-	}
-
-	return 0;
-}
diff --git a/Source/utils/utf8.hpp b/Source/utils/utf8.hpp
new file mode 100644
index 000000000..f00cbbb28
--- /dev/null
+++ b/Source/utils/utf8.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "utils/stdcompat/string_view.hpp"
+
+namespace devilution {
+
+/** Sentinel returned by the decoding functions on error; lies in the surrogate range, which well-formed UTF-8 never encodes. */
+constexpr char32_t Utf8DecodeError = 0xD83F;
+
+/**
+ * Decodes the first code point from UTF8-encoded input.
+ *
+ * Sets `len` to the number of bytes examined (never more than `input.size()`).
+ * Returns `Utf8DecodeError` if the input is empty, malformed, or ends in the
+ * middle of a multi-byte sequence.
+ */
+char32_t DecodeFirstUtf8CodePoint(string_view input, uint8_t *len);
+
+/**
+ * Decodes and removes the first code point from UTF8-encoded input.
+ *
+ * Returns `Utf8DecodeError` under the same conditions as `DecodeFirstUtf8CodePoint`.
+ */
+inline char32_t ConsumeFirstUtf8CodePoint(string_view *input)
+{
+	uint8_t len = 0;
+	const char32_t result = DecodeFirstUtf8CodePoint(*input, &len);
+	input->remove_prefix(len);
+	return result;
+}
+
+/**
+ * Returns true if this is a trailing byte in a UTF-8 code point encoding.
+ *
+ * A trailing byte is any byte that is not the heading byte.
+ */
+inline bool IsTrailUtf8CodeUnit(char x)
+{
+	return static_cast<signed char>(x) < -0x40; // Matches bytes 0x80..0xBF, i.e. 0b10xxxxxx
+}
+
+/**
+ * Returns the start byte index of the last code point in a UTF-8 string.
+ */
+inline std::size_t FindLastUtf8Symbols(string_view input)
+{
+	if (input.empty())
+		return 0;
+
+	std::size_t pos = input.size() - 1;
+	while (pos > 0 && IsTrailUtf8CodeUnit(input[pos]))
+		--pos;
+	return pos;
+}
+
+} // namespace devilution