Browse Source
Switch from the branchless UTF-8 decoder to a state-machine one. This allows us to avoid copying the string on every `DrawString` call. (pull/3415/head)
10 changed files with 173 additions and 120 deletions
@ -0,0 +1,3 @@
|
||||
# Header-only target for the UTF-8 decoder header: an INTERFACE library has no
# sources of its own and only adds this directory to the include path of the
# targets that link against it.
add_library(hoehrmann_utf8 INTERFACE) |
||||
|
||||
target_include_directories(hoehrmann_utf8 INTERFACE ${CMAKE_CURRENT_LIST_DIR}) |
||||
@ -0,0 +1,61 @@
|
||||
/* Adapted from: https://github.com/hoehrmann/utf-8-misc/blob/449221e7a693a9c7b8938721cd4244eed4ca9320/utf8_branch.h */ |
||||
|
||||
/*-
|
||||
* Copyright (c) 2014 Taylor R Campbell |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions |
||||
* are met: |
||||
* 1. Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* 2. Redistributions in binary form must reproduce the above copyright |
||||
* notice, this list of conditions and the following disclaimer in the |
||||
* documentation and/or other materials provided with the distribution. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' |
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE |
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||
* POSSIBILITY OF SUCH DAMAGE. |
||||
*/ |
||||
|
||||
#include <stdint.h> |
||||
|
||||
#define UTF8_ACCEPT 0 |
||||
#define UTF8_REJECT 0xf |
||||
|
||||
static const uint32_t utf8_classtab[0x10] = { |
||||
0x88888888UL,0x88888888UL,0x99999999UL,0x99999999UL, |
||||
0xaaaaaaaaUL,0xaaaaaaaaUL,0xaaaaaaaaUL,0xaaaaaaaaUL, |
||||
0x222222ffUL,0x22222222UL,0x22222222UL,0x22222222UL, |
||||
0x3333333bUL,0x33433333UL,0xfff5666cUL,0xffffffffUL, |
||||
}; |
||||
|
||||
static const uint32_t utf8_statetab[0x10] = { |
||||
0xfffffff0UL,0xffffffffUL,0xfffffff1UL,0xfffffff3UL, |
||||
0xfffffff4UL,0xfffffff7UL,0xfffffff6UL,0xffffffffUL, |
||||
0x33f11f0fUL,0xf3311f0fUL,0xf33f110fUL,0xfffffff2UL, |
||||
0xfffffff5UL,0xffffffffUL,0xffffffffUL,0xffffffffUL, |
||||
}; |
||||
|
||||
static inline uint8_t |
||||
utf8_decode_step(uint8_t state, uint8_t octet, uint32_t *cpp) |
||||
{ |
||||
const uint8_t reject = (state >> 3), nonascii = (octet >> 7); |
||||
const uint8_t klass = (!nonascii? 0 : |
||||
(0xf & (utf8_classtab[(octet >> 3) & 0xf] >> (4 * (octet & 7))))); |
||||
|
||||
*cpp = (state == UTF8_ACCEPT |
||||
? (octet & (0xffU >> klass)) |
||||
: ((octet & 0x3fU) | (*cpp << 6))); |
||||
|
||||
return (reject? 0xf : |
||||
(0xf & (utf8_statetab[klass] >> (4 * (state & 7))))); |
||||
} |
||||
@ -0,0 +1,27 @@
|
||||
#include "utils/utf8.hpp" |
||||
|
||||
#include <cstddef> |
||||
|
||||
#include <hoehrmann_utf8.h> |
||||
|
||||
namespace devilution { |
||||
|
||||
/**
 * Decodes the first code point from UTF-8 encoded `input`.
 *
 * `len` is always written:
 *  - on success, the number of bytes the code point occupies;
 *  - on an invalid sequence, the number of bytes examined up to and including
 *    the offending one;
 *  - when `input` ends in the middle of a sequence, `input.size()`;
 *  - for empty input, 0.
 *
 * @param input UTF-8 bytes to decode.
 * @param len   Out-parameter receiving the consumed length; must not be null.
 * @return The decoded code point, `Utf8DecodeError` for invalid or truncated
 *         input, or 0 for empty input.
 */
char32_t DecodeFirstUtf8CodePoint(string_view input, uint8_t *len)
{
	uint32_t codepoint = 0;
	uint8_t state = UTF8_ACCEPT;
	for (std::size_t i = 0; i < input.size(); ++i) {
		state = utf8_decode_step(state, static_cast<uint8_t>(input[i]), &codepoint);
		if (state == UTF8_ACCEPT || state == UTF8_REJECT) {
			// The state machine settles within four bytes, so the count fits in uint8_t.
			*len = static_cast<uint8_t>(i + 1);
			return state == UTF8_ACCEPT ? static_cast<char32_t>(codepoint) : Utf8DecodeError;
		}
	}

	// Input ran out before a code point was completed. Report what was
	// consumed so callers never read an unset length, and do not hand back a
	// partially accumulated code point as if it were valid.
	*len = static_cast<uint8_t>(input.size());
	return input.empty() ? U'\0' : Utf8DecodeError;
}
||||
|
||||
} // namespace devilution
|
||||
@ -1,85 +0,0 @@
|
||||
#pragma once |
||||
|
||||
#include <cstdint> |
||||
#include <string> |
||||
#include <utility> |
||||
|
||||
/* Branchless UTF-8 decoder
|
||||
* |
||||
* This is free and unencumbered software released into the public domain. |
||||
*/ |
||||
|
||||
/**
 * Decodes the next character from `buf` without branching on its length.
 *
 * Four bytes are always loaded from `buf`, whatever the real length of the
 * character, so the buffer _must_ carry at least three bytes of zero padding
 * after the end of the data stream.
 *
 * @param buf Start of the character to decode.
 * @param c   Receives the decoded character.
 * @param e   Receives the error flags; non-zero when the character is invalid
 *            (invalid byte sequence, non-canonical encoding, or a surrogate
 *            half).
 * @return Pointer to the next character. After an error this is a guess that
 *         depends on the particular error, but it is always at least one byte
 *         past `buf`.
 */
inline const char *utf8_decode(const char *buf, char32_t *c, int *e)
{
	// Sequence length keyed by the top five bits of the lead byte (0 = invalid lead).
	static const char lengths[] = {
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
	};
	// Per-length tables: lead-byte payload mask, smallest canonical value,
	// and the shifts that discard the unused bits of the value / error word.
	static const int masks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
	static const uint32_t mins[] = { 4194304, 0, 128, 2048, 65536 };
	static const int shiftc[] = { 0, 18, 12, 6, 0 };
	static const int shifte[] = { 0, 6, 4, 2, 0 };

	const auto *bytes = reinterpret_cast<const unsigned char *>(buf);
	const int len = lengths[bytes[0] >> 3];

	// The successor pointer is computed up front so the next iteration can
	// start early; neither Clang nor GCC performs this reordering itself.
	// `!len` makes an invalid lead byte still advance by one.
	const unsigned char *next = bytes + len + !len;

	// Treat the input as a four-byte character, then shift the surplus bits out.
	char32_t value = static_cast<char32_t>((bytes[0] & masks[len]) << 18)
	    | static_cast<char32_t>((bytes[1] & 0x3f) << 12)
	    | static_cast<char32_t>((bytes[2] & 0x3f) << 6)
	    | static_cast<char32_t>((bytes[3] & 0x3f) << 0);
	value >>= shiftc[len];

	// Collect every error condition into one word.
	int err = (value < mins[len]) << 6;  // non-canonical encoding
	err |= ((value >> 11) == 0x1b) << 7; // surrogate half?
	err |= (value > 0x10FFFF) << 8;      // out of range?
	err |= (bytes[1] & 0xc0) >> 2;
	err |= (bytes[2] & 0xc0) >> 4;
	err |= (bytes[3]) >> 6;
	err ^= 0x2a; // top two bits of each tail byte correct?
	err >>= shifte[len];

	*c = value;
	*e = err;
	return reinterpret_cast<const char *>(next);
}
||||
|
||||
inline int FindLastUtf8Symbols(const char *text) |
||||
{ |
||||
std::string textBuffer(text); |
||||
textBuffer.resize(textBuffer.size() + 4); // Buffer must be padded before calling utf8_decode()
|
||||
const char *textData = textBuffer.data(); |
||||
const char *previousPosition = textData; |
||||
|
||||
char32_t next; |
||||
int error; |
||||
for (; *textData != '\0'; previousPosition = textData) { |
||||
textData = utf8_decode(textData, &next, &error); |
||||
if (*textData == '\0') |
||||
return previousPosition - textBuffer.data(); |
||||
} |
||||
|
||||
return 0; |
||||
} |
||||
@ -0,0 +1,56 @@
|
||||
#pragma once |
||||
|
||||
#include <cstdint> |
||||
#include <string> |
||||
#include <utility> |
||||
|
||||
#include "utils/stdcompat/string_view.hpp" |
||||
|
||||
namespace devilution { |
||||
|
||||
/** Sentinel returned when decoding fails. 0xD83F is a UTF-16 surrogate value, which well-formed UTF-8 never encodes. */
constexpr char32_t Utf8DecodeError = 0xD83F; |
||||
|
||||
/**
|
||||
* Decodes the first code point from UTF8-encoded input. |
||||
* |
||||
* Sets `len` to the length of the code point in bytes. |
||||
* Returns `Utf8DecodeError` on error. |
||||
*/ |
||||
// `len` must not be null: the definition writes through it without a check.
char32_t DecodeFirstUtf8CodePoint(string_view input, uint8_t *len); |
||||
|
||||
/**
|
||||
* Decodes and removes the first code point from UTF8-encoded input. |
||||
*/ |
||||
inline char32_t ConsumeFirstUtf8CodePoint(string_view *input) |
||||
{ |
||||
uint8_t len; |
||||
const char32_t result = DecodeFirstUtf8CodePoint(*input, &len); |
||||
input->remove_prefix(len); |
||||
return result; |
||||
} |
||||
|
||||
/**
 * Tells whether `x` is a trailing byte of a UTF-8 code point encoding.
 *
 * A trailing byte is any byte of the encoding other than the heading byte.
 */
inline bool IsTrailUtf8CodeUnit(char x)
{
	// Trailing bytes carry the bit pattern 10xxxxxx, i.e. 0x80..0xBF.
	return (static_cast<unsigned char>(x) & 0xC0) == 0x80;
}
||||
|
||||
/**
|
||||
* Returns the start byte index of the last code point in a UTF-8 string. |
||||
*/ |
||||
inline std::size_t FindLastUtf8Symbols(string_view input) |
||||
{ |
||||
if (input.empty()) |
||||
return 0; |
||||
|
||||
std::size_t pos = input.size() - 1; |
||||
while (pos > 0 && IsTrailUtf8CodeUnit(input[pos])) |
||||
--pos; |
||||
return pos; |
||||
} |
||||
|
||||
} // namespace devilution
|
||||
Loading…
Reference in new issue