/**
 * @brief Returns the number of code units for a code point starting at *src.
 *
 * `src` must not be empty: the byte at `*src` is read unconditionally.
 * Only this first byte is inspected; the trailing bytes are not validated.
 *
 * @param src Pointer to the first byte of a (possibly invalid) UTF-8 sequence.
 * @return 2, 3 or 4 if the first byte is a multi-byte start byte
 *         (0xC0..0xDF, 0xE0..0xEF, 0xF0..0xF7 respectively), and 1 for every
 *         other byte: ASCII, trail bytes (0x80..0xBF) and the invalid start
 *         bytes 0xF8..0xFF. Never returns 0.
 */
inline size_t Utf8CodePointLen(const char *src)
{
	// Packed lookup table: 32 entries of 2 bits each, indexed by the top
	// 5 bits of the first byte. Each entry stores `code point length - 1`.
	// Storing `length - 1` means the result is never 0, even for invalid
	// bytes (a 0 could lead to infinite loops in code that advances by the
	// returned length).
	// Generated with:
	// ruby -e 'p "0000000000000000000000001111223".reverse.to_i(4).to_s(16)'
	// The string has 31 digits, so entry 31 (bytes 0xF8..0xFF) is implicitly 0.
	constexpr unsigned long long LengthMinusOneTable = 0x3a55000000000000ULL;

	// Convert through `unsigned char`: plain `char` may be signed, and a
	// negative value would not produce a table index in the range 0..31.
	const unsigned tableIndex = static_cast<unsigned char>(*src) >> 3;

	return static_cast<size_t>((LengthMinusOneTable >> (2 * tableIndex)) & 0x3) + 1;
}