diff --git a/Source/utils/utf8.cpp b/Source/utils/utf8.cpp index a25d6a71d..8bf2286ef 100644 --- a/Source/utils/utf8.cpp +++ b/Source/utils/utf8.cpp @@ -28,7 +28,7 @@ string_view TruncateUtf8(string_view str, std::size_t len) char32_t DecodeFirstUtf8CodePoint(string_view input, std::size_t *len) { uint32_t codepoint = 0; - uint32_t state = UTF8_ACCEPT; + uint8_t state = UTF8_ACCEPT; for (std::size_t i = 0; i < input.size(); ++i) { state = utf8_decode_step(state, static_cast<uint8_t>(input[i]), &codepoint); if (state == UTF8_ACCEPT) { diff --git a/Source/utils/utf8.hpp b/Source/utils/utf8.hpp index 9e6ffc35f..a422624fb 100644 --- a/Source/utils/utf8.hpp +++ b/Source/utils/utf8.hpp @@ -29,20 +29,6 @@ inline char32_t ConsumeFirstUtf8CodePoint(string_view *input) return result; } -/** - * Returns true if this is a byte that potentially starts a valid UTF-8 sequence. - * - * Well-formed UTF-8 sequences are described in table 3-7 of S3.9 of the Unicode Standard - * see: https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf#G7404 - * - * This is not an inverse of IsTrailUtf8CodeUnit. Byte values C0-C1, F5-FF are not valid anywhere in a UTF-8 sequence. - */ -constexpr bool IsLeadUtf8CodeUnit(char x) -{ - // single byte character || multibyte character leader - return (x >= '\x00' && x <= '\x7F') || (x >= '\xC2' && x <= '\xF4'); -} - /** * Returns true if the character is part of the Basic Latin set. 
* @@ -64,8 +50,6 @@ inline bool IsTrailUtf8CodeUnit(char x) { // The following is equivalent to a bitmask test (x & 0xC0) == 0x80 // On x86_64 architectures it ends up being one instruction shorter - // This invokes implementation defined behaviour on platforms where the underlying type of char is unsigned char - // until C++20 makes unsigned to signed conversion well defined return static_cast<signed char>(x) < static_cast<signed char>('\xC0'); } diff --git a/test/utf8_test.cpp b/test/utf8_test.cpp index 555d92a75..773fa5508 100644 --- a/test/utf8_test.cpp +++ b/test/utf8_test.cpp @@ -38,18 +38,15 @@ TEST(Utf8CodeUnits, ValidCodePoints) { // Working backwards on this loop to avoid triggering signed integer overflow on platforms where char has an // underlying type of signed char - for (char x = '\x7F'; x >= '\x00' && x <= '\x7F'; x--) { - EXPECT_TRUE(IsLeadUtf8CodeUnit(x)) << "Basic Latin and ASCII Control characters are lead code units"; + for (char x = '\x7F'; static_cast<signed char>(x) >= '\x00'; x--) { EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Basic Latin and ASCII Control characters are not trail code units"; } for (char x = '\x80'; x >= '\x80' && x <= '\xBF'; x++) { EXPECT_TRUE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0x80 to 0xBF are potentially valid trail code units"; - EXPECT_FALSE(IsLeadUtf8CodeUnit(x)) << "Trail code units are never valid lead code units"; } for (char x = '\xC2'; x >= '\xC2' && x <= '\xF4'; x++) { - EXPECT_TRUE(IsLeadUtf8CodeUnit(x)) << "Bytes in the range 0xC2 to 0xF4 are lead code units"; EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0xC2 to 0xF4 are never valid trail code units"; } } @@ -57,12 +54,10 @@ TEST(Utf8CodeUnits, ValidCodePoints) TEST(Utf8CodeUnits, InvalidCodePoints) { for (char x = '\xC0'; x >= '\xC0' && x <= '\xC1'; x++) { - EXPECT_FALSE(IsLeadUtf8CodeUnit(x)) << "Bytes in the range 0xC0 to 0xC1 are not lead code units"; EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0xC0 to oxC1 are not trail code units"; } for (char x = 
'\xF5'; x >= '\xF5' && x <= '\xFF'; x++) { - EXPECT_FALSE(IsLeadUtf8CodeUnit(x)) << "Bytes in the range 0xF5 to 0xFF are not lead code units"; EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0xF5 to 0xFF are not trail code units"; } }