From 34b7d8526390acfb4209b732a98d36e2d2b04d44 Mon Sep 17 00:00:00 2001
From: ephphatha
Date: Sat, 18 Jun 2022 11:24:46 +1000
Subject: [PATCH] Add IsLeadUtf8CodeUnit to complement trail byte detection

---
 Source/utils/utf8.hpp | 14 ++++++++++++++
 test/utf8_test.cpp    |  5 +++++
 2 files changed, 19 insertions(+)

diff --git a/Source/utils/utf8.hpp b/Source/utils/utf8.hpp
index fca675ad3..399983b7f 100644
--- a/Source/utils/utf8.hpp
+++ b/Source/utils/utf8.hpp
@@ -29,6 +29,20 @@ inline char32_t ConsumeFirstUtf8CodePoint(string_view *input)
 	return result;
 }
 
+/**
+ * Returns true if this is a byte that potentially starts a valid UTF-8 sequence.
+ *
+ * Well-formed UTF-8 sequences are described in table 3-7 of S3.9 of the Unicode Standard
+ * see: https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf#G7404
+ *
+ * This is not an inverse of IsTrailUtf8CodeUnit. Byte values C0-C1, F5-FF are not valid anywhere in a UTF-8 sequence.
+ */
+constexpr bool IsLeadUtf8CodeUnit(char x)
+{
+	// single byte character || multibyte character leader
+	return (x >= '\x00' && x <= '\x7F') || (x >= '\xC2' && x <= '\xF4');
+}
+
 /**
  * Returns true if this is a trailing byte in a UTF-8 code point encoding.
  *
diff --git a/test/utf8_test.cpp b/test/utf8_test.cpp
index f7ce7e072..e618e6576 100644
--- a/test/utf8_test.cpp
+++ b/test/utf8_test.cpp
@@ -39,14 +39,17 @@ TEST(Utf8CodeUnits, ValidCodePoints)
 	// Working backwards on this loop to avoid triggering signed integer overflow on platforms where char has an
 	// underlying type of signed char
 	for (char x = '\x7F'; x >= '\x00' && x <= '\x7F'; x--) {
+		EXPECT_TRUE(IsLeadUtf8CodeUnit(x)) << "Basic Latin and ASCII Control characters are lead code units";
 		EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Basic Latin and ASCII Control characters are not trail code units";
 	}
 
 	for (char x = '\x80'; x >= '\x80' && x <= '\xBF'; x++) {
 		EXPECT_TRUE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0x80 to 0xBF are potentially valid trail code units";
+		EXPECT_FALSE(IsLeadUtf8CodeUnit(x)) << "Trail code units are never valid lead code units";
 	}
 
 	for (char x = '\xC2'; x >= '\xC2' && x <= '\xF4'; x++) {
+		EXPECT_TRUE(IsLeadUtf8CodeUnit(x)) << "Bytes in the range 0xC2 to 0xF4 are lead code units";
 		EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0xC2 to 0xF4 are never valid trail code units";
 	}
 }
@@ -54,10 +57,12 @@ TEST(Utf8CodeUnits, ValidCodePoints)
 TEST(Utf8CodeUnits, InvalidCodePoints)
 {
 	for (char x = '\xC0'; x >= '\xC0' && x <= '\xC1'; x++) {
+		EXPECT_FALSE(IsLeadUtf8CodeUnit(x)) << "Bytes in the range 0xC0 to 0xC1 are not lead code units";
 		EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0xC0 to oxC1 are not trail code units";
 	}
 
 	for (char x = '\xF5'; x >= '\xF5' && x <= '\xFF'; x++) {
+		EXPECT_FALSE(IsLeadUtf8CodeUnit(x)) << "Bytes in the range 0xF5 to 0xFF are not lead code units";
 		EXPECT_FALSE(IsTrailUtf8CodeUnit(x)) << "Bytes in the range 0xF5 to 0xFF are not trail code units";
 	}
 }