I don't know exactly what your code is doing. For comparison, here's my UTF-8 decoder (it decodes a single code point):
/*
 * Decode a single UTF-8 encoded code point from the start of `bytes`.
 *
 * bytes            - at least 4 readable bytes (the `static 4` contract).
 *                    No byte past index 3 is ever read.
 * out_num_consumed - receives how many bytes the caller should skip: the
 *                    full sequence length on success, or, on failure, the
 *                    number of bytes that precede the offending byte
 *                    (always at least 1, so the caller makes progress).
 *
 * Returns the decoded code point, or 0 when the input is malformed:
 *   - the lead byte is 0xFF, or announces more than 4 bytes (0xF8..0xFE),
 *   - the lead byte is itself a continuation byte (0x80..0xBF),
 *   - a byte that should be a continuation byte (10xxxxxx) is not.
 * A NUL byte also decodes to 0, so the return value alone cannot tell
 * "malformed" apart from U+0000.
 *
 * Overlong encodings, surrogates and values above U+10FFFF are not rejected.
 */
static UnicodeCodepoint utf8_decode(u8 const bytes[static 4], u8 *out_num_consumed) {
    u8 const flipped = (u8)~bytes[0];
    if (flipped == 0) {
        // Lead byte is 0xFF. __builtin_clz is undefined for 0, and 0xFF is
        // never valid in UTF-8 anyway.
        *out_num_consumed = 1;
        return 0;
    }

    // `flipped` is widened to unsigned int for the builtin, which adds a
    // multiple of 8 leading zero bits (24 with a 32-bit int); masking with
    // 0x07 removes them and leaves the number of leading 1 bits of bytes[0].
    u8 const num_ones = __builtin_clz(flipped) & 0x07;

    if (num_ones == 1 || num_ones > 4) {
        // One leading 1 bit: a continuation byte cannot start a sequence.
        // More than four: the sequence would extend past the 4 bytes the
        // caller guarantees, and such lead bytes are not valid UTF-8.
        *out_num_consumed = 1;
        return 0;
    }

    // Zero leading ones is a plain single-byte (ASCII) character.
    u8 const num_bytes_total = num_ones == 0 ? 1 : num_ones;

    // The lead byte carries its payload below the run of 1 bits and the
    // 0 bit that terminates it.
    u8 const main_byte_shift = num_ones + 1;
    UnicodeCodepoint value = bytes[0] & (0xFF >> main_byte_shift);

    for (u8 i = 1; i < num_bytes_total; ++i) {
        if (bytes[i] >> 6 != 2) {
            // Not a valid continuation byte; report the bytes before it so
            // the caller can resume decoding at this byte.
            *out_num_consumed = i;
            return 0;
        }
        // Each continuation byte contributes its low 6 bits.
        value = (value << 6) | (bytes[i] & 0x3F);
    }

    *out_num_consumed = num_bytes_total;
    return value;
}