Saturday 30 May 2015

UTF-8/UTF-32 transcoding - Part 1

Well, it's been just short of two years since my previous post, so I thought I'd resurrect this blog.

Back then, I was working on a hobby project for text processing, with particular reference to UTF-8 encoding and decoding. I wrote a lot of exploratory code (in a C++ library called "tea") that was left hanging when I moved on to other things. I've decided to revisit that library and publish what I discovered whilst working on it.

Firstly, let's define some useful types. Note that all the code below is pseudo-code; I'll publish the full library source code when it's in a suitable state:

#include <cstddef>       // std::size_t
#include <cstdint>       // fixed-width integer types
#include <limits>        // std::numeric_limits
#include <type_traits>   // std::is_signed

namespace tea {
    namespace types {
        // Unsigned integers
        typedef uint8_t u8;
        typedef uint16_t u16;
        typedef uint32_t u32;
        typedef uint64_t u64;

        // Signed integers
        typedef int8_t s8;
        typedef int16_t s16;
        typedef int32_t s32;
        typedef int64_t s64;

        // Signed characters
        typedef signed char c8;
        typedef int16_t c16;
        typedef int32_t c32;

        // IEEE floating-point
        typedef float f32;
        typedef double f64;

        // Traits
        template<typename T>
        struct Traits {
            inline static T minimum(void) {
                return std::numeric_limits<T>::lowest();
            }
            inline static T maximum(void) {
                return std::numeric_limits<T>::max();
            }
            inline static T extreme(void) {
                return std::is_signed<T>::value ?
                       std::numeric_limits<T>::lowest() :
                       std::numeric_limits<T>::max();
            }
        };

        // Bit-sizes
        template<size_t BITS>
        struct BitSize;
        template<>
        struct BitSize<8> {
            typedef s8 Signed;
            typedef u8 Unsigned;
        };
        template<>
        struct BitSize<16> {
            typedef s16 Signed;
            typedef u16 Unsigned;
        };
        template<>
        struct BitSize<32> {
            typedef s32 Signed;
            typedef u32 Unsigned;
        };
        template<>
        struct BitSize<64> {
            typedef s64 Signed;
            typedef u64 Unsigned;
        };
    }
    using namespace types;
}
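
As a quick illustration of how these helpers are intended to be used (a usage sketch, not from the library itself; it assumes the headers above plus <cassert> are available):

// Usage sketch: BitSize maps a bit-width back to the matching typedefs,
// and Traits<T>::extreme() yields the lowest value for signed types and
// the highest value for unsigned ones.
static_assert(std::is_same<tea::BitSize<16>::Unsigned, tea::u16>::value,
              "BitSize<16>::Unsigned should be u16");

void traitsDemo() {
    assert(tea::Traits<tea::s8>::extreme() == -128);
    assert(tea::Traits<tea::u8>::extreme() == 255);
}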

Next, let's be overly optimistic and assume that our UTF-8 and UTF-32 representations are always valid. That is:

  • UTF-8 sequences are never truncated mid-code-point.
  • All codepoints are valid Unicode codepoints.
  • When writing UTF-8, we never over-run the buffer.

namespace tea {
    namespace utf8 {
        const c32 codePointMinimum = 0x000000;
        const c32 codePointMaximum = 0x10FFFF;
        enum Status_UTF8 {
            Status_UTF8_OK = 0,
            Status_UTF8_Overlong = -1,
            Status_UTF8_Beyond = -2,
            Status_UTF8_Extraneous = -3,
            Status_UTF8_Continuation = -4,
            Status_UTF8_Truncated = -5,
            Status_UTF8_Invalid = -6,

            Status_UTF8_ThresholdStrict = Status_UTF8_OK,
            Status_UTF8_ThresholdNormal = Status_UTF8_Overlong,
            Status_UTF8_ThresholdLax = Status_UTF8_Invalid
        };

        // Sequence length in bytes, indexed by lead byte; negative for error
        // (Continuation/Invalid)
        extern const s8 leadByteTable[256];

        // UTF-8 bytes needed for a code point, indexed by (index of the
        // most-significant set bit + 1); zero indicates an invalid value
        extern const u8 msbEncodedSize[33];

        // The UTF-8 sequence must be valid
        namespace optimistic {
            size_t measureCodePoint(c32 codepoint);
            void encodeCodePointMeasured(u8* ptr, c32 codepoint, size_t bytes);
            u8* encodeCodePoint(u8* ptr, c32 codepoint);
            c32 decodeCodePoint(const u8*& ptr, const u8* end);
            size_t countCodePoints(const u8* begin, const u8* end);
        }
    }
}

The "leadByteTable" array is the classic length-of-UTF-8-sequence-based-on-the-first-byte:

#define SIXTEEN(x) x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
const tea::s8 tea::utf8::leadByteTable[256] = {
    /* 0x00-0x0F */ SIXTEEN(1),
    /* 0x10-0x1F */ SIXTEEN(1),
    /* 0x20-0x2F */ SIXTEEN(1),
    /* 0x30-0x3F */ SIXTEEN(1),
    /* 0x40-0x4F */ SIXTEEN(1),
    /* 0x50-0x5F */ SIXTEEN(1),
    /* 0x60-0x6F */ SIXTEEN(1),
    /* 0x70-0x7F */ SIXTEEN(1),
    /* 0x80-0x8F */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
    /* 0x90-0x9F */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
    /* 0xA0-0xAF */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
    /* 0xB0-0xBF */ SIXTEEN(tea::utf8::Status_UTF8_Continuation),
    /* 0xC0-0xCF */ SIXTEEN(2),
    /* 0xD0-0xDF */ SIXTEEN(2),
    /* 0xE0-0xEF */ SIXTEEN(3),
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
                    tea::utf8::Status_UTF8_Invalid,
                    tea::utf8::Status_UTF8_Invalid
};
#undef SIXTEEN

Note that negative numbers indicate the type of error.
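
A few representative lookups (a sanity-check sketch, not part of the library) show how the table is read:

// Sanity-check sketch: lead bytes give the sequence length, while stray
// continuation bytes and 0xFE/0xFF give negative error codes.
void leadByteTableChecks() {
    assert(tea::utf8::leadByteTable[0x41] == 1);  // 'A': a single-byte sequence
    assert(tea::utf8::leadByteTable[0xC3] == 2);  // lead byte of a two-byte sequence
    assert(tea::utf8::leadByteTable[0x9F] == tea::utf8::Status_UTF8_Continuation);
    assert(tea::utf8::leadByteTable[0xFF] == tea::utf8::Status_UTF8_Invalid);
}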

This gives us a trivial implementation of "countCodePoints()" for UTF-8 strings, assuming that unrecognised UTF-8 sequences are mapped to a single replacement codepoint:

size_t countCodePoints(const u8* begin, const u8* end) {
    assert(begin != nullptr);
    assert(end >= begin);
    size_t count = 0;
    while (begin < end) {
        count++;
        s8 bytes = leadByteTable[*begin];
        begin += (bytes <= 0) ? 1 : bytes;
    }
    return count;
}
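
For example (a usage sketch, not part of the library), a six-byte buffer holding "a", U+00DF and U+20AC counts as three codepoints:

// Usage sketch: 'a' (1 byte) + U+00DF (2 bytes) + U+20AC (3 bytes)
// is 6 bytes but only 3 code points.
void countExample() {
    const tea::u8 sample[] = { 0x61, 0xC3, 0x9F, 0xE2, 0x82, 0xAC };
    size_t count = tea::utf8::optimistic::countCodePoints(sample, sample + sizeof(sample));
    assert(count == 3);
}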

The implementation of "decodeCodePoint()" to decode a single UTF-8 codepoint to UTF-32 is also relatively simple:

c32 decodeCodePoint(const u8*& ptr, const u8* end) {
    static const s32 bias[] = {
        0, 0, -12416, -925824, -63447168, 100130688, 2113396608
    };
    assert(ptr != nullptr);
    assert(ptr < end);
    u8 u1 = *(ptr++);
    s8 expected = leadByteTable[u1];
    assert((expected > 0) && (expected < TEA_NELEMS(bias)));
    s32 codepoint = u1;
    while (--expected > 0) {
        // Extract any continuation bytes
        assert(ptr < end);
        u8 u2 = *(ptr++);
        assert(leadByteTable[u2] == Status_UTF8_Continuation);
        codepoint = (codepoint << 6) + u2;
    }
    codepoint += bias[leadByteTable[u1]];
    assert(codepoint >= 0);
    return codepoint;
}
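
The bias constants look like magic numbers, but each one simply removes the lead-byte tag bits and the 0x80 continuation markers that the accumulation loop folds into the value; for a two-byte sequence that overhead is (0xC0 << 6) + 0x80 = 12416, hence bias[2] = -12416. The five- and six-byte entries are the same (much larger) values reduced modulo 2^32 so that they fit in an s32, which strictly speaking relies on the signed accumulator wrapping the same way. A throwaway sketch (not part of the library) that reproduces the table:

// Throwaway sketch: derive each decode bias by accumulating the lead-byte
// tag and the continuation markers exactly as decodeCodePoint does, then
// negating modulo 2^32 to match the s32 table above.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t leadTag[7] = { 0, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    for (int length = 2; length <= 6; ++length) {
        uint32_t overhead = leadTag[length];
        for (int i = 1; i < length; ++i) {
            overhead = (overhead << 6) + 0x80;  // one 0x80 marker per continuation byte
        }
        std::printf("bias[%d] = %d\n", length, int(int32_t(0u - overhead)));
    }
    return 0;
}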

Working out how many UTF-8 bytes are required to store a UTF-32 codepoint is the job of "measureCodePoint()":

size_t measureCodePoint(c32 codepoint) {
    int msb = tea::bits::scanMostSetBit(u32(codepoint)) + 1;
    assert((msb >= 0) && (msb < TEA_NELEMS(msbEncodedSize)));
    return msbEncodedSize[msb];
}

This requires a lookup table "msbEncodedSize" and a fast way of detecting the most-significant set bit of a 32-bit value:

const u8 msbEncodedSize[33] = {
    1,1,1,1,1,1,1,1, // 0000000000000000000000000XXXXXX1:  0-7 : One byte
    2,2,2,2,         // 000000000000000000000XXX1xxxxxxx:  8-11: Two bytes
    3,3,3,3,3,       // 0000000000000000XXXX1xxxxxxxxxxx: 12-16: Three bytes
    4,4,4,4,4,       // 00000000000XXXX1xxxxxxxxxxxxxxxx: 17-21: Four bytes
    5,5,5,5,5,       // 000000XXXX1xxxxxxxxxxxxxxxxxxxxx: 22-26: Five bytes
    6,6,6,6,6,       // 0XXXX1xxxxxxxxxxxxxxxxxxxxxxxxxx: 27-31: Six bytes
    0                // 1xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx: 32   : Invalid
};
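
A few spot checks (a sketch, not from the post) of what "measureCodePoint()" returns:

// Spot checks: one-, two-, three- and four-byte encodings.
void measureChecks() {
    assert(tea::utf8::optimistic::measureCodePoint(0x0041) == 1);   // 'A'
    assert(tea::utf8::optimistic::measureCodePoint(0x00DF) == 2);   // U+00DF
    assert(tea::utf8::optimistic::measureCodePoint(0x20AC) == 3);   // U+20AC
    assert(tea::utf8::optimistic::measureCodePoint(0x1F600) == 4);  // U+1F600
}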

Obviously, the implementation of "scanMostSetBit" is platform-dependent. For Microsoft compilers it is:

#include <intrin.h>  // _BitScanReverse

int tea::bits::scanMostSetBit(u32 value) {
    // Returns the index (31..0) of the most-significant non-zero bit
    // or -1 if input is zero
    unsigned long bit;
    return _BitScanReverse(&bit, value) ? int(bit) : -1;
}
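
For GCC and Clang, a similar routine could be built on the __builtin_clz intrinsic (a sketch of an equivalent, not the post's implementation):

// Sketch of a GCC/Clang equivalent: __builtin_clz counts leading zeros
// and is undefined for zero, so that case is handled explicitly.
int scanMostSetBit_gcc(tea::u32 value) {
    return (value != 0) ? (31 - __builtin_clz(value)) : -1;
}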

Finally, encoding a UTF-32 codepoint to a UTF-8 sequence is a matter of measuring the size and storing the bytes:

void encodeCodePointMeasured(u8* ptr, c32 codepoint, size_t bytes) {
    static const u64 bias[7] = {
        0, 0, 12288, 917504, 62914560, 4160749568, 270582939648
    };
    assert(codepoint >= 0);
    assert(ptr != nullptr);
    assert((bytes > 0) && (bytes < TEA_NELEMS(bias)));
    u8* p = ptr + bytes;
    u64 bits = bias[bytes] + codepoint;
    while (bits > 0xFF) {
        *(--p) = u8((bits & 0x3F) + 0x80);
        bits >>= 6;
    }
    *(--p) = u8(bits);
    assert(p == ptr);
}

u8* encodeCodePoint(u8* ptr, c32 codepoint) {
    assert(codepoint >= 0);
    assert(ptr != nullptr);
    size_t bytes = measureCodePoint(codepoint);
    assert(bytes > 0);
    encodeCodePointMeasured(ptr, codepoint, bytes);
    return ptr + bytes;
}
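
Putting the two halves together, a quick round-trip check (a usage sketch, not part of the library; the buffer is sized for the six-byte worst case):

// Round-trip sketch: encode U+20AC (three bytes: E2 82 AC) and decode it back.
void roundTrip() {
    tea::u8 buffer[6];
    const tea::c32 original = 0x20AC;
    tea::u8* end = tea::utf8::optimistic::encodeCodePoint(buffer, original);
    assert(end - buffer == 3);
    const tea::u8* cursor = buffer;
    tea::c32 decoded = tea::utf8::optimistic::decodeCodePoint(cursor, end);
    assert(decoded == original);
    assert(cursor == end);
}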

Obviously, our assumptions about valid UTF-8 sequences and the lack of checks for buffer over-runs make these "optimistic" routines unsuitable for production code, unless we've pre-validated the sequences.

Next time, we'll discuss routines that handle those error conditions gracefully.




Thursday 6 June 2013

Cut-down XML EBNF

Sometimes, the full XML specification is just overkill. If all you want to do is extract plain data from an XML file, a fully conformant parser can get in the way.

I've been looking at writing a minimal parser for text processing and will no doubt write about my experiences here. In the meantime, I've prepared a cut-down version of the formal EBNF below.


Productions covered: comment, processing_instruction, character_entity, text, attribute, element, miscellaneous, doctype_declaration, doctype_parameter, doctype, document.

Friday 12 April 2013

Another day, another blog

Ligatures and umlauts! That's all I've got to say on the matter.