/home/uke/oil/data_lang/utf8.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef DATA_LANG_UTF8_H |
2 | | #define DATA_LANG_UTF8_H |
3 | | |
4 | | #include <stddef.h> // size_t |
5 | | #include <stdint.h> // uint32_t |
6 | | #include <stdio.h> |
7 | | |
8 | | /** |
9 | | * ---- Quick reference about the encoding ---- |
10 | | * |
11 | | * First, all valid UTF-8 sequences follow one of the bit "patterns" (Table 3-6). The |
12 | | * first byte determines the length of the sequence and then the next 0-3 bytes |
13 | | * are "continuation bytes." |
14 | | * |
15 | | * +----------------------------+----------+----------+----------+----------+ |
16 | | * | Scalar Value | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte | |
17 | | * +----------------------------+----------+----------+----------+----------+ |
18 | | * | 00000000 0xxxxxxx | 0xxxxxxx | | | | |
19 | | * | 00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | | | |
20 | | * | zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | | |
21 | | * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx | |
22 | | * +----------------------------+----------+----------+----------+----------+ |
23 | | * |
24 | | * Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns |
25 | | * |
26 | | * There are 3 further restrictions which make some valid bit patterns |
27 | | * *invalid*: |
28 | | * 1. Overlongs: eg, <0x41> and <0xC1 0x81> both store U+41, but the second |
29 | | * sequence is longer and thus an error. |
30 | | * 2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a |
31 | | * surrogate. It is an error to encode surrogates in UTF-8. |
32 | | * 3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint, |
33 | | * and must be rejected as an error. |
34 | | * |
35 | | * See https://aolsen.ca/writings/everything-about-utf8 for more details about |
36 | | * the encoding. |
37 | | */ |
38 | | |
/* Status codes reported by the decoder in Utf8Result.error. */
enum Utf8Error {
  /* No error. */
  UTF8_OK = 0,

  /* The sequence uses more bytes than the codepoint requires. */
  UTF8_ERR_OVERLONG = 1,

  /* The sequence encodes a surrogate (U+D800 through U+DFFF, inclusive). */
  UTF8_ERR_SURROGATE = 2,

  /* The sequence encodes a value above the maximum codepoint U+10FFFF. */
  UTF8_ERR_TOO_LARGE = 3,

  /* The bytes do not match any of the UTF-8 bit patterns. */
  UTF8_ERR_BAD_ENCODING = 4,

  /* A sequence was started, but the input ended before it was complete. */
  UTF8_ERR_TRUNCATED_BYTES = 5,
};

typedef enum Utf8Error Utf8Error_t;
57 | | |
58 | | typedef struct Utf8Result { |
59 | | Utf8Error_t error; |
60 | | uint32_t codepoint; |
61 | | size_t bytes_read; |
62 | | } Utf8Result_t; |
63 | | |
64 | 0 | static inline void _cont(const unsigned char *input, Utf8Result_t *result) { |
65 | 0 | if (result->error) return; |
66 | | |
67 | 0 | int byte = input[result->bytes_read]; |
68 | 0 | if (byte == '\0') { |
69 | 0 | result->error = UTF8_ERR_TRUNCATED_BYTES; |
70 | 0 | return; |
71 | 0 | } |
72 | 0 | result->bytes_read += 1; |
73 | | |
74 | | // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a) |
75 | | // validate the pattern and b) remove the leading '10'. |
76 | 0 | if ((byte & 0xC0) == 0x80) { |
77 | 0 | result->codepoint <<= 6; |
78 | 0 | result->codepoint |= byte & 0x3F; |
79 | 0 | } else { |
80 | 0 | result->error = UTF8_ERR_BAD_ENCODING; |
81 | 0 | } |
82 | 0 | } |
83 | | |
84 | | /** |
85 | | * Given a nul-terminated string `input`, try to decode the next codepoint from |
86 | | * that string. |
87 | | * |
88 | | * It is required that `input` does not point to the nul-terminator. If |
89 | | * `*input == '\0'`, then it is assumed that the zero-byte is meant to encode |
90 | | * U+00, not a sentinel. The nul-terminator is still necessary because we need |
91 | | * it to prevent buffer overrun in the case of a truncated byte sequence, for |
92 | | * example '\xC2'. This oddity is to facilitate strings which may contain U+00 |
93 | | * codepoints. |
94 | | * |
95 | | * If there was a surrogate, overlong or codepoint to large error then |
96 | | * `result.codepoint` will contain the recovered value. |
97 | | */ |
98 | | static inline void utf8_decode(const unsigned char *input, |
99 | 14 | Utf8Result_t *result) { |
100 | 14 | result->error = UTF8_OK; |
101 | 14 | result->codepoint = 0; |
102 | 14 | result->bytes_read = 0; |
103 | | |
104 | 14 | int first = *input; |
105 | 14 | result->bytes_read = 1; |
106 | | |
107 | 14 | if ((first & 0x80) == 0) { |
108 | | // 1-byte long (ASCII subset) |
109 | 14 | result->codepoint = first; |
110 | 14 | return; |
111 | 14 | } |
112 | | |
113 | 0 | if ((first & 0xE0) == 0xC0) { |
114 | | // 2-bytes long |
115 | 0 | result->codepoint = first & 0x1F; |
116 | |
|
117 | 0 | _cont(input, result); |
118 | 0 | if (result->error) return; |
119 | | |
120 | 0 | if (result->codepoint < 0x80) { |
121 | 0 | result->error = UTF8_ERR_OVERLONG; |
122 | 0 | } |
123 | |
|
124 | 0 | return; |
125 | 0 | } |
126 | | |
127 | 0 | if ((first & 0xF0) == 0xE0) { |
128 | | // 3-bytes long |
129 | 0 | result->codepoint = first & 0x0F; |
130 | |
|
131 | 0 | _cont(input, result); |
132 | 0 | _cont(input, result); |
133 | 0 | if (result->error) return; |
134 | | |
135 | 0 | if (result->codepoint < 0x800) { |
136 | 0 | result->error = UTF8_ERR_OVERLONG; |
137 | 0 | } |
138 | |
|
139 | 0 | if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) { |
140 | 0 | result->error = UTF8_ERR_SURROGATE; |
141 | 0 | } |
142 | |
|
143 | 0 | return; |
144 | 0 | } |
145 | | |
146 | 0 | if ((first & 0xF8) == 0xF0) { |
147 | | // 4-bytes long |
148 | 0 | result->codepoint = first & 0x07; |
149 | |
|
150 | 0 | _cont(input, result); |
151 | 0 | _cont(input, result); |
152 | 0 | _cont(input, result); |
153 | 0 | if (result->error) return; |
154 | | |
155 | 0 | if (result->codepoint < 0x10000) { |
156 | 0 | result->error = UTF8_ERR_OVERLONG; |
157 | 0 | } |
158 | |
|
159 | 0 | if (result->codepoint > 0x10FFFF) { |
160 | 0 | result->error = UTF8_ERR_TOO_LARGE; |
161 | 0 | } |
162 | |
|
163 | 0 | return; |
164 | 0 | } |
165 | | |
166 | 0 | result->error = UTF8_ERR_BAD_ENCODING; |
167 | 0 | return; |
168 | 0 | } |
169 | | |
170 | | #endif // DATA_LANG_UTF8_H |