examples

Coverage Report

Created: 2025-05-02 00:07

/home/uke/oil/data_lang/utf8.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef DATA_LANG_UTF8_H
2
#define DATA_LANG_UTF8_H
3
4
#include <stddef.h>  // size_t
5
#include <stdint.h>  // uint32_t
6
#include <stdio.h>
7
8
/**
9
 *              ---- Quick reference about the encoding ----
10
 *
11
 * First, all valid UTF-8 sequences follow one of four bit patterns (Table 3-6). The
12
 * first byte determines the length of the sequence and then the next 0-3 bytes
13
 * are "continuation bytes."
14
 *
15
 * +----------------------------+----------+----------+----------+----------+
16
 * | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
17
 * +----------------------------+----------+----------+----------+----------+
18
 * | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
19
 * | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
20
 * | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
21
 * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
22
 * +----------------------------+----------+----------+----------+----------+
23
 *
24
 *      Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns
25
 *
26
 * There are 3 further restrictions which make some valid bit patterns
27
 * *invalid*:
28
 *  1. Overlongs: eg, <0x41> and <0xC1 0x81> both store U+41, but the second
29
 *     sequence is longer and thus an error.
30
 *  2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
31
 *     surrogate. It is an error to encode surrogates in UTF-8.
32
 *  3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
33
 *     and must be rejected as an error.
34
 *
35
 * See https://aolsen.ca/writings/everything-about-utf8 for more details about
36
 * the encoding.
37
 */
38
39
typedef enum Utf8Error {  // Outcome of a single utf8_decode() call
40
  UTF8_OK = 0,  // No error was detected while decoding
41
42
  // Encodes a codepoint in more bytes than necessary (e.g. <0xC1 0x81> for U+41)
43
  UTF8_ERR_OVERLONG = 1,
44
45
  // Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF, inclusive)
46
  UTF8_ERR_SURROGATE = 2,
47
48
  // Encodes a value greater than the max codepoint U+10FFFF
49
  UTF8_ERR_TOO_LARGE = 3,
50
51
  // Invalid lead byte, or a byte that is not 10xxxxxx where a continuation byte was expected
52
  UTF8_ERR_BAD_ENCODING = 4,
53
54
  // A multi-byte sequence was cut short: a nul byte appeared where a continuation byte was expected.
55
  UTF8_ERR_TRUNCATED_BYTES = 5,
56
} Utf8Error_t;
57
58
typedef struct Utf8Result {  // Output of utf8_decode(); all three fields are overwritten on each call
59
  Utf8Error_t error;  // UTF8_OK, or the reason decoding failed
60
  uint32_t codepoint;  // Decoded value; also holds the recovered value for overlong/surrogate/too-large errors
61
  size_t bytes_read;  // Number of input bytes consumed, including the lead byte
62
} Utf8Result_t;
63
64
0
static inline void _cont(const unsigned char *input, Utf8Result_t *result) {  // Fold one continuation byte into result
65
0
  if (result->error) return;  // An earlier step already failed; leave result untouched
66
67
0
  int byte = input[result->bytes_read];  // Next unread byte of the sequence
68
0
  if (byte == '\0') {  // Hit the nul terminator in the middle of a sequence
69
0
    result->error = UTF8_ERR_TRUNCATED_BYTES;
70
0
    return;  // The nul byte is not counted in bytes_read
71
0
  }
72
0
  result->bytes_read += 1;  // Counted before validation, so a rejected byte is still consumed
73
74
  // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
75
  // validate the pattern and b) remove the leading '10', keeping the low 6 bits.
76
0
  if ((byte & 0xC0) == 0x80) {
77
0
    result->codepoint <<= 6;  // Make room for the 6 payload bits
78
0
    result->codepoint |= byte & 0x3F;
79
0
  } else {
80
0
    result->error = UTF8_ERR_BAD_ENCODING;
81
0
  }
82
0
}
83
84
/**
85
 * Given a nul-terminated string `input`, try to decode the next codepoint from
86
 * that string.
87
 *
88
 * It is required that `input` does not point to the nul-terminator. If
89
 * `*input == '\0'`, then it is assumed that the zero-byte is meant to encode
90
 * U+00, not a sentinel. The nul-terminator is still necessary because we need
91
 * it to prevent buffer overrun in the case of a truncated byte sequence, for
92
 * example '\xC2'. This oddity is to facilitate strings which may contain U+00
93
 * codepoints.
94
 *
95
 * If there was a surrogate, overlong or codepoint-too-large error, then
96
 * `result.codepoint` will contain the recovered value.
97
 */
98
static inline void utf8_decode(const unsigned char *input,
99
14
                               Utf8Result_t *result) {
100
14
  result->error = UTF8_OK;  // Start from a clean result on every call
101
14
  result->codepoint = 0;
102
14
  result->bytes_read = 0;
103
104
14
  int first = *input;  // Lead byte; its high bits select the sequence length
105
14
  result->bytes_read = 1;  // The lead byte is always consumed, even on error
106
107
14
  if ((first & 0x80) == 0) {
108
    // 1-byte long (ASCII subset): lead byte 0xxxxxxx is the codepoint itself
109
14
    result->codepoint = first;
110
14
    return;
111
14
  }
112
113
0
  if ((first & 0xE0) == 0xC0) {
114
    // 2-bytes long: lead byte 110yyyyy carries the top 5 bits
115
0
    result->codepoint = first & 0x1F;
116
117
0
    _cont(input, result);
118
0
    if (result->error) return;  // Truncated or malformed continuation byte
119
120
0
    if (result->codepoint < 0x80) {  // Value would have fit in 1 byte
121
0
      result->error = UTF8_ERR_OVERLONG;
122
0
    }
123
124
0
    return;
125
0
  }
126
127
0
  if ((first & 0xF0) == 0xE0) {
128
    // 3-bytes long: lead byte 1110zzzz carries the top 4 bits
129
0
    result->codepoint = first & 0x0F;
130
131
0
    _cont(input, result);
132
0
    _cont(input, result);  // No-op if the previous _cont already set an error
133
0
    if (result->error) return;
134
135
0
    if (result->codepoint < 0x800) {  // Value would have fit in 2 bytes or fewer
136
0
      result->error = UTF8_ERR_OVERLONG;
137
0
    }
138
139
0
    if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
140
0
      result->error = UTF8_ERR_SURROGATE;
141
0
    }
142
143
0
    return;
144
0
  }
145
146
0
  if ((first & 0xF8) == 0xF0) {
147
    // 4-bytes long: lead byte 11110uuu carries the top 3 bits
148
0
    result->codepoint = first & 0x07;
149
150
0
    _cont(input, result);
151
0
    _cont(input, result);  // Each later _cont is a no-op once an error is set
152
0
    _cont(input, result);
153
0
    if (result->error) return;
154
155
0
    if (result->codepoint < 0x10000) {  // Value would have fit in 3 bytes or fewer
156
0
      result->error = UTF8_ERR_OVERLONG;
157
0
    }
158
159
0
    if (result->codepoint > 0x10FFFF) {  // Beyond the last Unicode codepoint
160
0
      result->error = UTF8_ERR_TOO_LARGE;
161
0
    }
162
163
0
    return;
164
0
  }
165
166
0
  result->error = UTF8_ERR_BAD_ENCODING;  // Lead byte is 10xxxxxx or 11111xxx: matches no pattern above
167
0
  return;
168
0
}
169
170
#endif  // DATA_LANG_UTF8_H