OILS / cpp / libc_demo.cc View on Github | oils.pub

217 lines, 128 significant
1#include <locale.h> // setlocale()
2#include <regex.h> // regcomp()
3
4#include "mycpp/runtime.h"
5#include "vendor/greatest.h"
6
7void FindAll(const char* p, const char* s) {
8 regex_t pat;
9
10 int cflags = REG_EXTENDED;
11 if (regcomp(&pat, p, cflags) != 0) {
12 FAIL();
13 }
14 int outlen = pat.re_nsub + 1; // number of captures
15
16 // TODO: Could statically allocate 99, and assert that re_nsub is less than
17 // 99. Would speed up loops.
18 regmatch_t* pmatch =
19 static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * outlen));
20
21 int cur_pos = 0;
22 // int n = strlen(s);
23 while (true) {
24 // Necessary so ^ doesn't match in the middle!
25 int eflags = cur_pos == 0 ? 0 : REG_NOTBOL;
26 bool match = regexec(&pat, s + cur_pos, outlen, pmatch, eflags) == 0;
27
28 if (!match) {
29 break;
30 }
31 int i;
32 for (i = 0; i < outlen; i++) {
33 int start = pmatch[i].rm_so;
34 int end = pmatch[i].rm_eo;
35 int len = end - start;
36 BigStr* m = StrFromC(s + cur_pos + start, len);
37 log("%d GROUP %d (%d .. %d) = [%s]", cur_pos, i, start, end, m->data_);
38 }
39 log("");
40 int match_len = pmatch[0].rm_eo;
41 if (match_len == 0) {
42 break;
43 }
44 cur_pos += match_len;
45 }
46
47 free(pmatch);
48 regfree(&pat);
49}
50
// Shared subject string for the FindAll() tests below.  It contains adjacent
// matches ("a345y-" immediately followed by "axy-"), then text that separates
// the remaining candidates.
const char* s =
    "a345y-"
    "axy-"
    " there "
    "b789y-"
    " cy-";
53
54TEST regex_unanchored() {
55 const char* unanchored = "[abc]([0-9]*)(x?)(y)-";
56 FindAll(unanchored, s);
57
58 PASS();
59}
60
61TEST regex_caret() {
62 const char* anchored = "^[abc]([0-9]*)(x?)(y)-";
63 FindAll(anchored, s);
64
65 PASS();
66}
67
68TEST regex_lexer() {
69 // like the Yaks / Make-a-Lisp pattern
70 const char* lexer = "([a-z]+)|([0-9]+)|([ ]+)|([+-])";
71 FindAll(lexer, s);
72
73 PASS();
74}
75
76TEST regex_repeat_with_capture() {
77 const char* lexer = "(([a-z]+)([0-9]+)-)*((A+)|(Z+))*";
78 FindAll(lexer, "a0-b1-c2-AAZZZA");
79 // Groups are weird
80 // whole match 0: a0-b1-c2-
81 // 1: c2- # last repetition
82 // 2: c # last one
83 // 3: 2 # last one
84 //
85 // And then there's an empty match
86 //
87 // Ideas:
88 // - disallow nested groups in Eggex?
89 // - I really care about the inner ones -- groups 2 and 3
90 // - I want flat groups
91
92 PASS();
93}
94
95// Disallow this in eggex, as well as the above
96TEST regex_nested_capture() {
97 const char* lexer = "(([a-z]+)([0-9]+))";
98 FindAll(lexer, "a0");
99 PASS();
100}
101
102// I think we allow this in eggex
103TEST regex_alt_with_capture() {
104 const char* lexer = "([a-z]+)|([0-9]+)(-)";
105 FindAll(lexer, "x-");
106 FindAll(lexer, "7-");
107 PASS();
108}
109
110bool RegexMatch(const char* s, const char* regex_str) {
111 regex_t pat;
112 int status = regcomp(&pat, regex_str, REG_EXTENDED);
113 if (status != 0) {
114 assert(false);
115 }
116 log("*** Matching string %s against regex %s", s, regex_str);
117
118 int num_groups = pat.re_nsub + 1; // number of captures
119
120 regmatch_t* pmatch =
121 static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * num_groups));
122 int eflags = 0;
123 bool match = regexec(&pat, s, num_groups, pmatch, eflags) == 0;
124 if (match) {
125 printf("match\n");
126 for (int i = 0; i < num_groups; i++) {
127 int start = pmatch[i].rm_so;
128 int end = pmatch[i].rm_eo;
129 printf("start %d - %d\n", start, end);
130 }
131 }
132 free(pmatch);
133 regfree(&pat);
134
135 return match;
136}
137
138TEST regex_unicode() {
139 regex_t pat;
140
141 const char* p = "_._"; // 1 byte, not code point?
142
143 bool matched;
144
145 matched = RegexMatch("_xyz_", p);
146 ASSERT(not matched);
147
148 matched = RegexMatch("_x_", p);
149 ASSERT(matched);
150
151 matched = RegexMatch("_\x01_", p);
152 ASSERT(matched);
153
154 const char* u1 = "_μ_";
155 const char* u2 = "_\u03bc_";
156 const char* u3 = "_\xce\xbc_"; // utf-8 encoding
157
158 // Doesn't match without UTF-8 setting
159 matched = RegexMatch(u1, p);
160 // log("u1 %d", matched);
161 ASSERT(not matched);
162
163 matched = RegexMatch(u2, p);
164 // log("u2 %d", matched);
165 ASSERT(not matched);
166
167 matched = RegexMatch(u3, p);
168 // log("u3 %d", matched);
169 ASSERT(not matched);
170
171 // SETS GLOBAL
172 char* saved_locale = setlocale(LC_ALL, "");
173 log("saved_locale %s", saved_locale);
174 if (saved_locale == nullptr) {
175 FAIL();
176 }
177
178 // Now it matches
179 matched = RegexMatch(u1, p);
180 ASSERT(matched);
181 matched = RegexMatch(u2, p);
182 ASSERT(matched);
183 matched = RegexMatch(u3, p);
184 ASSERT(matched);
185
186 // [^a] can match a code point
187 matched = RegexMatch(u3, "_[^a]_");
188 ASSERT(matched);
189
190 const char* unicode_char_class = "[a\xce\xbc]";
191 const char* s = "\xce\xbc";
192 matched = RegexMatch(s, unicode_char_class);
193 ASSERT(matched);
194
195 PASS();
196}
197
198GREATEST_MAIN_DEFS();
199
200int main(int argc, char** argv) {
201 gHeap.Init();
202
203 GREATEST_MAIN_BEGIN();
204
205 RUN_TEST(regex_unanchored);
206 RUN_TEST(regex_caret);
207 RUN_TEST(regex_lexer);
208 RUN_TEST(regex_repeat_with_capture);
209 RUN_TEST(regex_alt_with_capture);
210 RUN_TEST(regex_nested_capture);
211 RUN_TEST(regex_unicode);
212
213 gHeap.CleanProcessExit();
214
215 GREATEST_MAIN_END();
216 return 0;
217}