cpp/libc_demo.cc

OILS / cpp / libc_demo.cc View on Github | oils.pub

217 lines, 128 significant

1	#include <locale.h> // setlocale()
2	#include <regex.h> // regcomp()
3
4	#include "mycpp/runtime.h"
5	#include "vendor/greatest.h"
6
7	void FindAll(const char* p, const char* s) {
8	regex_t pat;
9
10	int cflags = REG_EXTENDED;
11	if (regcomp(&pat, p, cflags) != 0) {
12	FAIL();
13	}
14	int outlen = pat.re_nsub + 1; // number of captures
15
16	// TODO: Could statically allocate 99, and assert that re_nsub is less than
17	// 99. Would speed up loops.
18	regmatch_t* pmatch =
19	static_cast<regmatch_t>(malloc(sizeof(regmatch_t) outlen));
20
21	int cur_pos = 0;
22	// int n = strlen(s);
23	while (true) {
24	// Necessary so ^ doesn't match in the middle!
25	int eflags = cur_pos == 0 ? 0 : REG_NOTBOL;
26	bool match = regexec(&pat, s + cur_pos, outlen, pmatch, eflags) == 0;
27
28	if (!match) {
29	break;
30	}
31	int i;
32	for (i = 0; i < outlen; i++) {
33	int start = pmatch[i].rm_so;
34	int end = pmatch[i].rm_eo;
35	int len = end - start;
36	BigStr* m = StrFromC(s + cur_pos + start, len);
37	log("%d GROUP %d (%d .. %d) = [%s]", cur_pos, i, start, end, m->data_);
38	}
39	log("");
40	int match_len = pmatch[0].rm_eo;
41	if (match_len == 0) {
42	break;
43	}
44	cur_pos += match_len;
45	}
46
47	free(pmatch);
48	regfree(&pat);
49	}
50
// Shared subject string for the FindAll() demos below.  It contains
// adjacent matches ("a345y-" immediately followed by "axy-").
const char* s{"a345y-axy- there b789y- cy-"};
53
54	TEST regex_unanchored() {
55	const char* unanchored = "[abc]([0-9]*)(x?)(y)-";
56	FindAll(unanchored, s);
57
58	PASS();
59	}
60
61	TEST regex_caret() {
62	const char* anchored = "^[abc]([0-9]*)(x?)(y)-";
63	FindAll(anchored, s);
64
65	PASS();
66	}
67
68	TEST regex_lexer() {
69	// like the Yaks / Make-a-Lisp pattern
70	const char* lexer = "([a-z]+)\|([0-9]+)\|([ ]+)\|([+-])";
71	FindAll(lexer, s);
72
73	PASS();
74	}
75
76	TEST regex_repeat_with_capture() {
77	const char* lexer = "(([a-z]+)([0-9]+)-)((A+)\|(Z+))";
78	FindAll(lexer, "a0-b1-c2-AAZZZA");
79	// Groups are weird
80	// whole match 0: a0-b1-c2-
81	// 1: c2- # last repetition
82	// 2: c # last one
83	// 3: 2 # last one
84	//
85	// And then there's an empty match
86	//
87	// Ideas:
88	// - disallow nested groups in Eggex?
89	// - I really care about the inner ones -- groups 2 and 3
90	// - I want flat groups
91
92	PASS();
93	}
94
95	// Disallow this in eggex, as well as the above
96	TEST regex_nested_capture() {
97	const char* lexer = "(([a-z]+)([0-9]+))";
98	FindAll(lexer, "a0");
99	PASS();
100	}
101
102	// I think we allow this in eggex
103	TEST regex_alt_with_capture() {
104	const char* lexer = "([a-z]+)\|([0-9]+)(-)";
105	FindAll(lexer, "x-");
106	FindAll(lexer, "7-");
107	PASS();
108	}
109
110	bool RegexMatch(const char* s, const char* regex_str) {
111	regex_t pat;
112	int status = regcomp(&pat, regex_str, REG_EXTENDED);
113	if (status != 0) {
114	assert(false);
115	}
116	log("*** Matching string %s against regex %s", s, regex_str);
117
118	int num_groups = pat.re_nsub + 1; // number of captures
119
120	regmatch_t* pmatch =
121	static_cast<regmatch_t>(malloc(sizeof(regmatch_t) num_groups));
122	int eflags = 0;
123	bool match = regexec(&pat, s, num_groups, pmatch, eflags) == 0;
124	if (match) {
125	printf("match\n");
126	for (int i = 0; i < num_groups; i++) {
127	int start = pmatch[i].rm_so;
128	int end = pmatch[i].rm_eo;
129	printf("start %d - %d\n", start, end);
130	}
131	}
132	free(pmatch);
133	regfree(&pat);
134
135	return match;
136	}
137
138	TEST regex_unicode() {
139	regex_t pat;
140
141	const char* p = "_._"; // 1 byte, not code point?
142
143	bool matched;
144
145	matched = RegexMatch("_xyz_", p);
146	ASSERT(not matched);
147
148	matched = RegexMatch("_x_", p);
149	ASSERT(matched);
150
151	matched = RegexMatch("_\x01_", p);
152	ASSERT(matched);
153
154	const char* u1 = "_μ_";
155	const char* u2 = "_\u03bc_";
156	const char* u3 = "_\xce\xbc_"; // utf-8 encoding
157
158	// Doesn't match without UTF-8 setting
159	matched = RegexMatch(u1, p);
160	// log("u1 %d", matched);
161	ASSERT(not matched);
162
163	matched = RegexMatch(u2, p);
164	// log("u2 %d", matched);
165	ASSERT(not matched);
166
167	matched = RegexMatch(u3, p);
168	// log("u3 %d", matched);
169	ASSERT(not matched);
170
171	// SETS GLOBAL
172	char* saved_locale = setlocale(LC_ALL, "");
173	log("saved_locale %s", saved_locale);
174	if (saved_locale == nullptr) {
175	FAIL();
176	}
177
178	// Now it matches
179	matched = RegexMatch(u1, p);
180	ASSERT(matched);
181	matched = RegexMatch(u2, p);
182	ASSERT(matched);
183	matched = RegexMatch(u3, p);
184	ASSERT(matched);
185
186	// [^a] can match a code point
187	matched = RegexMatch(u3, "_[^a]_");
188	ASSERT(matched);
189
190	const char* unicode_char_class = "[a\xce\xbc]";
191	const char* s = "\xce\xbc";
192	matched = RegexMatch(s, unicode_char_class);
193	ASSERT(matched);
194
195	PASS();
196	}
197
198	GREATEST_MAIN_DEFS();
199
200	int main(int argc, char** argv) {
201	gHeap.Init();
202
203	GREATEST_MAIN_BEGIN();
204
205	RUN_TEST(regex_unanchored);
206	RUN_TEST(regex_caret);
207	RUN_TEST(regex_lexer);
208	RUN_TEST(regex_repeat_with_capture);
209	RUN_TEST(regex_alt_with_capture);
210	RUN_TEST(regex_nested_capture);
211	RUN_TEST(regex_unicode);
212
213	gHeap.CleanProcessExit();
214
215	GREATEST_MAIN_END();
216	return 0;
217	}