mycpp/gc_str.cc

OILS / mycpp / gc_str.cc View on Github | oils.pub

694 lines, 429 significant

1	#include "mycpp/gc_str.h"
2
3	#include <ctype.h> // isalpha(), isdigit()
4	#include <stdarg.h>
5
6	#include <regex>
7
8	#include "mycpp/common.h"
9	#include "mycpp/gc_alloc.h" // NewStr()
10	#include "mycpp/gc_builtins.h" // StringToInt()
11	#include "mycpp/gc_list.h" // join(), split() use it
12
13	GLOBAL_STR(kEmptyString, "");
14
15	static const std::regex gStrFmtRegex("([^%])(?:%(-?[0-9])(.))?");
16	static const int kMaxFmtWidth = 256; // arbitrary...
17
18	int BigStr::internal_find(BigStr* needle, int direction, int start, int end) {
19	if (end == -1) {
20	end = len(this);
21	}
22	int needle_len = len(needle);
23
24	if (needle_len > (end - start)) {
25	return -1; // needle is too long to be found (Python behavior)
26	}
27
28	if (start > len(this)) {
29	return -1;
30	}
31	// Handle negative indices
32	if (start < 0) {
33	start = len(this) + start;
34	}
35	end = std::min(end, len(this));
36	if (end < 0) {
37	end = len(this) + end;
38	}
39
40	int last_start;
41	int cutoff_start = start;
42	int delta = direction;
43	std::function<bool(int, int)> comparator;
44	if (direction == 1) {
45	last_start = end - needle_len + 1;
46	comparator = std::less<int>{};
47	} else {
48	last_start = start;
49	cutoff_start = end - needle_len;
50	std::swap(start, end);
51	comparator = std::greater_equal<int>{};
52	}
53
54	if (needle_len == 1) {
55	char c = needle->data_[0];
56	// For 'aaa'.find('a', 0, 1)
57	// end = 1, needle_len = 1, last_start = 1 which means we go through once
58	for (int i = cutoff_start; comparator(i, end); i += delta) {
59	if (data_[i] == c) {
60	return i;
61	}
62	}
63	} else {
64	// Note: this works for finding the empty string. Empty string is found in
65	// empty range like [5, 5), but not in [5, 4)
66
67	// For 'aaa'.find('aa', 0, 2)
68	// end = 2, needle_len = 2, last_start = 1 which means we go through once
69
70	// could use a smarter substring search algorithm
71	for (int i = cutoff_start; comparator(i, last_start); i += delta) {
72	if (memcmp(data_ + i, needle->data_, needle_len) == 0) {
73	return i;
74	}
75	}
76	}
77	return -1;
78	}
79
80	int BigStr::find(BigStr* needle, int start, int end) {
81	return internal_find(needle, 1, start, end);
82	}
83
84	int BigStr::rfind(BigStr* needle, int start, int end) {
85	return internal_find(needle, -1, start, end);
86	}
87
88	bool BigStr::isdigit() {
89	int n = len(this);
90	if (n == 0) {
91	return false; // special case
92	}
93	for (int i = 0; i < n; ++i) {
94	if (!::isdigit(data_[i])) {
95	return false;
96	}
97	}
98	return true;
99	}
100
101	bool BigStr::isalpha() {
102	int n = len(this);
103	if (n == 0) {
104	return false; // special case
105	}
106	for (int i = 0; i < n; ++i) {
107	if (!::isalpha(data_[i])) {
108	return false;
109	}
110	}
111	return true;
112	}
113
114	// e.g. for osh/braces.py
115	bool BigStr::isupper() {
116	int n = len(this);
117	if (n == 0) {
118	return false; // special case
119	}
120	for (int i = 0; i < n; ++i) {
121	if (!::isupper(data_[i])) {
122	return false;
123	}
124	}
125	return true;
126	}
127
128	bool BigStr::startswith(BigStr* s) {
129	int n = len(s);
130	if (n > len(this)) {
131	return false;
132	}
133	return memcmp(data_, s->data_, n) == 0;
134	}
135
136	bool BigStr::endswith(BigStr* s) {
137	int len_s = len(s);
138	int len_this = len(this);
139	if (len_s > len_this) {
140	return false;
141	}
142	const char* start = data_ + len_this - len_s;
143	return memcmp(start, s->data_, len_s) == 0;
144	}
145
146	// Get a string with one character
147	BigStr* BigStr::at(int i) {
148	int length = len(this);
149	if (i < 0) {
150	i = length + i;
151	}
152	DCHECK(0 <= i);
153	DCHECK(i < length); // had a problem here!
154
155	BigStr* result = NewStr(1);
156	result->data_[0] = data_[i];
157	return result;
158	}
159
160	// s[begin:]
161	BigStr* BigStr::slice(int begin) {
162	return slice(begin, len(this));
163	}
164
165	// s[begin:end]
166	BigStr* BigStr::slice(int begin, int end) {
167	int length = len(this);
168	SLICE_ADJUST(begin, end, length);
169
170	DCHECK(0 <= begin && begin <= length);
171	DCHECK(0 <= end && end <= length);
172
173	int new_len = end - begin;
174	DCHECK(0 <= new_len && new_len <= length);
175
176	BigStr* result = NewStr(new_len); // has kEmptyString optimization
177	memcpy(result->data_, data_ + begin, new_len);
178
179	return result;
180	}
181
182	// Used by 'help' builtin and --help, neither of which translate yet.
183
184	List<BigStr> BigStr::splitlines(bool keep) {
185	DCHECK(keep == true);
186	FAIL(kNotImplemented);
187	}
188
189	BigStr* BigStr::upper() {
190	int length = len(this);
191	BigStr* result = NewStr(length);
192	char* buffer = result->data();
193	for (int char_index = 0; char_index < length; ++char_index) {
194	buffer[char_index] = toupper(data_[char_index]);
195	}
196	return result;
197	}
198
199	BigStr* BigStr::lower() {
200	int length = len(this);
201	BigStr* result = NewStr(length);
202	char* buffer = result->data();
203	for (int char_index = 0; char_index < length; ++char_index) {
204	buffer[char_index] = tolower(data_[char_index]);
205	}
206	return result;
207	}
208
209	BigStr* BigStr::ljust(int width, BigStr* fillchar) {
210	DCHECK(len(fillchar) == 1);
211
212	int length = len(this);
213	int num_fill = width - length;
214	if (num_fill < 0) {
215	return this;
216	} else {
217	BigStr* result = NewStr(width);
218	char c = fillchar->data_[0];
219	memcpy(result->data_, data_, length);
220	for (int i = length; i < width; ++i) {
221	result->data_[i] = c;
222	}
223	return result;
224	}
225	}
226
227	BigStr* BigStr::rjust(int width, BigStr* fillchar) {
228	DCHECK(len(fillchar) == 1);
229
230	int length = len(this);
231	int num_fill = width - length;
232	if (num_fill < 0) {
233	return this;
234	} else {
235	BigStr* result = NewStr(width);
236	char c = fillchar->data_[0];
237	for (int i = 0; i < num_fill; ++i) {
238	result->data_[i] = c;
239	}
240	memcpy(result->data_ + num_fill, data_, length);
241	return result;
242	}
243	}
244
245	BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
246	// Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
247	return replace(old, new_str, -1);
248	}
249
250	BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
251	// log("replacing %s with %s", old_data, new_str->data_);
252	const char* old_data = old->data_;
253
254	int this_len = len(this);
255	int old_len = len(old);
256
257	const char* last_possible = data_ + this_len - old_len;
258
259	const char* p_this = data_; // advances through 'this'
260
261	// First pass: Calculate number of replacements, and hence new length
262	int replace_count = 0;
263	if (old_len == 0) {
264	replace_count = this_len + 1;
265	if (count > 0) {
266	replace_count = min(replace_count, count);
267	}
268	} else {
269	while (p_this <= last_possible) {
270	if (replace_count != count && // limit replacements (if count != -1)
271	memcmp(p_this, old_data, old_len) == 0) { // equal
272	replace_count++;
273	p_this += old_len;
274	} else {
275	p_this++;
276	}
277	}
278	}
279
280	// log("replacements %d", replace_count);
281
282	if (replace_count == 0) {
283	return this; // Reuse the string if there were no replacements
284	}
285
286	int new_str_len = len(new_str);
287	int result_len =
288	this_len - (replace_count * old_len) + (replace_count * new_str_len);
289
290	BigStr* result = NewStr(result_len);
291
292	const char* new_data = new_str->data_;
293	const size_t new_len = new_str_len;
294
295	// Second pass: Copy pieces into 'result'
296	p_this = data_; // back to beginning
297	char* p_result = result->data_; // advances through 'result'
298	replace_count = 0;
299
300	if (old_len == 0) {
301	// Should place new_str between each char in this
302	while (p_this < last_possible && replace_count != count) {
303	replace_count++;
304	memcpy(p_result, new_data, new_len); // Copy from new_str
305	p_result += new_len; // Move past new_str
306
307	// Write a char from this
308	p_result = p_this;
309	p_this++;
310	p_result++;
311	}
312
313	if (replace_count != count) {
314	// Write a copy of new_str at the end
315	assert(p_this == last_possible);
316	memcpy(p_result, new_data, new_len);
317	} else if (p_this <= last_possible) {
318	// Write the last part of string
319	memcpy(p_result, p_this, data_ + this_len - p_this);
320	}
321	} else {
322	while (p_this <= last_possible) {
323	// Note: would be more efficient if we remembered the match positions
324	if (replace_count != count && // limit replacements (if count != -1)
325	memcmp(p_this, old_data, old_len) == 0) { // equal
326	memcpy(p_result, new_data, new_len); // Copy from new_str
327	replace_count++;
328	p_result += new_len;
329	p_this += old_len;
330	} else { // copy 1 byte
331	p_result = p_this;
332	p_result++;
333	p_this++;
334	}
335	}
336	memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
337	}
338
339	return result;
340	}
341
342	enum class StripWhere {
343	Left,
344	Right,
345	Both,
346	};
347
348	const int kWhitespace = -1;
349
350	bool OmitChar(int ch, int what) {
351	if (what == kWhitespace) {
352	// Intentional incompatibility with Python, where say \v is whitespace
353	// '\v'.strip() == ''
354	//
355	// But it is consistent with the JSON spec [ \t\r\n] and the rules in
356	// frontend/lexer_def.py
357	//
358	// Note that the YSH is separate, and Str => trim() respects Unicode.
359	return IsAsciiWhitespace(ch);
360	} else {
361	return what == ch;
362	}
363	}
364
365	bool OmitCharMany(int ch, BigStr* chars) {
366	for (int i = 0; i < len(chars); ++i) {
367	if (ch == chars->data_[i]) {
368	return true;
369	}
370	}
371	return false;
372	}
373
374	// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
375	// implement 6 functions:
376	//
377	// strip / lstrip / rstrip
378	// strip(char) / lstrip(char) / rstrip(char)
379	//
380	// Args:
381	// where: which ends to strip from
382	// what: kWhitespace, or an ASCII code 0-255
383
384	BigStr* StripAny(BigStr* s, StripWhere where, int what) {
385	int length = len(s);
386	const char* char_data = s->data();
387
388	int i = 0;
389	if (where != StripWhere::Right) {
390	while (i < length && OmitChar(char_data[i], what)) {
391	i++;
392	}
393	}
394
395	int j = length;
396	if (where != StripWhere::Left) {
397	do {
398	j--;
399	} while (j >= i && OmitChar(char_data[j], what));
400	j++;
401	}
402
403	if (i == j) { // Optimization to reuse existing object
404	return kEmptyString;
405	}
406
407	if (i == 0 && j == length) { // nothing stripped
408	return s;
409	}
410
411	// Note: makes a copy in leaky version, and will in GC version too
412	int new_len = j - i;
413	BigStr* result = NewStr(new_len);
414	memcpy(result->data(), s->data() + i, new_len);
415	return result;
416	}
417
418	BigStr* BigStr::strip() {
419	return StripAny(this, StripWhere::Both, kWhitespace);
420	}
421
422	// Used for CommandSub in osh/cmd_exec.py
423	BigStr* BigStr::rstrip(BigStr* chars) {
424	int num_chars = len(chars);
425	if (num_chars == 0) {
426	return this;
427	}
428
429	// multiple chars, for word splitting
430	if (num_chars > 1) {
431	int j = len(this);
432	do {
433	j--;
434	} while (j >= 0 && OmitCharMany(data_[j], chars));
435	j++;
436
437	int new_len = j;
438	BigStr* result = NewStr(new_len);
439	memcpy(result->data(), data_, new_len);
440	return result;
441	}
442
443	// exactly 1 char
444	int c = chars->data_[0];
445	return StripAny(this, StripWhere::Right, c);
446	}
447
448	BigStr* BigStr::rstrip() {
449	return StripAny(this, StripWhere::Right, kWhitespace);
450	}
451
452	BigStr* BigStr::lstrip(BigStr* chars) {
453	DCHECK(len(chars) == 1);
454	int c = chars->data_[0];
455	return StripAny(this, StripWhere::Left, c);
456	}
457
458	BigStr* BigStr::lstrip() {
459	return StripAny(this, StripWhere::Left, kWhitespace);
460	}
461
462	BigStr* BigStr::join(List<BigStr> items) {
463	int length = 0;
464
465	int num_parts = len(items);
466
467	// " ".join([]) == ""
468	if (num_parts == 0) {
469	return kEmptyString;
470	}
471
472	// Common case
473	// 'anything'.join(["foo"]) == "foo"
474	if (num_parts == 1) {
475	return items->at(0);
476	}
477
478	for (int i = 0; i < num_parts; ++i) {
479	length += len(items->at(i));
480	}
481	// add length of all the separators
482	int this_len = len(this);
483	length += this_len * (num_parts - 1);
484
485	BigStr* result = NewStr(length);
486	char* p_result = result->data_; // advances through
487
488	for (int i = 0; i < num_parts; ++i) {
489	// log("i %d", i);
490	if (i != 0 && this_len) { // optimize common case of ''.join()
491	memcpy(p_result, data_, this_len); // copy the separator
492	p_result += this_len;
493	// log("this_len %d", this_len);
494	}
495
496	int n = len(items->at(i));
497	// log("n: %d", n);
498	memcpy(p_result, items->at(i)->data_, n); // copy the list item
499	p_result += n;
500	}
501
502	return result;
503	}
504
505	static void AppendPart(List<BigStr> result, BigStr* s, int left, int right) {
506	int new_len = right - left;
507	BigStr* part;
508	if (new_len == 0) {
509	part = kEmptyString;
510	} else {
511	part = NewStr(new_len);
512	memcpy(part->data_, s->data_ + left, new_len);
513	}
514	result->append(part);
515	}
516
517	// Split BigStr into List<BigStr*> of parts separated by 'sep'.
518	// The code structure is taken from CPython's Objects/stringlib/split.h.
519	List<BigStr> BigStr::split(BigStr* sep, int max_split) {
520	DCHECK(sep != nullptr);
521	DCHECK(len(sep) == 1); // we can only split one char
522	char sep_char = sep->data_[0];
523
524	int str_len = len(this);
525	if (str_len == 0) {
526	// weird case consistent with Python: ''.split(':') == ['']
527	return NewList<BigStr*>({kEmptyString});
528	}
529
530	List<BigStr> result = NewList<BigStr*>({});
531	int left = 0;
532	int right = 0;
533	int num_parts = 0; // 3 splits results in 4 parts
534
535	while (right < str_len && num_parts < max_split) {
536	// search for separator
537	for (; right < str_len; right++) {
538	if (data_[right] == sep_char) {
539	AppendPart(result, this, left, right);
540	right++;
541	left = right;
542	num_parts++;
543	break;
544	}
545	}
546	}
547	if (num_parts == 0) { // Optimization when there is no split
548	result->append(this);
549	} else if (left <= str_len) { // Last part
550	AppendPart(result, this, left, str_len);
551	}
552
553	return result;
554	}
555
556	List<BigStr> BigStr::split(BigStr* sep) {
557	return this->split(sep, len(this));
558	}
559
560	unsigned BigStr::hash(HashFunc h) {
561	if (!is_hashed_) {
562	hash_ = h(data_, len(this)) >> 1;
563	is_hashed_ = 1;
564	}
565	return hash_;
566	}
567
568	static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
569	auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
570	auto end = std::cregex_iterator();
571
572	char int_buf[kMaxFmtWidth];
573	std::string buf;
574	for (std::cregex_iterator it = beg; it != end; ++it) {
575	const std::cmatch& match = *it;
576
577	const std::csub_match& lit_m = match[1];
578	DCHECK(lit_m.matched);
579	const std::string& lit_s = lit_m.str();
580	buf.append(lit_s);
581
582	int width = 0;
583	bool zero_pad = false;
584	bool pad_back = false;
585	const std::csub_match& width_m = match[2];
586	const std::string& width_s = width_m.str();
587	bool ok = false;
588	if (width_m.matched && !width_s.empty()) {
589	if (width_s[0] == '0') {
590	zero_pad = true;
591	DCHECK(width_s.size() > 1);
592	ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
593	DCHECK(ok);
594	(void)ok; // silence unused var warning in opt
595	} else {
596	ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
597	DCHECK(ok);
598	}
599	if (width < 0) {
600	pad_back = true;
601	width *= -1;
602	}
603	DCHECK(0 <= width && width < kMaxFmtWidth);
604	}
605
606	char const* str_to_add = nullptr;
607	int add_len = 0;
608	const std::csub_match& code_m = match[3];
609	const std::string& code_s = code_m.str();
610	if (!code_m.matched) {
611	DCHECK(!width_m.matched); // python errors on invalid format operators
612	break;
613	}
614	DCHECK(code_s.size() == 1);
615	switch (code_s[0]) {
616	case '%': {
617	str_to_add = code_s.c_str();
618	add_len = 1;
619	break;
620	}
621	case 's': {
622	BigStr* s = va_arg(args, BigStr*);
623	// Check type unconditionally because mycpp doesn't always check it
624	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
625
626	str_to_add = s->data();
627	add_len = len(s);
628	zero_pad = false; // python ignores the 0 directive for strings
629	break;
630	}
631	case 'r': {
632	BigStr* s = va_arg(args, BigStr*);
633	// Check type unconditionally because mycpp doesn't always check it
634	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
635
636	s = repr(s);
637	str_to_add = s->data();
638	add_len = len(s);
639	zero_pad = false; // python ignores the 0 directive for strings
640	break;
641	}
642	case 'd': // fallthrough
643	case 'o': {
644	int d = va_arg(args, int);
645	add_len = snprintf(int_buf, kMaxFmtWidth,
646	match.str().c_str() + lit_s.size(), d);
647	DCHECK(add_len > 0);
648	str_to_add = int_buf;
649	break;
650	}
651	default:
652	DCHECK(0);
653	break;
654	}
655	DCHECK(str_to_add != nullptr);
656
657	if (pad_back) {
658	buf.append(str_to_add, add_len);
659	}
660	if (add_len < width) {
661	for (int i = 0; i < width - add_len; ++i) {
662	buf.push_back(zero_pad ? '0' : ' ');
663	}
664	}
665	if (!pad_back) {
666	buf.append(str_to_add, add_len);
667	}
668	}
669
670	return StrFromC(buf.c_str(), buf.size());
671	}
672
673	BigStr* StrIter::Value() { // similar to at()
674	BigStr* result = NewStr(1);
675	result->data_[0] = s_->data_[i_];
676	DCHECK(result->data_[1] == '\0');
677	return result;
678	}
679
680	BigStr* StrFormat(const char* fmt, ...) {
681	va_list args;
682	va_start(args, fmt);
683	BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
684	va_end(args);
685	return ret;
686	}
687
688	BigStr* StrFormat(BigStr* fmt, ...) {
689	va_list args;
690	va_start(args, fmt);
691	BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
692	va_end(args);
693	return ret;
694	}