mycpp/gc_str.cc

OILS / mycpp / gc_str.cc View on Github | oils.pub

674 lines, 416 significant

1	#include "mycpp/gc_str.h"
2
3	#include <ctype.h> // isalpha(), isdigit()
4	#include <stdarg.h>
5
6	#include <regex>
7
8	#include "mycpp/common.h"
9	#include "mycpp/gc_alloc.h" // NewStr()
10	#include "mycpp/gc_builtins.h" // StringToInt()
11	#include "mycpp/gc_list.h" // join(), split() use it
12
13	GLOBAL_STR(kEmptyString, "");
14
15	static const std::regex gStrFmtRegex("([^%])(?:%(-?[0-9])(.))?");
16	static const int kMaxFmtWidth = 256; // arbitrary...
17
18	int BigStr::find(BigStr* needle, int start, int end) {
19	if (end == -1) {
20	end = len(this);
21	}
22	int needle_len = len(needle);
23
24	if (needle_len > (end - start)) {
25	return -1; // needle is too long to be found (Python behavior)
26	}
27
28	if (needle_len == 1) {
29	char c = needle->data_[0];
30	// For 'aaa'.find('a', 0, 1)
31	// end = 1, needle_len = 1, last_start = 1 which means we go through once
32	for (int i = start; i < end; ++i) {
33	if (data_[i] == c) {
34	return i;
35	}
36	}
37	} else {
38	// Note: this works for finding the empty string. Empty string is found in
39	// empty range like [5, 5), but not in [5, 4)
40
41	// For 'aaa'.find('aa', 0, 2)
42	// end = 2, needle_len = 2, last_start = 1 which means we go through once
43
44	int last_start = end - needle_len + 1;
45	// could use a smarter substring search algorithm
46	for (int i = start; i < last_start; ++i) {
47	if (memcmp(data_ + i, needle->data_, needle_len) == 0) {
48	return i;
49	}
50	}
51	}
52	return -1;
53	}
54
55	int BigStr::rfind(BigStr* needle) {
56	int length = len(this);
57	DCHECK(len(needle) == 1); // Oils usage
58	char c = needle->data_[0];
59	for (int i = length - 1; i >= 0; --i) {
60	if (data_[i] == c) {
61	return i;
62	}
63	}
64	return -1;
65	}
66
67	bool BigStr::isdigit() {
68	int n = len(this);
69	if (n == 0) {
70	return false; // special case
71	}
72	for (int i = 0; i < n; ++i) {
73	if (!::isdigit(data_[i])) {
74	return false;
75	}
76	}
77	return true;
78	}
79
80	bool BigStr::isalpha() {
81	int n = len(this);
82	if (n == 0) {
83	return false; // special case
84	}
85	for (int i = 0; i < n; ++i) {
86	if (!::isalpha(data_[i])) {
87	return false;
88	}
89	}
90	return true;
91	}
92
93	// e.g. for osh/braces.py
94	bool BigStr::isupper() {
95	int n = len(this);
96	if (n == 0) {
97	return false; // special case
98	}
99	for (int i = 0; i < n; ++i) {
100	if (!::isupper(data_[i])) {
101	return false;
102	}
103	}
104	return true;
105	}
106
107	bool BigStr::startswith(BigStr* s) {
108	int n = len(s);
109	if (n > len(this)) {
110	return false;
111	}
112	return memcmp(data_, s->data_, n) == 0;
113	}
114
115	bool BigStr::endswith(BigStr* s) {
116	int len_s = len(s);
117	int len_this = len(this);
118	if (len_s > len_this) {
119	return false;
120	}
121	const char* start = data_ + len_this - len_s;
122	return memcmp(start, s->data_, len_s) == 0;
123	}
124
125	// Get a string with one character
126	BigStr* BigStr::at(int i) {
127	int length = len(this);
128	if (i < 0) {
129	i = length + i;
130	}
131	DCHECK(0 <= i);
132	DCHECK(i < length); // had a problem here!
133
134	BigStr* result = NewStr(1);
135	result->data_[0] = data_[i];
136	return result;
137	}
138
139	// s[begin:]
140	BigStr* BigStr::slice(int begin) {
141	return slice(begin, len(this));
142	}
143
144	// s[begin:end]
145	BigStr* BigStr::slice(int begin, int end) {
146	int length = len(this);
147	SLICE_ADJUST(begin, end, length);
148
149	DCHECK(0 <= begin && begin <= length);
150	DCHECK(0 <= end && end <= length);
151
152	int new_len = end - begin;
153	DCHECK(0 <= new_len && new_len <= length);
154
155	BigStr* result = NewStr(new_len); // has kEmptyString optimization
156	memcpy(result->data_, data_ + begin, new_len);
157
158	return result;
159	}
160
161	// Used by 'help' builtin and --help, neither of which translate yet.
162
163	List<BigStr> BigStr::splitlines(bool keep) {
164	DCHECK(keep == true);
165	FAIL(kNotImplemented);
166	}
167
168	BigStr* BigStr::upper() {
169	int length = len(this);
170	BigStr* result = NewStr(length);
171	char* buffer = result->data();
172	for (int char_index = 0; char_index < length; ++char_index) {
173	buffer[char_index] = toupper(data_[char_index]);
174	}
175	return result;
176	}
177
178	BigStr* BigStr::lower() {
179	int length = len(this);
180	BigStr* result = NewStr(length);
181	char* buffer = result->data();
182	for (int char_index = 0; char_index < length; ++char_index) {
183	buffer[char_index] = tolower(data_[char_index]);
184	}
185	return result;
186	}
187
188	BigStr* BigStr::ljust(int width, BigStr* fillchar) {
189	DCHECK(len(fillchar) == 1);
190
191	int length = len(this);
192	int num_fill = width - length;
193	if (num_fill < 0) {
194	return this;
195	} else {
196	BigStr* result = NewStr(width);
197	char c = fillchar->data_[0];
198	memcpy(result->data_, data_, length);
199	for (int i = length; i < width; ++i) {
200	result->data_[i] = c;
201	}
202	return result;
203	}
204	}
205
206	BigStr* BigStr::rjust(int width, BigStr* fillchar) {
207	DCHECK(len(fillchar) == 1);
208
209	int length = len(this);
210	int num_fill = width - length;
211	if (num_fill < 0) {
212	return this;
213	} else {
214	BigStr* result = NewStr(width);
215	char c = fillchar->data_[0];
216	for (int i = 0; i < num_fill; ++i) {
217	result->data_[i] = c;
218	}
219	memcpy(result->data_ + num_fill, data_, length);
220	return result;
221	}
222	}
223
224	BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
225	// Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
226	return replace(old, new_str, -1);
227	}
228
229	BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
230	// log("replacing %s with %s", old_data, new_str->data_);
231	const char* old_data = old->data_;
232
233	int this_len = len(this);
234	int old_len = len(old);
235
236	const char* last_possible = data_ + this_len - old_len;
237
238	const char* p_this = data_; // advances through 'this'
239
240	// First pass: Calculate number of replacements, and hence new length
241	int replace_count = 0;
242	if (old_len == 0) {
243	replace_count = this_len + 1;
244	if (count > 0) {
245	replace_count = min(replace_count, count);
246	}
247	} else {
248	while (p_this <= last_possible) {
249	if (replace_count != count && // limit replacements (if count != -1)
250	memcmp(p_this, old_data, old_len) == 0) { // equal
251	replace_count++;
252	p_this += old_len;
253	} else {
254	p_this++;
255	}
256	}
257	}
258
259	// log("replacements %d", replace_count);
260
261	if (replace_count == 0) {
262	return this; // Reuse the string if there were no replacements
263	}
264
265	int new_str_len = len(new_str);
266	int result_len =
267	this_len - (replace_count * old_len) + (replace_count * new_str_len);
268
269	BigStr* result = NewStr(result_len);
270
271	const char* new_data = new_str->data_;
272	const size_t new_len = new_str_len;
273
274	// Second pass: Copy pieces into 'result'
275	p_this = data_; // back to beginning
276	char* p_result = result->data_; // advances through 'result'
277	replace_count = 0;
278
279	if (old_len == 0) {
280	// Should place new_str between each char in this
281	while (p_this < last_possible && replace_count != count) {
282	replace_count++;
283	memcpy(p_result, new_data, new_len); // Copy from new_str
284	p_result += new_len; // Move past new_str
285
286	// Write a char from this
287	p_result = p_this;
288	p_this++;
289	p_result++;
290	}
291
292	if (replace_count != count) {
293	// Write a copy of new_str at the end
294	assert(p_this == last_possible);
295	memcpy(p_result, new_data, new_len);
296	} else if (p_this <= last_possible) {
297	// Write the last part of string
298	memcpy(p_result, p_this, data_ + this_len - p_this);
299	}
300	} else {
301	while (p_this <= last_possible) {
302	// Note: would be more efficient if we remembered the match positions
303	if (replace_count != count && // limit replacements (if count != -1)
304	memcmp(p_this, old_data, old_len) == 0) { // equal
305	memcpy(p_result, new_data, new_len); // Copy from new_str
306	replace_count++;
307	p_result += new_len;
308	p_this += old_len;
309	} else { // copy 1 byte
310	p_result = p_this;
311	p_result++;
312	p_this++;
313	}
314	}
315	memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
316	}
317
318	return result;
319	}
320
321	enum class StripWhere {
322	Left,
323	Right,
324	Both,
325	};
326
327	const int kWhitespace = -1;
328
329	bool OmitChar(int ch, int what) {
330	if (what == kWhitespace) {
331	// Intentional incompatibility with Python, where say \v is whitespace
332	// '\v'.strip() == ''
333	//
334	// But it is consistent with the JSON spec [ \t\r\n] and the rules in
335	// frontend/lexer_def.py
336	//
337	// Note that the YSH is separate, and Str => trim() respects Unicode.
338	return IsAsciiWhitespace(ch);
339	} else {
340	return what == ch;
341	}
342	}
343
344	bool OmitCharMany(int ch, BigStr* chars) {
345	for (int i = 0; i < len(chars); ++i) {
346	if (ch == chars->data_[i]) {
347	return true;
348	}
349	}
350	return false;
351	}
352
353	// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
354	// implement 6 functions:
355	//
356	// strip / lstrip / rstrip
357	// strip(char) / lstrip(char) / rstrip(char)
358	//
359	// Args:
360	// where: which ends to strip from
361	// what: kWhitespace, or an ASCII code 0-255
362
363	BigStr* StripAny(BigStr* s, StripWhere where, int what) {
364	int length = len(s);
365	const char* char_data = s->data();
366
367	int i = 0;
368	if (where != StripWhere::Right) {
369	while (i < length && OmitChar(char_data[i], what)) {
370	i++;
371	}
372	}
373
374	int j = length;
375	if (where != StripWhere::Left) {
376	do {
377	j--;
378	} while (j >= i && OmitChar(char_data[j], what));
379	j++;
380	}
381
382	if (i == j) { // Optimization to reuse existing object
383	return kEmptyString;
384	}
385
386	if (i == 0 && j == length) { // nothing stripped
387	return s;
388	}
389
390	// Note: makes a copy in leaky version, and will in GC version too
391	int new_len = j - i;
392	BigStr* result = NewStr(new_len);
393	memcpy(result->data(), s->data() + i, new_len);
394	return result;
395	}
396
397	BigStr* BigStr::strip() {
398	return StripAny(this, StripWhere::Both, kWhitespace);
399	}
400
401	// Used for CommandSub in osh/cmd_exec.py
402	BigStr* BigStr::rstrip(BigStr* chars) {
403	int num_chars = len(chars);
404	if (num_chars == 0) {
405	return this;
406	}
407
408	// multiple chars, for word splitting
409	if (num_chars > 1) {
410	const char* char_data = data_;
411	int j = len(this);
412	do {
413	j--;
414	} while (j >= 0 && OmitCharMany(data_[j], chars));
415	j++;
416
417	int new_len = j;
418	BigStr* result = NewStr(new_len);
419	memcpy(result->data(), data_, new_len);
420	return result;
421	}
422
423	// exactly 1 char
424	int c = chars->data_[0];
425	return StripAny(this, StripWhere::Right, c);
426	}
427
428	BigStr* BigStr::rstrip() {
429	return StripAny(this, StripWhere::Right, kWhitespace);
430	}
431
432	BigStr* BigStr::lstrip(BigStr* chars) {
433	DCHECK(len(chars) == 1);
434	int c = chars->data_[0];
435	return StripAny(this, StripWhere::Left, c);
436	}
437
438	BigStr* BigStr::lstrip() {
439	return StripAny(this, StripWhere::Left, kWhitespace);
440	}
441
442	BigStr* BigStr::join(List<BigStr> items) {
443	int length = 0;
444
445	int num_parts = len(items);
446
447	// " ".join([]) == ""
448	if (num_parts == 0) {
449	return kEmptyString;
450	}
451
452	// Common case
453	// 'anything'.join(["foo"]) == "foo"
454	if (num_parts == 1) {
455	return items->at(0);
456	}
457
458	for (int i = 0; i < num_parts; ++i) {
459	length += len(items->at(i));
460	}
461	// add length of all the separators
462	int this_len = len(this);
463	length += this_len * (num_parts - 1);
464
465	BigStr* result = NewStr(length);
466	char* p_result = result->data_; // advances through
467
468	for (int i = 0; i < num_parts; ++i) {
469	// log("i %d", i);
470	if (i != 0 && this_len) { // optimize common case of ''.join()
471	memcpy(p_result, data_, this_len); // copy the separator
472	p_result += this_len;
473	// log("this_len %d", this_len);
474	}
475
476	int n = len(items->at(i));
477	// log("n: %d", n);
478	memcpy(p_result, items->at(i)->data_, n); // copy the list item
479	p_result += n;
480	}
481
482	return result;
483	}
484
485	static void AppendPart(List<BigStr> result, BigStr* s, int left, int right) {
486	int new_len = right - left;
487	BigStr* part;
488	if (new_len == 0) {
489	part = kEmptyString;
490	} else {
491	part = NewStr(new_len);
492	memcpy(part->data_, s->data_ + left, new_len);
493	}
494	result->append(part);
495	}
496
497	// Split BigStr into List<BigStr*> of parts separated by 'sep'.
498	// The code structure is taken from CPython's Objects/stringlib/split.h.
499	List<BigStr> BigStr::split(BigStr* sep, int max_split) {
500	DCHECK(sep != nullptr);
501	DCHECK(len(sep) == 1); // we can only split one char
502	char sep_char = sep->data_[0];
503
504	int str_len = len(this);
505	if (str_len == 0) {
506	// weird case consistent with Python: ''.split(':') == ['']
507	return NewList<BigStr*>({kEmptyString});
508	}
509
510	List<BigStr> result = NewList<BigStr*>({});
511	int left = 0;
512	int right = 0;
513	int num_parts = 0; // 3 splits results in 4 parts
514
515	while (right < str_len && num_parts < max_split) {
516	// search for separator
517	for (; right < str_len; right++) {
518	if (data_[right] == sep_char) {
519	AppendPart(result, this, left, right);
520	right++;
521	left = right;
522	num_parts++;
523	break;
524	}
525	}
526	}
527	if (num_parts == 0) { // Optimization when there is no split
528	result->append(this);
529	} else if (left <= str_len) { // Last part
530	AppendPart(result, this, left, str_len);
531	}
532
533	return result;
534	}
535
536	List<BigStr> BigStr::split(BigStr* sep) {
537	return this->split(sep, len(this));
538	}
539
540	unsigned BigStr::hash(HashFunc h) {
541	if (!is_hashed_) {
542	hash_ = h(data_, len(this)) >> 1;
543	is_hashed_ = 1;
544	}
545	return hash_;
546	}
547
548	static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
549	auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
550	auto end = std::cregex_iterator();
551
552	char int_buf[kMaxFmtWidth];
553	std::string buf;
554	for (std::cregex_iterator it = beg; it != end; ++it) {
555	const std::cmatch& match = *it;
556
557	const std::csub_match& lit_m = match[1];
558	DCHECK(lit_m.matched);
559	const std::string& lit_s = lit_m.str();
560	buf.append(lit_s);
561
562	int width = 0;
563	bool zero_pad = false;
564	bool pad_back = false;
565	const std::csub_match& width_m = match[2];
566	const std::string& width_s = width_m.str();
567	bool ok = false;
568	if (width_m.matched && !width_s.empty()) {
569	if (width_s[0] == '0') {
570	zero_pad = true;
571	DCHECK(width_s.size() > 1);
572	ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
573	DCHECK(ok);
574	(void)ok; // silence unused var warning in opt
575	} else {
576	ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
577	DCHECK(ok);
578	}
579	if (width < 0) {
580	pad_back = true;
581	width *= -1;
582	}
583	DCHECK(0 <= width && width < kMaxFmtWidth);
584	}
585
586	char const* str_to_add = nullptr;
587	int add_len = 0;
588	const std::csub_match& code_m = match[3];
589	const std::string& code_s = code_m.str();
590	if (!code_m.matched) {
591	DCHECK(!width_m.matched); // python errors on invalid format operators
592	break;
593	}
594	DCHECK(code_s.size() == 1);
595	switch (code_s[0]) {
596	case '%': {
597	str_to_add = code_s.c_str();
598	add_len = 1;
599	break;
600	}
601	case 's': {
602	BigStr* s = va_arg(args, BigStr*);
603	// Check type unconditionally because mycpp doesn't always check it
604	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
605
606	str_to_add = s->data();
607	add_len = len(s);
608	zero_pad = false; // python ignores the 0 directive for strings
609	break;
610	}
611	case 'r': {
612	BigStr* s = va_arg(args, BigStr*);
613	// Check type unconditionally because mycpp doesn't always check it
614	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
615
616	s = repr(s);
617	str_to_add = s->data();
618	add_len = len(s);
619	zero_pad = false; // python ignores the 0 directive for strings
620	break;
621	}
622	case 'd': // fallthrough
623	case 'o': {
624	int d = va_arg(args, int);
625	add_len = snprintf(int_buf, kMaxFmtWidth,
626	match.str().c_str() + lit_s.size(), d);
627	DCHECK(add_len > 0);
628	str_to_add = int_buf;
629	break;
630	}
631	default:
632	DCHECK(0);
633	break;
634	}
635	DCHECK(str_to_add != nullptr);
636
637	if (pad_back) {
638	buf.append(str_to_add, add_len);
639	}
640	if (add_len < width) {
641	for (int i = 0; i < width - add_len; ++i) {
642	buf.push_back(zero_pad ? '0' : ' ');
643	}
644	}
645	if (!pad_back) {
646	buf.append(str_to_add, add_len);
647	}
648	}
649
650	return StrFromC(buf.c_str(), buf.size());
651	}
652
653	BigStr* StrIter::Value() { // similar to at()
654	BigStr* result = NewStr(1);
655	result->data_[0] = s_->data_[i_];
656	DCHECK(result->data_[1] == '\0');
657	return result;
658	}
659
660	BigStr* StrFormat(const char* fmt, ...) {
661	va_list args;
662	va_start(args, fmt);
663	BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
664	va_end(args);
665	return ret;
666	}
667
668	BigStr* StrFormat(BigStr* fmt, ...) {
669	va_list args;
670	va_start(args, fmt);
671	BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
672	va_end(args);
673	return ret;
674	}