OILS / mycpp / gc_str.cc View on Github | oils.pub

694 lines, 429 significant
1#include "mycpp/gc_str.h"
2
3#include <ctype.h> // isalpha(), isdigit()
4#include <stdarg.h>
5
6#include <regex>
7
8#include "mycpp/common.h"
9#include "mycpp/gc_alloc.h" // NewStr()
10#include "mycpp/gc_builtins.h" // StringToInt()
11#include "mycpp/gc_list.h" // join(), split() use it
12
13GLOBAL_STR(kEmptyString, "");
14
15static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16static const int kMaxFmtWidth = 256; // arbitrary...
17
18int BigStr::internal_find(BigStr* needle, int direction, int start, int end) {
19 if (end == -1) {
20 end = len(this);
21 }
22 int needle_len = len(needle);
23
24 if (needle_len > (end - start)) {
25 return -1; // needle is too long to be found (Python behavior)
26 }
27
28 if (start > len(this)) {
29 return -1;
30 }
31 // Handle negative indices
32 if (start < 0) {
33 start = len(this) + start;
34 }
35 end = std::min(end, len(this));
36 if (end < 0) {
37 end = len(this) + end;
38 }
39
40 int last_start;
41 int cutoff_start = start;
42 int delta = direction;
43 std::function<bool(int, int)> comparator;
44 if (direction == 1) {
45 last_start = end - needle_len + 1;
46 comparator = std::less<int>{};
47 } else {
48 last_start = start;
49 cutoff_start = end - needle_len;
50 std::swap(start, end);
51 comparator = std::greater_equal<int>{};
52 }
53
54 if (needle_len == 1) {
55 char c = needle->data_[0];
56 // For 'aaa'.find('a', 0, 1)
57 // end = 1, needle_len = 1, last_start = 1 which means we go through once
58 for (int i = cutoff_start; comparator(i, end); i += delta) {
59 if (data_[i] == c) {
60 return i;
61 }
62 }
63 } else {
64 // Note: this works for finding the empty string. Empty string is found in
65 // empty range like [5, 5), but not in [5, 4)
66
67 // For 'aaa'.find('aa', 0, 2)
68 // end = 2, needle_len = 2, last_start = 1 which means we go through once
69
70 // could use a smarter substring search algorithm
71 for (int i = cutoff_start; comparator(i, last_start); i += delta) {
72 if (memcmp(data_ + i, needle->data_, needle_len) == 0) {
73 return i;
74 }
75 }
76 }
77 return -1;
78}
79
80int BigStr::find(BigStr* needle, int start, int end) {
81 return internal_find(needle, 1, start, end);
82}
83
84int BigStr::rfind(BigStr* needle, int start, int end) {
85 return internal_find(needle, -1, start, end);
86}
87
88bool BigStr::isdigit() {
89 int n = len(this);
90 if (n == 0) {
91 return false; // special case
92 }
93 for (int i = 0; i < n; ++i) {
94 if (!::isdigit(data_[i])) {
95 return false;
96 }
97 }
98 return true;
99}
100
101bool BigStr::isalpha() {
102 int n = len(this);
103 if (n == 0) {
104 return false; // special case
105 }
106 for (int i = 0; i < n; ++i) {
107 if (!::isalpha(data_[i])) {
108 return false;
109 }
110 }
111 return true;
112}
113
114// e.g. for osh/braces.py
115bool BigStr::isupper() {
116 int n = len(this);
117 if (n == 0) {
118 return false; // special case
119 }
120 for (int i = 0; i < n; ++i) {
121 if (!::isupper(data_[i])) {
122 return false;
123 }
124 }
125 return true;
126}
127
128bool BigStr::startswith(BigStr* s) {
129 int n = len(s);
130 if (n > len(this)) {
131 return false;
132 }
133 return memcmp(data_, s->data_, n) == 0;
134}
135
136bool BigStr::endswith(BigStr* s) {
137 int len_s = len(s);
138 int len_this = len(this);
139 if (len_s > len_this) {
140 return false;
141 }
142 const char* start = data_ + len_this - len_s;
143 return memcmp(start, s->data_, len_s) == 0;
144}
145
146// Get a string with one character
147BigStr* BigStr::at(int i) {
148 int length = len(this);
149 if (i < 0) {
150 i = length + i;
151 }
152 DCHECK(0 <= i);
153 DCHECK(i < length); // had a problem here!
154
155 BigStr* result = NewStr(1);
156 result->data_[0] = data_[i];
157 return result;
158}
159
160// s[begin:]
161BigStr* BigStr::slice(int begin) {
162 return slice(begin, len(this));
163}
164
165// s[begin:end]
166BigStr* BigStr::slice(int begin, int end) {
167 int length = len(this);
168 SLICE_ADJUST(begin, end, length);
169
170 DCHECK(0 <= begin && begin <= length);
171 DCHECK(0 <= end && end <= length);
172
173 int new_len = end - begin;
174 DCHECK(0 <= new_len && new_len <= length);
175
176 BigStr* result = NewStr(new_len); // has kEmptyString optimization
177 memcpy(result->data_, data_ + begin, new_len);
178
179 return result;
180}
181
182// Used by 'help' builtin and --help, neither of which translate yet.
183
184List<BigStr*>* BigStr::splitlines(bool keep) {
185 DCHECK(keep == true);
186 FAIL(kNotImplemented);
187}
188
189BigStr* BigStr::upper() {
190 int length = len(this);
191 BigStr* result = NewStr(length);
192 char* buffer = result->data();
193 for (int char_index = 0; char_index < length; ++char_index) {
194 buffer[char_index] = toupper(data_[char_index]);
195 }
196 return result;
197}
198
199BigStr* BigStr::lower() {
200 int length = len(this);
201 BigStr* result = NewStr(length);
202 char* buffer = result->data();
203 for (int char_index = 0; char_index < length; ++char_index) {
204 buffer[char_index] = tolower(data_[char_index]);
205 }
206 return result;
207}
208
209BigStr* BigStr::ljust(int width, BigStr* fillchar) {
210 DCHECK(len(fillchar) == 1);
211
212 int length = len(this);
213 int num_fill = width - length;
214 if (num_fill < 0) {
215 return this;
216 } else {
217 BigStr* result = NewStr(width);
218 char c = fillchar->data_[0];
219 memcpy(result->data_, data_, length);
220 for (int i = length; i < width; ++i) {
221 result->data_[i] = c;
222 }
223 return result;
224 }
225}
226
227BigStr* BigStr::rjust(int width, BigStr* fillchar) {
228 DCHECK(len(fillchar) == 1);
229
230 int length = len(this);
231 int num_fill = width - length;
232 if (num_fill < 0) {
233 return this;
234 } else {
235 BigStr* result = NewStr(width);
236 char c = fillchar->data_[0];
237 for (int i = 0; i < num_fill; ++i) {
238 result->data_[i] = c;
239 }
240 memcpy(result->data_ + num_fill, data_, length);
241 return result;
242 }
243}
244
245BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
246 // Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
247 return replace(old, new_str, -1);
248}
249
250BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
251 // log("replacing %s with %s", old_data, new_str->data_);
252 const char* old_data = old->data_;
253
254 int this_len = len(this);
255 int old_len = len(old);
256
257 const char* last_possible = data_ + this_len - old_len;
258
259 const char* p_this = data_; // advances through 'this'
260
261 // First pass: Calculate number of replacements, and hence new length
262 int replace_count = 0;
263 if (old_len == 0) {
264 replace_count = this_len + 1;
265 if (count > 0) {
266 replace_count = min(replace_count, count);
267 }
268 } else {
269 while (p_this <= last_possible) {
270 if (replace_count != count && // limit replacements (if count != -1)
271 memcmp(p_this, old_data, old_len) == 0) { // equal
272 replace_count++;
273 p_this += old_len;
274 } else {
275 p_this++;
276 }
277 }
278 }
279
280 // log("replacements %d", replace_count);
281
282 if (replace_count == 0) {
283 return this; // Reuse the string if there were no replacements
284 }
285
286 int new_str_len = len(new_str);
287 int result_len =
288 this_len - (replace_count * old_len) + (replace_count * new_str_len);
289
290 BigStr* result = NewStr(result_len);
291
292 const char* new_data = new_str->data_;
293 const size_t new_len = new_str_len;
294
295 // Second pass: Copy pieces into 'result'
296 p_this = data_; // back to beginning
297 char* p_result = result->data_; // advances through 'result'
298 replace_count = 0;
299
300 if (old_len == 0) {
301 // Should place new_str between each char in this
302 while (p_this < last_possible && replace_count != count) {
303 replace_count++;
304 memcpy(p_result, new_data, new_len); // Copy from new_str
305 p_result += new_len; // Move past new_str
306
307 // Write a char from this
308 *p_result = *p_this;
309 p_this++;
310 p_result++;
311 }
312
313 if (replace_count != count) {
314 // Write a copy of new_str at the end
315 assert(p_this == last_possible);
316 memcpy(p_result, new_data, new_len);
317 } else if (p_this <= last_possible) {
318 // Write the last part of string
319 memcpy(p_result, p_this, data_ + this_len - p_this);
320 }
321 } else {
322 while (p_this <= last_possible) {
323 // Note: would be more efficient if we remembered the match positions
324 if (replace_count != count && // limit replacements (if count != -1)
325 memcmp(p_this, old_data, old_len) == 0) { // equal
326 memcpy(p_result, new_data, new_len); // Copy from new_str
327 replace_count++;
328 p_result += new_len;
329 p_this += old_len;
330 } else { // copy 1 byte
331 *p_result = *p_this;
332 p_result++;
333 p_this++;
334 }
335 }
336 memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
337 }
338
339 return result;
340}
341
342enum class StripWhere {
343 Left,
344 Right,
345 Both,
346};
347
348const int kWhitespace = -1;
349
350bool OmitChar(int ch, int what) {
351 if (what == kWhitespace) {
352 // Intentional incompatibility with Python, where say \v is whitespace
353 // '\v'.strip() == ''
354 //
355 // But it is consistent with the JSON spec [ \t\r\n] and the rules in
356 // frontend/lexer_def.py
357 //
358 // Note that the YSH is separate, and Str => trim() respects Unicode.
359 return IsAsciiWhitespace(ch);
360 } else {
361 return what == ch;
362 }
363}
364
365bool OmitCharMany(int ch, BigStr* chars) {
366 for (int i = 0; i < len(chars); ++i) {
367 if (ch == chars->data_[i]) {
368 return true;
369 }
370 }
371 return false;
372}
373
374// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
375// implement 6 functions:
376//
377// strip / lstrip / rstrip
378// strip(char) / lstrip(char) / rstrip(char)
379//
380// Args:
381// where: which ends to strip from
382// what: kWhitespace, or an ASCII code 0-255
383
384BigStr* StripAny(BigStr* s, StripWhere where, int what) {
385 int length = len(s);
386 const char* char_data = s->data();
387
388 int i = 0;
389 if (where != StripWhere::Right) {
390 while (i < length && OmitChar(char_data[i], what)) {
391 i++;
392 }
393 }
394
395 int j = length;
396 if (where != StripWhere::Left) {
397 do {
398 j--;
399 } while (j >= i && OmitChar(char_data[j], what));
400 j++;
401 }
402
403 if (i == j) { // Optimization to reuse existing object
404 return kEmptyString;
405 }
406
407 if (i == 0 && j == length) { // nothing stripped
408 return s;
409 }
410
411 // Note: makes a copy in leaky version, and will in GC version too
412 int new_len = j - i;
413 BigStr* result = NewStr(new_len);
414 memcpy(result->data(), s->data() + i, new_len);
415 return result;
416}
417
418BigStr* BigStr::strip() {
419 return StripAny(this, StripWhere::Both, kWhitespace);
420}
421
422// Used for CommandSub in osh/cmd_exec.py
423BigStr* BigStr::rstrip(BigStr* chars) {
424 int num_chars = len(chars);
425 if (num_chars == 0) {
426 return this;
427 }
428
429 // multiple chars, for word splitting
430 if (num_chars > 1) {
431 int j = len(this);
432 do {
433 j--;
434 } while (j >= 0 && OmitCharMany(data_[j], chars));
435 j++;
436
437 int new_len = j;
438 BigStr* result = NewStr(new_len);
439 memcpy(result->data(), data_, new_len);
440 return result;
441 }
442
443 // exactly 1 char
444 int c = chars->data_[0];
445 return StripAny(this, StripWhere::Right, c);
446}
447
448BigStr* BigStr::rstrip() {
449 return StripAny(this, StripWhere::Right, kWhitespace);
450}
451
452BigStr* BigStr::lstrip(BigStr* chars) {
453 DCHECK(len(chars) == 1);
454 int c = chars->data_[0];
455 return StripAny(this, StripWhere::Left, c);
456}
457
458BigStr* BigStr::lstrip() {
459 return StripAny(this, StripWhere::Left, kWhitespace);
460}
461
462BigStr* BigStr::join(List<BigStr*>* items) {
463 int length = 0;
464
465 int num_parts = len(items);
466
467 // " ".join([]) == ""
468 if (num_parts == 0) {
469 return kEmptyString;
470 }
471
472 // Common case
473 // 'anything'.join(["foo"]) == "foo"
474 if (num_parts == 1) {
475 return items->at(0);
476 }
477
478 for (int i = 0; i < num_parts; ++i) {
479 length += len(items->at(i));
480 }
481 // add length of all the separators
482 int this_len = len(this);
483 length += this_len * (num_parts - 1);
484
485 BigStr* result = NewStr(length);
486 char* p_result = result->data_; // advances through
487
488 for (int i = 0; i < num_parts; ++i) {
489 // log("i %d", i);
490 if (i != 0 && this_len) { // optimize common case of ''.join()
491 memcpy(p_result, data_, this_len); // copy the separator
492 p_result += this_len;
493 // log("this_len %d", this_len);
494 }
495
496 int n = len(items->at(i));
497 // log("n: %d", n);
498 memcpy(p_result, items->at(i)->data_, n); // copy the list item
499 p_result += n;
500 }
501
502 return result;
503}
504
505static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
506 int new_len = right - left;
507 BigStr* part;
508 if (new_len == 0) {
509 part = kEmptyString;
510 } else {
511 part = NewStr(new_len);
512 memcpy(part->data_, s->data_ + left, new_len);
513 }
514 result->append(part);
515}
516
517// Split BigStr into List<BigStr*> of parts separated by 'sep'.
518// The code structure is taken from CPython's Objects/stringlib/split.h.
519List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
520 DCHECK(sep != nullptr);
521 DCHECK(len(sep) == 1); // we can only split one char
522 char sep_char = sep->data_[0];
523
524 int str_len = len(this);
525 if (str_len == 0) {
526 // weird case consistent with Python: ''.split(':') == ['']
527 return NewList<BigStr*>({kEmptyString});
528 }
529
530 List<BigStr*>* result = NewList<BigStr*>({});
531 int left = 0;
532 int right = 0;
533 int num_parts = 0; // 3 splits results in 4 parts
534
535 while (right < str_len && num_parts < max_split) {
536 // search for separator
537 for (; right < str_len; right++) {
538 if (data_[right] == sep_char) {
539 AppendPart(result, this, left, right);
540 right++;
541 left = right;
542 num_parts++;
543 break;
544 }
545 }
546 }
547 if (num_parts == 0) { // Optimization when there is no split
548 result->append(this);
549 } else if (left <= str_len) { // Last part
550 AppendPart(result, this, left, str_len);
551 }
552
553 return result;
554}
555
556List<BigStr*>* BigStr::split(BigStr* sep) {
557 return this->split(sep, len(this));
558}
559
560unsigned BigStr::hash(HashFunc h) {
561 if (!is_hashed_) {
562 hash_ = h(data_, len(this)) >> 1;
563 is_hashed_ = 1;
564 }
565 return hash_;
566}
567
568static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
569 auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
570 auto end = std::cregex_iterator();
571
572 char int_buf[kMaxFmtWidth];
573 std::string buf;
574 for (std::cregex_iterator it = beg; it != end; ++it) {
575 const std::cmatch& match = *it;
576
577 const std::csub_match& lit_m = match[1];
578 DCHECK(lit_m.matched);
579 const std::string& lit_s = lit_m.str();
580 buf.append(lit_s);
581
582 int width = 0;
583 bool zero_pad = false;
584 bool pad_back = false;
585 const std::csub_match& width_m = match[2];
586 const std::string& width_s = width_m.str();
587 bool ok = false;
588 if (width_m.matched && !width_s.empty()) {
589 if (width_s[0] == '0') {
590 zero_pad = true;
591 DCHECK(width_s.size() > 1);
592 ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
593 DCHECK(ok);
594 (void)ok; // silence unused var warning in opt
595 } else {
596 ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
597 DCHECK(ok);
598 }
599 if (width < 0) {
600 pad_back = true;
601 width *= -1;
602 }
603 DCHECK(0 <= width && width < kMaxFmtWidth);
604 }
605
606 char const* str_to_add = nullptr;
607 int add_len = 0;
608 const std::csub_match& code_m = match[3];
609 const std::string& code_s = code_m.str();
610 if (!code_m.matched) {
611 DCHECK(!width_m.matched); // python errors on invalid format operators
612 break;
613 }
614 DCHECK(code_s.size() == 1);
615 switch (code_s[0]) {
616 case '%': {
617 str_to_add = code_s.c_str();
618 add_len = 1;
619 break;
620 }
621 case 's': {
622 BigStr* s = va_arg(args, BigStr*);
623 // Check type unconditionally because mycpp doesn't always check it
624 CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
625
626 str_to_add = s->data();
627 add_len = len(s);
628 zero_pad = false; // python ignores the 0 directive for strings
629 break;
630 }
631 case 'r': {
632 BigStr* s = va_arg(args, BigStr*);
633 // Check type unconditionally because mycpp doesn't always check it
634 CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
635
636 s = repr(s);
637 str_to_add = s->data();
638 add_len = len(s);
639 zero_pad = false; // python ignores the 0 directive for strings
640 break;
641 }
642 case 'd': // fallthrough
643 case 'o': {
644 int d = va_arg(args, int);
645 add_len = snprintf(int_buf, kMaxFmtWidth,
646 match.str().c_str() + lit_s.size(), d);
647 DCHECK(add_len > 0);
648 str_to_add = int_buf;
649 break;
650 }
651 default:
652 DCHECK(0);
653 break;
654 }
655 DCHECK(str_to_add != nullptr);
656
657 if (pad_back) {
658 buf.append(str_to_add, add_len);
659 }
660 if (add_len < width) {
661 for (int i = 0; i < width - add_len; ++i) {
662 buf.push_back(zero_pad ? '0' : ' ');
663 }
664 }
665 if (!pad_back) {
666 buf.append(str_to_add, add_len);
667 }
668 }
669
670 return StrFromC(buf.c_str(), buf.size());
671}
672
673BigStr* StrIter::Value() { // similar to at()
674 BigStr* result = NewStr(1);
675 result->data_[0] = s_->data_[i_];
676 DCHECK(result->data_[1] == '\0');
677 return result;
678}
679
680BigStr* StrFormat(const char* fmt, ...) {
681 va_list args;
682 va_start(args, fmt);
683 BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
684 va_end(args);
685 return ret;
686}
687
688BigStr* StrFormat(BigStr* fmt, ...) {
689 va_list args;
690 va_start(args, fmt);
691 BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
692 va_end(args);
693 return ret;
694}