OILS / mycpp / gc_str.cc View on Github | oils.pub

674 lines, 416 significant
1#include "mycpp/gc_str.h"
2
3#include <ctype.h> // isalpha(), isdigit()
4#include <stdarg.h>
5
6#include <regex>
7
8#include "mycpp/common.h"
9#include "mycpp/gc_alloc.h" // NewStr()
10#include "mycpp/gc_builtins.h" // StringToInt()
11#include "mycpp/gc_list.h" // join(), split() use it
12
13GLOBAL_STR(kEmptyString, "");
14
15static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16static const int kMaxFmtWidth = 256; // arbitrary...
17
18int BigStr::find(BigStr* needle, int start, int end) {
19 if (end == -1) {
20 end = len(this);
21 }
22 int needle_len = len(needle);
23
24 if (needle_len > (end - start)) {
25 return -1; // needle is too long to be found (Python behavior)
26 }
27
28 if (needle_len == 1) {
29 char c = needle->data_[0];
30 // For 'aaa'.find('a', 0, 1)
31 // end = 1, needle_len = 1, last_start = 1 which means we go through once
32 for (int i = start; i < end; ++i) {
33 if (data_[i] == c) {
34 return i;
35 }
36 }
37 } else {
38 // Note: this works for finding the empty string. Empty string is found in
39 // empty range like [5, 5), but not in [5, 4)
40
41 // For 'aaa'.find('aa', 0, 2)
42 // end = 2, needle_len = 2, last_start = 1 which means we go through once
43
44 int last_start = end - needle_len + 1;
45 // could use a smarter substring search algorithm
46 for (int i = start; i < last_start; ++i) {
47 if (memcmp(data_ + i, needle->data_, needle_len) == 0) {
48 return i;
49 }
50 }
51 }
52 return -1;
53}
54
55int BigStr::rfind(BigStr* needle) {
56 int length = len(this);
57 DCHECK(len(needle) == 1); // Oils usage
58 char c = needle->data_[0];
59 for (int i = length - 1; i >= 0; --i) {
60 if (data_[i] == c) {
61 return i;
62 }
63 }
64 return -1;
65}
66
67bool BigStr::isdigit() {
68 int n = len(this);
69 if (n == 0) {
70 return false; // special case
71 }
72 for (int i = 0; i < n; ++i) {
73 if (!::isdigit(data_[i])) {
74 return false;
75 }
76 }
77 return true;
78}
79
80bool BigStr::isalpha() {
81 int n = len(this);
82 if (n == 0) {
83 return false; // special case
84 }
85 for (int i = 0; i < n; ++i) {
86 if (!::isalpha(data_[i])) {
87 return false;
88 }
89 }
90 return true;
91}
92
93// e.g. for osh/braces.py
94bool BigStr::isupper() {
95 int n = len(this);
96 if (n == 0) {
97 return false; // special case
98 }
99 for (int i = 0; i < n; ++i) {
100 if (!::isupper(data_[i])) {
101 return false;
102 }
103 }
104 return true;
105}
106
107bool BigStr::startswith(BigStr* s) {
108 int n = len(s);
109 if (n > len(this)) {
110 return false;
111 }
112 return memcmp(data_, s->data_, n) == 0;
113}
114
115bool BigStr::endswith(BigStr* s) {
116 int len_s = len(s);
117 int len_this = len(this);
118 if (len_s > len_this) {
119 return false;
120 }
121 const char* start = data_ + len_this - len_s;
122 return memcmp(start, s->data_, len_s) == 0;
123}
124
125// Get a string with one character
126BigStr* BigStr::at(int i) {
127 int length = len(this);
128 if (i < 0) {
129 i = length + i;
130 }
131 DCHECK(0 <= i);
132 DCHECK(i < length); // had a problem here!
133
134 BigStr* result = NewStr(1);
135 result->data_[0] = data_[i];
136 return result;
137}
138
139// s[begin:]
140BigStr* BigStr::slice(int begin) {
141 return slice(begin, len(this));
142}
143
144// s[begin:end]
145BigStr* BigStr::slice(int begin, int end) {
146 int length = len(this);
147 SLICE_ADJUST(begin, end, length);
148
149 DCHECK(0 <= begin && begin <= length);
150 DCHECK(0 <= end && end <= length);
151
152 int new_len = end - begin;
153 DCHECK(0 <= new_len && new_len <= length);
154
155 BigStr* result = NewStr(new_len); // has kEmptyString optimization
156 memcpy(result->data_, data_ + begin, new_len);
157
158 return result;
159}
160
161// Used by 'help' builtin and --help, neither of which translate yet.
162
163List<BigStr*>* BigStr::splitlines(bool keep) {
164 DCHECK(keep == true);
165 FAIL(kNotImplemented);
166}
167
168BigStr* BigStr::upper() {
169 int length = len(this);
170 BigStr* result = NewStr(length);
171 char* buffer = result->data();
172 for (int char_index = 0; char_index < length; ++char_index) {
173 buffer[char_index] = toupper(data_[char_index]);
174 }
175 return result;
176}
177
178BigStr* BigStr::lower() {
179 int length = len(this);
180 BigStr* result = NewStr(length);
181 char* buffer = result->data();
182 for (int char_index = 0; char_index < length; ++char_index) {
183 buffer[char_index] = tolower(data_[char_index]);
184 }
185 return result;
186}
187
188BigStr* BigStr::ljust(int width, BigStr* fillchar) {
189 DCHECK(len(fillchar) == 1);
190
191 int length = len(this);
192 int num_fill = width - length;
193 if (num_fill < 0) {
194 return this;
195 } else {
196 BigStr* result = NewStr(width);
197 char c = fillchar->data_[0];
198 memcpy(result->data_, data_, length);
199 for (int i = length; i < width; ++i) {
200 result->data_[i] = c;
201 }
202 return result;
203 }
204}
205
206BigStr* BigStr::rjust(int width, BigStr* fillchar) {
207 DCHECK(len(fillchar) == 1);
208
209 int length = len(this);
210 int num_fill = width - length;
211 if (num_fill < 0) {
212 return this;
213 } else {
214 BigStr* result = NewStr(width);
215 char c = fillchar->data_[0];
216 for (int i = 0; i < num_fill; ++i) {
217 result->data_[i] = c;
218 }
219 memcpy(result->data_ + num_fill, data_, length);
220 return result;
221 }
222}
223
224BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
225 // Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
226 return replace(old, new_str, -1);
227}
228
229BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
230 // log("replacing %s with %s", old_data, new_str->data_);
231 const char* old_data = old->data_;
232
233 int this_len = len(this);
234 int old_len = len(old);
235
236 const char* last_possible = data_ + this_len - old_len;
237
238 const char* p_this = data_; // advances through 'this'
239
240 // First pass: Calculate number of replacements, and hence new length
241 int replace_count = 0;
242 if (old_len == 0) {
243 replace_count = this_len + 1;
244 if (count > 0) {
245 replace_count = min(replace_count, count);
246 }
247 } else {
248 while (p_this <= last_possible) {
249 if (replace_count != count && // limit replacements (if count != -1)
250 memcmp(p_this, old_data, old_len) == 0) { // equal
251 replace_count++;
252 p_this += old_len;
253 } else {
254 p_this++;
255 }
256 }
257 }
258
259 // log("replacements %d", replace_count);
260
261 if (replace_count == 0) {
262 return this; // Reuse the string if there were no replacements
263 }
264
265 int new_str_len = len(new_str);
266 int result_len =
267 this_len - (replace_count * old_len) + (replace_count * new_str_len);
268
269 BigStr* result = NewStr(result_len);
270
271 const char* new_data = new_str->data_;
272 const size_t new_len = new_str_len;
273
274 // Second pass: Copy pieces into 'result'
275 p_this = data_; // back to beginning
276 char* p_result = result->data_; // advances through 'result'
277 replace_count = 0;
278
279 if (old_len == 0) {
280 // Should place new_str between each char in this
281 while (p_this < last_possible && replace_count != count) {
282 replace_count++;
283 memcpy(p_result, new_data, new_len); // Copy from new_str
284 p_result += new_len; // Move past new_str
285
286 // Write a char from this
287 *p_result = *p_this;
288 p_this++;
289 p_result++;
290 }
291
292 if (replace_count != count) {
293 // Write a copy of new_str at the end
294 assert(p_this == last_possible);
295 memcpy(p_result, new_data, new_len);
296 } else if (p_this <= last_possible) {
297 // Write the last part of string
298 memcpy(p_result, p_this, data_ + this_len - p_this);
299 }
300 } else {
301 while (p_this <= last_possible) {
302 // Note: would be more efficient if we remembered the match positions
303 if (replace_count != count && // limit replacements (if count != -1)
304 memcmp(p_this, old_data, old_len) == 0) { // equal
305 memcpy(p_result, new_data, new_len); // Copy from new_str
306 replace_count++;
307 p_result += new_len;
308 p_this += old_len;
309 } else { // copy 1 byte
310 *p_result = *p_this;
311 p_result++;
312 p_this++;
313 }
314 }
315 memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
316 }
317
318 return result;
319}
320
321enum class StripWhere {
322 Left,
323 Right,
324 Both,
325};
326
327const int kWhitespace = -1;
328
329bool OmitChar(int ch, int what) {
330 if (what == kWhitespace) {
331 // Intentional incompatibility with Python, where say \v is whitespace
332 // '\v'.strip() == ''
333 //
334 // But it is consistent with the JSON spec [ \t\r\n] and the rules in
335 // frontend/lexer_def.py
336 //
337 // Note that the YSH is separate, and Str => trim() respects Unicode.
338 return IsAsciiWhitespace(ch);
339 } else {
340 return what == ch;
341 }
342}
343
344bool OmitCharMany(int ch, BigStr* chars) {
345 for (int i = 0; i < len(chars); ++i) {
346 if (ch == chars->data_[i]) {
347 return true;
348 }
349 }
350 return false;
351}
352
353// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
354// implement 6 functions:
355//
356// strip / lstrip / rstrip
357// strip(char) / lstrip(char) / rstrip(char)
358//
359// Args:
360// where: which ends to strip from
361// what: kWhitespace, or an ASCII code 0-255
362
363BigStr* StripAny(BigStr* s, StripWhere where, int what) {
364 int length = len(s);
365 const char* char_data = s->data();
366
367 int i = 0;
368 if (where != StripWhere::Right) {
369 while (i < length && OmitChar(char_data[i], what)) {
370 i++;
371 }
372 }
373
374 int j = length;
375 if (where != StripWhere::Left) {
376 do {
377 j--;
378 } while (j >= i && OmitChar(char_data[j], what));
379 j++;
380 }
381
382 if (i == j) { // Optimization to reuse existing object
383 return kEmptyString;
384 }
385
386 if (i == 0 && j == length) { // nothing stripped
387 return s;
388 }
389
390 // Note: makes a copy in leaky version, and will in GC version too
391 int new_len = j - i;
392 BigStr* result = NewStr(new_len);
393 memcpy(result->data(), s->data() + i, new_len);
394 return result;
395}
396
397BigStr* BigStr::strip() {
398 return StripAny(this, StripWhere::Both, kWhitespace);
399}
400
401// Used for CommandSub in osh/cmd_exec.py
402BigStr* BigStr::rstrip(BigStr* chars) {
403 int num_chars = len(chars);
404 if (num_chars == 0) {
405 return this;
406 }
407
408 // multiple chars, for word splitting
409 if (num_chars > 1) {
410 const char* char_data = data_;
411 int j = len(this);
412 do {
413 j--;
414 } while (j >= 0 && OmitCharMany(data_[j], chars));
415 j++;
416
417 int new_len = j;
418 BigStr* result = NewStr(new_len);
419 memcpy(result->data(), data_, new_len);
420 return result;
421 }
422
423 // exactly 1 char
424 int c = chars->data_[0];
425 return StripAny(this, StripWhere::Right, c);
426}
427
428BigStr* BigStr::rstrip() {
429 return StripAny(this, StripWhere::Right, kWhitespace);
430}
431
432BigStr* BigStr::lstrip(BigStr* chars) {
433 DCHECK(len(chars) == 1);
434 int c = chars->data_[0];
435 return StripAny(this, StripWhere::Left, c);
436}
437
438BigStr* BigStr::lstrip() {
439 return StripAny(this, StripWhere::Left, kWhitespace);
440}
441
442BigStr* BigStr::join(List<BigStr*>* items) {
443 int length = 0;
444
445 int num_parts = len(items);
446
447 // " ".join([]) == ""
448 if (num_parts == 0) {
449 return kEmptyString;
450 }
451
452 // Common case
453 // 'anything'.join(["foo"]) == "foo"
454 if (num_parts == 1) {
455 return items->at(0);
456 }
457
458 for (int i = 0; i < num_parts; ++i) {
459 length += len(items->at(i));
460 }
461 // add length of all the separators
462 int this_len = len(this);
463 length += this_len * (num_parts - 1);
464
465 BigStr* result = NewStr(length);
466 char* p_result = result->data_; // advances through
467
468 for (int i = 0; i < num_parts; ++i) {
469 // log("i %d", i);
470 if (i != 0 && this_len) { // optimize common case of ''.join()
471 memcpy(p_result, data_, this_len); // copy the separator
472 p_result += this_len;
473 // log("this_len %d", this_len);
474 }
475
476 int n = len(items->at(i));
477 // log("n: %d", n);
478 memcpy(p_result, items->at(i)->data_, n); // copy the list item
479 p_result += n;
480 }
481
482 return result;
483}
484
485static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
486 int new_len = right - left;
487 BigStr* part;
488 if (new_len == 0) {
489 part = kEmptyString;
490 } else {
491 part = NewStr(new_len);
492 memcpy(part->data_, s->data_ + left, new_len);
493 }
494 result->append(part);
495}
496
497// Split BigStr into List<BigStr*> of parts separated by 'sep'.
498// The code structure is taken from CPython's Objects/stringlib/split.h.
499List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
500 DCHECK(sep != nullptr);
501 DCHECK(len(sep) == 1); // we can only split one char
502 char sep_char = sep->data_[0];
503
504 int str_len = len(this);
505 if (str_len == 0) {
506 // weird case consistent with Python: ''.split(':') == ['']
507 return NewList<BigStr*>({kEmptyString});
508 }
509
510 List<BigStr*>* result = NewList<BigStr*>({});
511 int left = 0;
512 int right = 0;
513 int num_parts = 0; // 3 splits results in 4 parts
514
515 while (right < str_len && num_parts < max_split) {
516 // search for separator
517 for (; right < str_len; right++) {
518 if (data_[right] == sep_char) {
519 AppendPart(result, this, left, right);
520 right++;
521 left = right;
522 num_parts++;
523 break;
524 }
525 }
526 }
527 if (num_parts == 0) { // Optimization when there is no split
528 result->append(this);
529 } else if (left <= str_len) { // Last part
530 AppendPart(result, this, left, str_len);
531 }
532
533 return result;
534}
535
536List<BigStr*>* BigStr::split(BigStr* sep) {
537 return this->split(sep, len(this));
538}
539
540unsigned BigStr::hash(HashFunc h) {
541 if (!is_hashed_) {
542 hash_ = h(data_, len(this)) >> 1;
543 is_hashed_ = 1;
544 }
545 return hash_;
546}
547
548static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
549 auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
550 auto end = std::cregex_iterator();
551
552 char int_buf[kMaxFmtWidth];
553 std::string buf;
554 for (std::cregex_iterator it = beg; it != end; ++it) {
555 const std::cmatch& match = *it;
556
557 const std::csub_match& lit_m = match[1];
558 DCHECK(lit_m.matched);
559 const std::string& lit_s = lit_m.str();
560 buf.append(lit_s);
561
562 int width = 0;
563 bool zero_pad = false;
564 bool pad_back = false;
565 const std::csub_match& width_m = match[2];
566 const std::string& width_s = width_m.str();
567 bool ok = false;
568 if (width_m.matched && !width_s.empty()) {
569 if (width_s[0] == '0') {
570 zero_pad = true;
571 DCHECK(width_s.size() > 1);
572 ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
573 DCHECK(ok);
574 (void)ok; // silence unused var warning in opt
575 } else {
576 ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
577 DCHECK(ok);
578 }
579 if (width < 0) {
580 pad_back = true;
581 width *= -1;
582 }
583 DCHECK(0 <= width && width < kMaxFmtWidth);
584 }
585
586 char const* str_to_add = nullptr;
587 int add_len = 0;
588 const std::csub_match& code_m = match[3];
589 const std::string& code_s = code_m.str();
590 if (!code_m.matched) {
591 DCHECK(!width_m.matched); // python errors on invalid format operators
592 break;
593 }
594 DCHECK(code_s.size() == 1);
595 switch (code_s[0]) {
596 case '%': {
597 str_to_add = code_s.c_str();
598 add_len = 1;
599 break;
600 }
601 case 's': {
602 BigStr* s = va_arg(args, BigStr*);
603 // Check type unconditionally because mycpp doesn't always check it
604 CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
605
606 str_to_add = s->data();
607 add_len = len(s);
608 zero_pad = false; // python ignores the 0 directive for strings
609 break;
610 }
611 case 'r': {
612 BigStr* s = va_arg(args, BigStr*);
613 // Check type unconditionally because mycpp doesn't always check it
614 CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
615
616 s = repr(s);
617 str_to_add = s->data();
618 add_len = len(s);
619 zero_pad = false; // python ignores the 0 directive for strings
620 break;
621 }
622 case 'd': // fallthrough
623 case 'o': {
624 int d = va_arg(args, int);
625 add_len = snprintf(int_buf, kMaxFmtWidth,
626 match.str().c_str() + lit_s.size(), d);
627 DCHECK(add_len > 0);
628 str_to_add = int_buf;
629 break;
630 }
631 default:
632 DCHECK(0);
633 break;
634 }
635 DCHECK(str_to_add != nullptr);
636
637 if (pad_back) {
638 buf.append(str_to_add, add_len);
639 }
640 if (add_len < width) {
641 for (int i = 0; i < width - add_len; ++i) {
642 buf.push_back(zero_pad ? '0' : ' ');
643 }
644 }
645 if (!pad_back) {
646 buf.append(str_to_add, add_len);
647 }
648 }
649
650 return StrFromC(buf.c_str(), buf.size());
651}
652
653BigStr* StrIter::Value() { // similar to at()
654 BigStr* result = NewStr(1);
655 result->data_[0] = s_->data_[i_];
656 DCHECK(result->data_[1] == '\0');
657 return result;
658}
659
660BigStr* StrFormat(const char* fmt, ...) {
661 va_list args;
662 va_start(args, fmt);
663 BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
664 va_end(args);
665 return ret;
666}
667
668BigStr* StrFormat(BigStr* fmt, ...) {
669 va_list args;
670 va_start(args, fmt);
671 BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
672 va_end(args);
673 return ret;
674}