examples

Coverage Report

Created: 2025-05-15 22:19

/home/uke/oil/mycpp/gc_str.cc
Line
Count
Source (jump to first uncovered line)
1
#include "mycpp/gc_str.h"
2
3
#include <ctype.h>  // isalpha(), isdigit()
4
#include <stdarg.h>
5
6
#include <regex>
7
8
#include "mycpp/common.h"
9
#include "mycpp/gc_alloc.h"     // NewStr()
10
#include "mycpp/gc_builtins.h"  // StringToInt()
11
#include "mycpp/gc_list.h"      // join(), split() use it
12
13
GLOBAL_STR(kEmptyString, "");
14
15
static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16
static const int kMaxFmtWidth = 256;  // arbitrary...
17
18
99
int BigStr::find(BigStr* needle, int start, int end) {
19
99
  if (end == -1) {
20
45
    end = len(this);
21
45
  }
22
99
  int needle_len = len(needle);
23
24
99
  if (needle_len > (end - start)) {
25
15
    return -1;  // needle is too long to be found (Python behavior)
26
15
  }
27
28
84
  if (needle_len == 1) {
29
34
    char c = needle->data_[0];
30
    // For 'aaa'.find('a', 0, 1)
31
    // end = 1, needle_len = 1, last_start = 1 which means we go through once
32
157
    for (int i = start; i < end; ++i) {
33
133
      if (data_[i] == c) {
34
10
        return i;
35
10
      }
36
133
    }
37
50
  } else {
38
    // Note: this works for finding the empty string.  Empty string is found in
39
    // empty range like [5, 5), but not in [5, 4)
40
41
    // For 'aaa'.find('aa', 0, 2)
42
    // end = 2, needle_len = 2, last_start = 1 which means we go through once
43
44
50
    int last_start = end - needle_len + 1;
45
    // could use a smarter substring search algorithm
46
107
    for (int i = start; i < last_start; ++i) {
47
90
      if (memcmp(data_ + i, needle->data_, needle_len) == 0) {
48
33
        return i;
49
33
      }
50
90
    }
51
50
  }
52
41
  return -1;
53
84
}
54
55
0
int BigStr::rfind(BigStr* needle) {
56
0
  int length = len(this);
57
0
  DCHECK(len(needle) == 1);  // Oils usage
58
0
  char c = needle->data_[0];
59
0
  for (int i = length - 1; i >= 0; --i) {
60
0
    if (data_[i] == c) {
61
0
      return i;
62
0
    }
63
0
  }
64
0
  return -1;
65
0
}
66
67
47
bool BigStr::isdigit() {
68
47
  int n = len(this);
69
47
  if (n == 0) {
70
0
    return false;  // special case
71
0
  }
72
61
  for (int i = 0; i < n; ++i) {
73
47
    if (!::isdigit(data_[i])) {
74
33
      return false;
75
33
    }
76
47
  }
77
14
  return true;
78
47
}
79
80
33
bool BigStr::isalpha() {
81
33
  int n = len(this);
82
33
  if (n == 0) {
83
0
    return false;  // special case
84
0
  }
85
45
  for (int i = 0; i < n; ++i) {
86
33
    if (!::isalpha(data_[i])) {
87
21
      return false;
88
21
    }
89
33
  }
90
12
  return true;
91
33
}
92
93
// e.g. for osh/braces.py
94
0
bool BigStr::isupper() {
95
0
  int n = len(this);
96
0
  if (n == 0) {
97
0
    return false;  // special case
98
0
  }
99
0
  for (int i = 0; i < n; ++i) {
100
0
    if (!::isupper(data_[i])) {
101
0
      return false;
102
0
    }
103
0
  }
104
0
  return true;
105
0
}
106
107
5
bool BigStr::startswith(BigStr* s) {
108
5
  int n = len(s);
109
5
  if (n > len(this)) {
110
0
    return false;
111
0
  }
112
5
  return memcmp(data_, s->data_, n) == 0;
113
5
}
114
115
4
bool BigStr::endswith(BigStr* s) {
116
4
  int len_s = len(s);
117
4
  int len_this = len(this);
118
4
  if (len_s > len_this) {
119
1
    return false;
120
1
  }
121
3
  const char* start = data_ + len_this - len_s;
122
3
  return memcmp(start, s->data_, len_s) == 0;
123
4
}
124
125
// Get a string with one character
126
81
BigStr* BigStr::at(int i) {
127
81
  int length = len(this);
128
81
  if (i < 0) {
129
0
    i = length + i;
130
0
  }
131
81
  DCHECK(0 <= i);
132
81
  DCHECK(i < length);  // had a problem here!
133
134
0
  BigStr* result = NewStr(1);
135
81
  result->data_[0] = data_[i];
136
81
  return result;
137
81
}
138
139
// s[begin:]
140
4
BigStr* BigStr::slice(int begin) {
141
4
  return slice(begin, len(this));
142
4
}
143
144
// s[begin:end]
145
7
BigStr* BigStr::slice(int begin, int end) {
146
7
  int length = len(this);
147
7
  SLICE_ADJUST(begin, end, length);
148
149
7
  DCHECK(0 <= begin && begin <= length);
150
7
  DCHECK(0 <= end && end <= length);
151
152
0
  int new_len = end - begin;
153
7
  DCHECK(0 <= new_len && new_len <= length);
154
155
0
  BigStr* result = NewStr(new_len);  // has kEmptyString optimization
156
7
  memcpy(result->data_, data_ + begin, new_len);
157
158
7
  return result;
159
7
}
160
161
// Used by 'help' builtin and --help, neither of which translate yet.
162
163
0
List<BigStr*>* BigStr::splitlines(bool keep) {
164
0
  DCHECK(keep == true);
165
0
  FAIL(kNotImplemented);
166
0
}
167
168
3
BigStr* BigStr::upper() {
169
3
  int length = len(this);
170
3
  BigStr* result = NewStr(length);
171
3
  char* buffer = result->data();
172
18
  for (int char_index = 0; char_index < length; ++char_index) {
173
15
    buffer[char_index] = toupper(data_[char_index]);
174
15
  }
175
3
  return result;
176
3
}
177
178
0
BigStr* BigStr::lower() {
179
0
  int length = len(this);
180
0
  BigStr* result = NewStr(length);
181
0
  char* buffer = result->data();
182
0
  for (int char_index = 0; char_index < length; ++char_index) {
183
0
    buffer[char_index] = tolower(data_[char_index]);
184
0
  }
185
0
  return result;
186
0
}
187
188
0
BigStr* BigStr::ljust(int width, BigStr* fillchar) {
189
0
  DCHECK(len(fillchar) == 1);
190
191
0
  int length = len(this);
192
0
  int num_fill = width - length;
193
0
  if (num_fill < 0) {
194
0
    return this;
195
0
  } else {
196
0
    BigStr* result = NewStr(width);
197
0
    char c = fillchar->data_[0];
198
0
    memcpy(result->data_, data_, length);
199
0
    for (int i = length; i < width; ++i) {
200
0
      result->data_[i] = c;
201
0
    }
202
0
    return result;
203
0
  }
204
0
}
205
206
0
BigStr* BigStr::rjust(int width, BigStr* fillchar) {
207
0
  DCHECK(len(fillchar) == 1);
208
209
0
  int length = len(this);
210
0
  int num_fill = width - length;
211
0
  if (num_fill < 0) {
212
0
    return this;
213
0
  } else {
214
0
    BigStr* result = NewStr(width);
215
0
    char c = fillchar->data_[0];
216
0
    for (int i = 0; i < num_fill; ++i) {
217
0
      result->data_[i] = c;
218
0
    }
219
0
    memcpy(result->data_ + num_fill, data_, length);
220
0
    return result;
221
0
  }
222
0
}
223
224
5
BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
225
  // Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
226
5
  return replace(old, new_str, -1);
227
5
}
228
229
5
BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
230
  // log("replacing %s with %s", old_data, new_str->data_);
231
5
  const char* old_data = old->data_;
232
233
5
  int this_len = len(this);
234
5
  int old_len = len(old);
235
236
5
  const char* last_possible = data_ + this_len - old_len;
237
238
5
  const char* p_this = data_;  // advances through 'this'
239
240
  // First pass: Calculate number of replacements, and hence new length
241
5
  int replace_count = 0;
242
5
  if (old_len == 0) {
243
0
    replace_count = this_len + 1;
244
0
    if (count > 0) {
245
0
      replace_count = min(replace_count, count);
246
0
    }
247
5
  } else {
248
44
    while (p_this <= last_possible) {
249
39
      if (replace_count != count &&  // limit replacements (if count != -1)
250
39
          memcmp(p_this, old_data, old_len) == 0) {  // equal
251
4
        replace_count++;
252
4
        p_this += old_len;
253
35
      } else {
254
35
        p_this++;
255
35
      }
256
39
    }
257
5
  }
258
259
  // log("replacements %d", replace_count);
260
261
5
  if (replace_count == 0) {
262
2
    return this;  // Reuse the string if there were no replacements
263
2
  }
264
265
3
  int new_str_len = len(new_str);
266
3
  int result_len =
267
3
      this_len - (replace_count * old_len) + (replace_count * new_str_len);
268
269
3
  BigStr* result = NewStr(result_len);
270
271
3
  const char* new_data = new_str->data_;
272
3
  const size_t new_len = new_str_len;
273
274
  // Second pass: Copy pieces into 'result'
275
3
  p_this = data_;                  // back to beginning
276
3
  char* p_result = result->data_;  // advances through 'result'
277
3
  replace_count = 0;
278
279
3
  if (old_len == 0) {
280
    // Should place new_str between each char in this
281
0
    while (p_this < last_possible && replace_count != count) {
282
0
      replace_count++;
283
0
      memcpy(p_result, new_data, new_len);  // Copy from new_str
284
0
      p_result += new_len;                  // Move past new_str
285
286
      // Write a char from this
287
0
      *p_result = *p_this;
288
0
      p_this++;
289
0
      p_result++;
290
0
    }
291
292
0
    if (replace_count != count) {
293
      // Write a copy of new_str at the end
294
0
      assert(p_this == last_possible);
295
0
      memcpy(p_result, new_data, new_len);
296
0
    } else if (p_this <= last_possible) {
297
      // Write the last part of string
298
0
      memcpy(p_result, p_this, data_ + this_len - p_this);
299
0
    }
300
3
  } else {
301
24
    while (p_this <= last_possible) {
302
      // Note: would be more efficient if we remembered the match positions
303
21
      if (replace_count != count &&  // limit replacements (if count != -1)
304
21
          memcmp(p_this, old_data, old_len) == 0) {  // equal
305
4
        memcpy(p_result, new_data, new_len);         // Copy from new_str
306
4
        replace_count++;
307
4
        p_result += new_len;
308
4
        p_this += old_len;
309
17
      } else {  // copy 1 byte
310
17
        *p_result = *p_this;
311
17
        p_result++;
312
17
        p_this++;
313
17
      }
314
21
    }
315
3
    memcpy(p_result, p_this, data_ + this_len - p_this);  // last part of string
316
3
  }
317
318
0
  return result;
319
5
}
320
321
enum class StripWhere {
322
  Left,
323
  Right,
324
  Both,
325
};
326
327
const int kWhitespace = -1;
328
329
0
bool OmitChar(int ch, int what) {
330
0
  if (what == kWhitespace) {
331
    // Intentional incompatibility with Python, where say \v is whitespace
332
    // '\v'.strip() == ''
333
    //
334
    // But it is consistent with the JSON spec [ \t\r\n] and the rules in
335
    // frontend/lexer_def.py
336
    //
337
    // Note that the YSH is separate, and Str => trim() respects Unicode.
338
0
    return IsAsciiWhitespace(ch);
339
0
  } else {
340
0
    return what == ch;
341
0
  }
342
0
}
343
344
// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
345
// implement 6 functions:
346
//
347
//   strip / lstrip / rstrip
348
//   strip(char) / lstrip(char) / rstrip(char)
349
//
350
// Args:
351
//   where: which ends to strip from
352
//   what: kWhitespace, or an ASCII code 0-255
353
354
0
BigStr* StripAny(BigStr* s, StripWhere where, int what) {
355
0
  int length = len(s);
356
0
  const char* char_data = s->data();
357
358
0
  int i = 0;
359
0
  if (where != StripWhere::Right) {
360
0
    while (i < length && OmitChar(char_data[i], what)) {
361
0
      i++;
362
0
    }
363
0
  }
364
365
0
  int j = length;
366
0
  if (where != StripWhere::Left) {
367
0
    do {
368
0
      j--;
369
0
    } while (j >= i && OmitChar(char_data[j], what));
370
0
    j++;
371
0
  }
372
373
0
  if (i == j) {  // Optimization to reuse existing object
374
0
    return kEmptyString;
375
0
  }
376
377
0
  if (i == 0 && j == length) {  // nothing stripped
378
0
    return s;
379
0
  }
380
381
  // Note: makes a copy in leaky version, and will in GC version too
382
0
  int new_len = j - i;
383
0
  BigStr* result = NewStr(new_len);
384
0
  memcpy(result->data(), s->data() + i, new_len);
385
0
  return result;
386
0
}
387
388
0
BigStr* BigStr::strip() {
389
0
  return StripAny(this, StripWhere::Both, kWhitespace);
390
0
}
391
392
// Used for CommandSub in osh/cmd_exec.py
393
0
BigStr* BigStr::rstrip(BigStr* chars) {
394
0
  DCHECK(len(chars) == 1);
395
0
  int c = chars->data_[0];
396
0
  return StripAny(this, StripWhere::Right, c);
397
0
}
398
399
0
BigStr* BigStr::rstrip() {
400
0
  return StripAny(this, StripWhere::Right, kWhitespace);
401
0
}
402
403
0
BigStr* BigStr::lstrip(BigStr* chars) {
404
0
  DCHECK(len(chars) == 1);
405
0
  int c = chars->data_[0];
406
0
  return StripAny(this, StripWhere::Left, c);
407
0
}
408
409
0
BigStr* BigStr::lstrip() {
410
0
  return StripAny(this, StripWhere::Left, kWhitespace);
411
0
}
412
413
4
BigStr* BigStr::join(List<BigStr*>* items) {
414
4
  int length = 0;
415
416
4
  int num_parts = len(items);
417
418
  // " ".join([]) == ""
419
4
  if (num_parts == 0) {
420
1
    return kEmptyString;
421
1
  }
422
423
  // Common case
424
  // 'anything'.join(["foo"]) == "foo"
425
3
  if (num_parts == 1) {
426
0
    return items->at(0);
427
0
  }
428
429
281
  for (int i = 0; i < num_parts; ++i) {
430
278
    length += len(items->at(i));
431
278
  }
432
  // add length of all the separators
433
3
  int this_len = len(this);
434
3
  length += this_len * (num_parts - 1);
435
436
3
  BigStr* result = NewStr(length);
437
3
  char* p_result = result->data_;  // advances through
438
439
281
  for (int i = 0; i < num_parts; ++i) {
440
    // log("i %d", i);
441
278
    if (i != 0 && this_len) {             // optimize common case of ''.join()
442
0
      memcpy(p_result, data_, this_len);  // copy the separator
443
0
      p_result += this_len;
444
      // log("this_len %d", this_len);
445
0
    }
446
447
278
    int n = len(items->at(i));
448
    // log("n: %d", n);
449
278
    memcpy(p_result, items->at(i)->data_, n);  // copy the list item
450
278
    p_result += n;
451
278
  }
452
453
3
  return result;
454
3
}
455
456
0
static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
457
0
  int new_len = right - left;
458
0
  BigStr* part;
459
0
  if (new_len == 0) {
460
0
    part = kEmptyString;
461
0
  } else {
462
0
    part = NewStr(new_len);
463
0
    memcpy(part->data_, s->data_ + left, new_len);
464
0
  }
465
0
  result->append(part);
466
0
}
467
468
// Split BigStr into List<BigStr*> of parts separated by 'sep'.
469
// The code structure is taken from CPython's Objects/stringlib/split.h.
470
0
List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
471
0
  DCHECK(sep != nullptr);
472
0
  DCHECK(len(sep) == 1);  // we can only split one char
473
0
  char sep_char = sep->data_[0];
474
475
0
  int str_len = len(this);
476
0
  if (str_len == 0) {
477
    // weird case consistent with Python: ''.split(':') == ['']
478
0
    return NewList<BigStr*>({kEmptyString});
479
0
  }
480
481
0
  List<BigStr*>* result = NewList<BigStr*>({});
482
0
  int left = 0;
483
0
  int right = 0;
484
0
  int num_parts = 0;  // 3 splits results in 4 parts
485
486
0
  while (right < str_len && num_parts < max_split) {
487
    // search for separator
488
0
    for (; right < str_len; right++) {
489
0
      if (data_[right] == sep_char) {
490
0
        AppendPart(result, this, left, right);
491
0
        right++;
492
0
        left = right;
493
0
        num_parts++;
494
0
        break;
495
0
      }
496
0
    }
497
0
  }
498
0
  if (num_parts == 0) {  // Optimization when there is no split
499
0
    result->append(this);
500
0
  } else if (left <= str_len) {  // Last part
501
0
    AppendPart(result, this, left, str_len);
502
0
  }
503
504
0
  return result;
505
0
}
506
507
0
List<BigStr*>* BigStr::split(BigStr* sep) {
508
0
  return this->split(sep, len(this));
509
0
}
510
511
33
unsigned BigStr::hash(HashFunc h) {
512
33
  if (!is_hashed_) {
513
12
    hash_ = h(data_, len(this)) >> 1;
514
12
    is_hashed_ = 1;
515
12
  }
516
33
  return hash_;
517
33
}
518
519
1.42k
static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
520
1.42k
  auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
521
1.42k
  auto end = std::cregex_iterator();
522
523
1.42k
  char int_buf[kMaxFmtWidth];
524
1.42k
  std::string buf;
525
3.23k
  for (std::cregex_iterator it = beg; it != end; ++it) {
526
3.23k
    const std::cmatch& match = *it;
527
528
3.23k
    const std::csub_match& lit_m = match[1];
529
3.23k
    DCHECK(lit_m.matched);
530
0
    const std::string& lit_s = lit_m.str();
531
3.23k
    buf.append(lit_s);
532
533
3.23k
    int width = 0;
534
3.23k
    bool zero_pad = false;
535
3.23k
    bool pad_back = false;
536
3.23k
    const std::csub_match& width_m = match[2];
537
3.23k
    const std::string& width_s = width_m.str();
538
3.23k
    bool ok = false;
539
3.23k
    if (width_m.matched && !width_s.empty()) {
540
3
      if (width_s[0] == '0') {
541
1
        zero_pad = true;
542
1
        DCHECK(width_s.size() > 1);
543
0
        ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
544
1
        DCHECK(ok);
545
0
        (void)ok;  // silence unused var warning in opt
546
2
      } else {
547
2
        ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
548
2
        DCHECK(ok);
549
2
      }
550
3
      if (width < 0) {
551
0
        pad_back = true;
552
0
        width *= -1;
553
0
      }
554
3
      DCHECK(0 <= width && width < kMaxFmtWidth);
555
3
    }
556
557
0
    char const* str_to_add = nullptr;
558
3.23k
    int add_len = 0;
559
3.23k
    const std::csub_match& code_m = match[3];
560
3.23k
    const std::string& code_s = code_m.str();
561
3.23k
    if (!code_m.matched) {
562
1.42k
      DCHECK(!width_m.matched);  // python errors on invalid format operators
563
0
      break;
564
1.42k
    }
565
1.80k
    DCHECK(code_s.size() == 1);
566
0
    switch (code_s[0]) {
567
6
    case '%': {
568
6
      str_to_add = code_s.c_str();
569
6
      add_len = 1;
570
6
      break;
571
0
    }
572
314
    case 's': {
573
314
      BigStr* s = va_arg(args, BigStr*);
574
      // Check type unconditionally because mycpp doesn't always check it
575
314
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
576
577
0
      str_to_add = s->data();
578
314
      add_len = len(s);
579
314
      zero_pad = false;  // python ignores the 0 directive for strings
580
314
      break;
581
0
    }
582
25
    case 'r': {
583
25
      BigStr* s = va_arg(args, BigStr*);
584
      // Check type unconditionally because mycpp doesn't always check it
585
25
      CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
586
587
0
      s = repr(s);
588
25
      str_to_add = s->data();
589
25
      add_len = len(s);
590
25
      zero_pad = false;  // python ignores the 0 directive for strings
591
25
      break;
592
0
    }
593
1.45k
    case 'd':  // fallthrough
594
1.46k
    case 'o': {
595
1.46k
      int d = va_arg(args, int);
596
1.46k
      add_len = snprintf(int_buf, kMaxFmtWidth,
597
1.46k
                         match.str().c_str() + lit_s.size(), d);
598
1.46k
      DCHECK(add_len > 0);
599
0
      str_to_add = int_buf;
600
1.46k
      break;
601
1.45k
    }
602
0
    default:
603
0
      DCHECK(0);
604
0
      break;
605
1.80k
    }
606
1.80k
    DCHECK(str_to_add != nullptr);
607
608
1.80k
    if (pad_back) {
609
0
      buf.append(str_to_add, add_len);
610
0
    }
611
1.80k
    if (add_len < width) {
612
0
      for (int i = 0; i < width - add_len; ++i) {
613
0
        buf.push_back(zero_pad ? '0' : ' ');
614
0
      }
615
0
    }
616
1.80k
    if (!pad_back) {
617
1.80k
      buf.append(str_to_add, add_len);
618
1.80k
    }
619
1.80k
  }
620
621
1.42k
  return StrFromC(buf.c_str(), buf.size());
622
1.42k
}
623
624
35
BigStr* StrIter::Value() {  // similar to at()
625
35
  BigStr* result = NewStr(1);
626
35
  result->data_[0] = s_->data_[i_];
627
35
  DCHECK(result->data_[1] == '\0');
628
0
  return result;
629
35
}
630
631
1.42k
BigStr* StrFormat(const char* fmt, ...) {
632
1.42k
  va_list args;
633
1.42k
  va_start(args, fmt);
634
1.42k
  BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
635
1.42k
  va_end(args);
636
1.42k
  return ret;
637
1.42k
}
638
639
3
BigStr* StrFormat(BigStr* fmt, ...) {
640
3
  va_list args;
641
3
  va_start(args, fmt);
642
3
  BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
643
3
  va_end(args);
644
3
  return ret;
645
3
}