mycpp

Coverage Report

Created: 2025-05-29 04:25

/home/uke/oil/mycpp/gc_str.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef MYCPP_GC_STR_H
2
#define MYCPP_GC_STR_H
3
4
#include <limits.h>  // CHAR_BIT
5
6
#include "mycpp/common.h"  // DISALLOW_COPY_AND_ASSIGN
7
#include "mycpp/gc_obj.h"  // GC_OBJ
8
#include "mycpp/hash.h"    // HashFunc
9
10
// https://stackoverflow.com/questions/3919995/determining-sprintf-buffer-size-whats-the-standard/11092994#11092994
11
// Notes:
12
// - Python 2.7's intobject.c has an erroneous +6
13
// - This is 13, but len('-2147483648') is 11, which means we only need 12?
14
// - This formula is valid for octal(), because 2^(3 bits) = 8
15
16
const int kIntBufSize = CHAR_BIT * sizeof(int) / 3 + 3;
17
18
template <typename T>
19
class List;
20
21
class BigStr {
22
 public:
23
  // Don't call this directly.  Call NewStr() instead, which calls this.
24
1.77k
  BigStr() {
25
1.77k
  }
26
27
101
  char* data() {
28
101
    return data_;
29
101
  }
30
31
  // Call this after writing into buffer created by OverAllocatedStr()
32
  void MaybeShrink(int str_len);
33
34
  BigStr* at(int i);
35
36
  int find(BigStr* needle, int start = 0, int end = -1);
37
  int rfind(BigStr* needle);
38
39
  BigStr* slice(int begin);
40
  BigStr* slice(int begin, int end);
41
42
  BigStr* strip();
43
  // Used for CommandSub in osh/cmd_exec.py
44
  BigStr* rstrip(BigStr* chars);
45
  BigStr* rstrip();
46
47
  BigStr* lstrip(BigStr* chars);
48
  BigStr* lstrip();
49
50
  BigStr* ljust(int width, BigStr* fillchar);
51
  BigStr* rjust(int width, BigStr* fillchar);
52
53
  // Can take (start, end) so Tokens can be compared without allocation
54
  bool startswith(BigStr* s);
55
  bool endswith(BigStr* s);
56
57
  BigStr* replace(BigStr* old, BigStr* new_str);
58
  BigStr* replace(BigStr* old, BigStr* new_str, int count);
59
  BigStr* join(List<BigStr*>* items);
60
61
  List<BigStr*>* split(BigStr* sep);
62
  List<BigStr*>* split(BigStr* sep, int max_split);
63
  List<BigStr*>* splitlines(bool keep);
64
65
  // TODO: Move unicode functions out of mycpp runtime?  Because we won't match
66
  // Python exactly
67
  bool isdigit();
68
  bool isalpha();
69
  bool isupper();
70
71
  BigStr* upper();
72
  BigStr* lower();
73
74
  // Other options for fast comparison / hashing / string interning:
75
  // - unique_id_: an index into intern table.  I don't think this works unless
76
  //   you want to deal with rehashing all strings when the set grows.
77
  //   - although note that the JVM has -XX:StringTableSize=FIXED, which means
78
  //   - it can degrade into linked list performance
79
  // - Hashed strings become GLOBAL_STR().  Never deallocated.
80
  // - Hashed strings become part of the "large object space", which might be
81
  //   managed by mark and sweep.  This requires linked list overhead.
82
  //   (doubly-linked?)
83
  // - Intern strings at GARBAGE COLLECTION TIME, with
84
  //   LayoutForwarded::new_location_?  Is this possible?  Does it introduce
85
  //   too much coupling between strings, hash tables, and GC?
86
87
1.77k
  static constexpr ObjHeader obj_header() {
88
1.77k
    return ObjHeader::BigStr();
89
1.77k
  }
90
91
  unsigned hash(HashFunc h);
92
93
  int len_;
94
  unsigned hash_ : 31;
95
  unsigned is_hashed_ : 1;
96
  char data_[1];  // flexible array
97
98
 private:
99
  int _strip_left_pos();
100
  int _strip_right_pos();
101
102
  DISALLOW_COPY_AND_ASSIGN(BigStr)
103
};
104
105
constexpr int kStrHeaderSize = offsetof(BigStr, data_);
106
107
// Note: for SmallStr, we might copy into the VALUE
108
32
inline void BigStr::MaybeShrink(int str_len) {
109
32
  len_ = str_len;
110
32
  data_[len_] = '\0';  // NUL terminate
111
32
}
112
113
3.58k
inline int len(const BigStr* s) {
114
3.58k
  return s->len_;
115
3.58k
}
116
117
BigStr* StrFormat(const char* fmt, ...);
118
BigStr* StrFormat(BigStr* fmt, ...);
119
120
// NOTE: This iterates over bytes.
121
class StrIter {
122
 public:
123
1
  explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
124
    // Cheney only: s_ could be moved during iteration.
125
    // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
126
1
  }
127
1
  ~StrIter() {
128
    // gHeap.PopRoot();
129
1
  }
130
4
  void Next() {
131
4
    i_++;
132
4
  }
133
5
  bool Done() {
134
5
    return i_ >= len_;
135
5
  }
136
  BigStr* Value();  // similar to at()
137
138
 private:
139
  BigStr* s_;
140
  int i_;
141
  int len_;
142
143
  DISALLOW_COPY_AND_ASSIGN(StrIter)
144
};
145
146
extern BigStr* kEmptyString;
147
148
// GlobalStr notes:
149
// - sizeof("foo") == 4, for the NUL terminator.
150
// - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr.  We
151
// don't put it here because it triggers -Winvalid-offsetof
152
153
template <int N>
154
class GlobalStr {
155
  // A template type with the same layout as BigStr with length N-1 (which needs
156
  // a buffer of size N).  For initializing global constant instances.
157
 public:
158
  int len_;
159
  unsigned hash_ : 31;
160
  unsigned is_hashed_ : 1;
161
  const char data_[N];
162
163
  DISALLOW_COPY_AND_ASSIGN(GlobalStr)
164
};
165
166
union Str {
167
 public:
168
  // Instead of this at the start of every function:
169
  //   Str* s = nullptr;
170
  // It will now be:
171
  //   Str s(nullptr);
172
  //
173
  //   StackRoot _root(&s);
174
6
  explicit Str(BigStr* big) : big_(big) {
175
6
  }
176
177
5
  char* data() {
178
5
    return big_->data();
179
5
  }
180
181
5
  Str at(int i) {
182
5
    return Str(big_->at(i));
183
5
  }
184
185
0
  Str upper() {
186
0
    return Str(big_->upper());
187
0
  }
188
189
  uint64_t raw_bytes_;
190
  BigStr* big_;
191
  // TODO: add SmallStr, see mycpp/small_str_test.cc
192
};
193
194
6
inline int len(const Str s) {
195
6
  return len(s.big_);
196
6
}
197
198
// This macro is a workaround for the fact that it's impossible to have a
199
// a constexpr initializer for char[N].  The "String Literals as Non-Type
200
// Template Parameters" feature of C++ 20 would have done it, but it's not
201
// there.
202
//
203
// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
204
// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
205
//
206
// TODO: Can we hash values at compile time so they can be in the intern table?
207
208
#define GLOBAL_STR(name, val)                                                \
209
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
210
      ObjHeader::Global(TypeTag::BigStr),                                    \
211
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
212
  BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
213
214
// New style for SmallStr compatibility
215
#define GLOBAL_STR2(name, val)                                               \
216
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
217
      ObjHeader::Global(TypeTag::BigStr),                                    \
218
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
219
  Str name(reinterpret_cast<BigStr*>(&_##name.obj));
220
221
// Helper function that's consistent with JSON definition of ASCII whitespace,
222
// e.g.
223
// {"age": \t 42} is OK
224
// {"age": \v 42} is NOT OK
225
70
inline bool IsAsciiWhitespace(int ch) {
226
70
  return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
227
70
}
228
229
#endif  // MYCPP_GC_STR_H