OILS / mycpp / gc_str.h View on Github | oils.pub

230 lines, 113 significant
1#ifndef MYCPP_GC_STR_H
2#define MYCPP_GC_STR_H
3
4#include <limits.h> // CHAR_BIT
5
6#include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
7#include "mycpp/gc_obj.h" // GC_OBJ
8#include "mycpp/hash.h" // HashFunc
9
10// https://stackoverflow.com/questions/3919995/determining-sprintf-buffer-size-whats-the-standard/11092994#11092994
11// Notes:
12// - Python 2.7's intobject.c has an erroneous +6
13// - This is 13, but len('-2147483648') is 11, which means we only need 12?
14// - This formula is valid for octal(), because 2^(3 bits) = 8
15
16const int kIntBufSize = CHAR_BIT * sizeof(int) / 3 + 3;
17
18template <typename T>
19class List;
20
21class BigStr {
22 public:
23 // Don't call this directly. Call NewStr() instead, which calls this.
24 BigStr() {
25 }
26
27 char* data() {
28 return data_;
29 }
30
31 // Call this after writing into buffer created by OverAllocatedStr()
32 void MaybeShrink(int str_len);
33
34 BigStr* at(int i);
35
36 int find(BigStr* needle, int start = 0, int end = -1);
37 int rfind(BigStr* needle, int start = 0, int end = -1);
38
39 BigStr* slice(int begin);
40 BigStr* slice(int begin, int end);
41
42 BigStr* strip();
43 // Used for CommandSub in osh/cmd_exec.py
44 BigStr* rstrip(BigStr* chars);
45 BigStr* rstrip();
46
47 BigStr* lstrip(BigStr* chars);
48 BigStr* lstrip();
49
50 BigStr* ljust(int width, BigStr* fillchar);
51 BigStr* rjust(int width, BigStr* fillchar);
52
53 // Can take (start, end) so Tokens can be compared without allocation
54 bool startswith(BigStr* s);
55 bool endswith(BigStr* s);
56
57 BigStr* replace(BigStr* old, BigStr* new_str);
58 BigStr* replace(BigStr* old, BigStr* new_str, int count);
59 BigStr* join(List<BigStr*>* items);
60
61 List<BigStr*>* split(BigStr* sep);
62 List<BigStr*>* split(BigStr* sep, int max_split);
63 List<BigStr*>* splitlines(bool keep);
64
65 // TODO: Move unicode functions out of mycpp runtime? Because we won't match
66 // Python exactly
67 bool isdigit();
68 bool isalpha();
69 bool isupper();
70
71 BigStr* upper();
72 BigStr* lower();
73
74 // Other options for fast comparison / hashing / string interning:
75 // - unique_id_: an index into intern table. I don't think this works unless
76 // you want to deal with rehashing all strings when the set grows.
77 // - although note that the JVM has -XX:StringTableSize=FIXED, which means
78 // - it can degrade into linked list performance
79 // - Hashed strings become GLOBAL_STR(). Never deallocated.
80 // - Hashed strings become part of the "large object space", which might be
81 // managed by mark and sweep. This requires linked list overhead.
82 // (doubly-linked?)
83 // - Intern strings at GARBAGE COLLECTION TIME, with
84 // LayoutForwarded::new_location_? Is this possible? Does it introduce
85 // too much coupling between strings, hash tables, and GC?
86
87 static constexpr ObjHeader obj_header() {
88 return ObjHeader::BigStr();
89 }
90
91 unsigned hash(HashFunc h);
92
93 int len_;
94 unsigned hash_ : 31;
95 unsigned is_hashed_ : 1;
96 char data_[1]; // flexible array
97
98 private:
99 int internal_find(BigStr* needle, int direction, int start, int end);
100 int _strip_left_pos();
101 int _strip_right_pos();
102
103 DISALLOW_COPY_AND_ASSIGN(BigStr)
104};
105
106constexpr int kStrHeaderSize = offsetof(BigStr, data_);
107
108// Note: for SmallStr, we might copy into the VALUE
109inline void BigStr::MaybeShrink(int str_len) {
110 len_ = str_len;
111 data_[len_] = '\0'; // NUL terminate
112}
113
114inline int len(const BigStr* s) {
115 return s->len_;
116}
117
118BigStr* StrFormat(const char* fmt, ...);
119BigStr* StrFormat(BigStr* fmt, ...);
120
121// NOTE: This iterates over bytes.
122class StrIter {
123 public:
124 explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
125 // Cheney only: s_ could be moved during iteration.
126 // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
127 }
128 ~StrIter() {
129 // gHeap.PopRoot();
130 }
131 void Next() {
132 i_++;
133 }
134 bool Done() {
135 return i_ >= len_;
136 }
137 BigStr* Value(); // similar to at()
138
139 private:
140 BigStr* s_;
141 int i_;
142 int len_;
143
144 DISALLOW_COPY_AND_ASSIGN(StrIter)
145};
146
147extern BigStr* kEmptyString;
148
149// GlobalStr notes:
150// - sizeof("foo") == 4, for the NUL terminator.
151// - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
152// don't put it here because it triggers -Winvalid-offsetof
153
154template <int N>
155class GlobalStr {
156 // A template type with the same layout as BigStr with length N-1 (which needs
157 // a buffer of size N). For initializing global constant instances.
158 public:
159 int len_;
160 unsigned hash_ : 31;
161 unsigned is_hashed_ : 1;
162 const char data_[N];
163
164 DISALLOW_COPY_AND_ASSIGN(GlobalStr)
165};
166
167union Str {
168 public:
169 // Instead of this at the start of every function:
170 // Str* s = nullptr;
171 // It will now be:
172 // Str s(nullptr);
173 //
174 // StackRoot _root(&s);
175 explicit Str(BigStr* big) : big_(big) {
176 }
177
178 char* data() {
179 return big_->data();
180 }
181
182 Str at(int i) {
183 return Str(big_->at(i));
184 }
185
186 Str upper() {
187 return Str(big_->upper());
188 }
189
190 uint64_t raw_bytes_;
191 BigStr* big_;
192 // TODO: add SmallStr, see mycpp/small_str_test.cc
193};
194
195inline int len(const Str s) {
196 return len(s.big_);
197}
198
199// This macro is a workaround for the fact that it's impossible to have a
200// a constexpr initializer for char[N]. The "String Literals as Non-Type
201// Template Parameters" feature of C++ 20 would have done it, but it's not
202// there.
203//
204// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
205// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
206//
207// TODO: Can we hash values at compile time so they can be in the intern table?
208
209#define GLOBAL_STR(name, val) \
210 GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
211 ObjHeader::Global(TypeTag::BigStr), \
212 {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
213 BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
214
215// New style for SmallStr compatibility
216#define GLOBAL_STR2(name, val) \
217 GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
218 ObjHeader::Global(TypeTag::BigStr), \
219 {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
220 Str name(reinterpret_cast<BigStr*>(&_##name.obj));
221
222// Helper function that's consistent with JSON definition of ASCII whitespace,
223// e.g.
224// {"age": \t 42} is OK
225// {"age": \v 42} is NOT OK
226inline bool IsAsciiWhitespace(int ch) {
227 return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
228}
229
230#endif // MYCPP_GC_STR_H