/home/uke/oil/mycpp/gc_str.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef MYCPP_GC_STR_H |
2 | | #define MYCPP_GC_STR_H |
3 | | |
4 | | #include <limits.h> // CHAR_BIT |
5 | | |
6 | | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN |
7 | | #include "mycpp/gc_obj.h" // GC_OBJ |
8 | | #include "mycpp/hash.h" // HashFunc |
9 | | |
10 | | // https://stackoverflow.com/questions/3919995/determining-sprintf-buffer-size-whats-the-standard/11092994#11092994 |
11 | | // Notes: |
12 | | // - Python 2.7's intobject.c has an erroneous +6 |
13 | | // - This is 13, but len('-2147483648') is 11, which means we only need 12? |
14 | | // - This formula is valid for octal(), because 2^(3 bits) = 8 |
15 | | |
16 | | const int kIntBufSize = CHAR_BIT * sizeof(int) / 3 + 3; |
17 | | |
18 | | template <typename T> |
19 | | class List; |
20 | | |
21 | | class BigStr { |
22 | | public: |
23 | | // Don't call this directly. Call NewStr() instead, which calls this. |
24 | 522 | BigStr() { |
25 | 522 | } |
26 | | |
27 | 25 | char* data() { |
28 | 25 | return data_; |
29 | 25 | } |
30 | | |
31 | | // Call this after writing into buffer created by OverAllocatedStr() |
32 | | void MaybeShrink(int str_len); |
33 | | |
34 | | BigStr* at(int i); |
35 | | |
36 | | int find(BigStr* needle, int start = 0, int end = -1); |
37 | | int rfind(BigStr* needle); |
38 | | |
39 | | BigStr* slice(int begin); |
40 | | BigStr* slice(int begin, int end); |
41 | | |
42 | | BigStr* strip(); |
43 | | // Used for CommandSub in osh/cmd_exec.py |
44 | | BigStr* rstrip(BigStr* chars); |
45 | | BigStr* rstrip(); |
46 | | |
47 | | BigStr* lstrip(BigStr* chars); |
48 | | BigStr* lstrip(); |
49 | | |
50 | | BigStr* ljust(int width, BigStr* fillchar); |
51 | | BigStr* rjust(int width, BigStr* fillchar); |
52 | | |
53 | | // Can take (start, end) so Tokens can be compared without allocation |
54 | | bool startswith(BigStr* s); |
55 | | bool endswith(BigStr* s); |
56 | | |
57 | | BigStr* replace(BigStr* old, BigStr* new_str); |
58 | | BigStr* replace(BigStr* old, BigStr* new_str, int count); |
59 | | BigStr* join(List<BigStr*>* items); |
60 | | |
61 | | List<BigStr*>* split(BigStr* sep); |
62 | | List<BigStr*>* split(BigStr* sep, int max_split); |
63 | | List<BigStr*>* splitlines(bool keep); |
64 | | |
65 | | // TODO: Move unicode functions out of mycpp runtime? Because we won't match |
66 | | // Python exactly |
67 | | bool isdigit(); |
68 | | bool isalpha(); |
69 | | bool isupper(); |
70 | | |
71 | | BigStr* upper(); |
72 | | BigStr* lower(); |
73 | | |
74 | | // Other options for fast comparison / hashing / string interning: |
75 | | // - unique_id_: an index into intern table. I don't think this works unless |
76 | | // you want to deal with rehashing all strings when the set grows. |
77 | | // - although note that the JVM has -XX:StringTableSize=FIXED, which means |
78 | | // - it can degrade into linked list performance |
79 | | // - Hashed strings become GLOBAL_STR(). Never deallocated. |
80 | | // - Hashed strings become part of the "large object space", which might be |
81 | | // managed by mark and sweep. This requires linked list overhead. |
82 | | // (doubly-linked?) |
83 | | // - Intern strings at GARBAGE COLLECTION TIME, with |
84 | | // LayoutForwarded::new_location_? Is this possible? Does it introduce |
85 | | // too much coupling between strings, hash tables, and GC? |
86 | | |
87 | 522 | static constexpr ObjHeader obj_header() { |
88 | 522 | return ObjHeader::BigStr(); |
89 | 522 | } |
90 | | |
91 | | unsigned hash(HashFunc h); |
92 | | |
93 | | int len_; |
94 | | unsigned hash_ : 31; |
95 | | unsigned is_hashed_ : 1; |
96 | | char data_[1]; // flexible array |
97 | | |
98 | | private: |
99 | | int _strip_left_pos(); |
100 | | int _strip_right_pos(); |
101 | | |
102 | | DISALLOW_COPY_AND_ASSIGN(BigStr) |
103 | | }; |
104 | | |
105 | | constexpr int kStrHeaderSize = offsetof(BigStr, data_); |
106 | | |
107 | | // Note: for SmallStr, we might copy into the VALUE |
108 | 56 | inline void BigStr::MaybeShrink(int str_len) { |
109 | 56 | len_ = str_len; |
110 | 56 | data_[len_] = '\0'; // NUL terminate |
111 | 56 | } |
112 | | |
113 | 554 | inline int len(const BigStr* s) { |
114 | 554 | return s->len_; |
115 | 554 | } |
116 | | |
117 | | BigStr* StrFormat(const char* fmt, ...); |
118 | | BigStr* StrFormat(BigStr* fmt, ...); |
119 | | |
120 | | // NOTE: This iterates over bytes. |
121 | | class StrIter { |
122 | | public: |
123 | 0 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) { |
124 | 0 | // Cheney only: s_ could be moved during iteration. |
125 | 0 | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_)); |
126 | 0 | } |
127 | 0 | ~StrIter() { |
128 | 0 | // gHeap.PopRoot(); |
129 | 0 | } |
130 | 0 | void Next() { |
131 | 0 | i_++; |
132 | 0 | } |
133 | 0 | bool Done() { |
134 | 0 | return i_ >= len_; |
135 | 0 | } |
136 | | BigStr* Value(); // similar to at() |
137 | | |
138 | | private: |
139 | | BigStr* s_; |
140 | | int i_; |
141 | | int len_; |
142 | | |
143 | | DISALLOW_COPY_AND_ASSIGN(StrIter) |
144 | | }; |
145 | | |
146 | | extern BigStr* kEmptyString; |
147 | | |
148 | | // GlobalStr notes: |
149 | | // - sizeof("foo") == 4, for the NUL terminator. |
150 | | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We |
151 | | // don't put it here because it triggers -Winvalid-offsetof |
152 | | |
153 | | template <int N> |
154 | | class GlobalStr { |
155 | | // A template type with the same layout as BigStr with length N-1 (which needs |
156 | | // a buffer of size N). For initializing global constant instances. |
157 | | public: |
158 | | int len_; |
159 | | unsigned hash_ : 31; |
160 | | unsigned is_hashed_ : 1; |
161 | | const char data_[N]; |
162 | | |
163 | | DISALLOW_COPY_AND_ASSIGN(GlobalStr) |
164 | | }; |
165 | | |
166 | | union Str { |
167 | | public: |
168 | | // Instead of this at the start of every function: |
169 | | // Str* s = nullptr; |
170 | | // It will now be: |
171 | | // Str s(nullptr); |
172 | | // |
173 | | // StackRoot _root(&s); |
174 | 0 | explicit Str(BigStr* big) : big_(big) { |
175 | 0 | } |
176 | | |
177 | 0 | char* data() { |
178 | 0 | return big_->data(); |
179 | 0 | } |
180 | | |
181 | 0 | Str at(int i) { |
182 | 0 | return Str(big_->at(i)); |
183 | 0 | } |
184 | | |
185 | 0 | Str upper() { |
186 | 0 | return Str(big_->upper()); |
187 | 0 | } |
188 | | |
189 | | uint64_t raw_bytes_; |
190 | | BigStr* big_; |
191 | | // TODO: add SmallStr, see mycpp/small_str_test.cc |
192 | | }; |
193 | | |
194 | 0 | inline int len(const Str s) { |
195 | 0 | return len(s.big_); |
196 | 0 | } |
197 | | |
198 | | // This macro is a workaround for the fact that it's impossible to have a |
199 | | // a constexpr initializer for char[N]. The "String Literals as Non-Type |
200 | | // Template Parameters" feature of C++ 20 would have done it, but it's not |
201 | | // there. |
202 | | // |
203 | | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/ |
204 | | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor |
205 | | // |
206 | | // TODO: Can we hash values at compile time so they can be in the intern table? |
207 | | |
208 | | #define GLOBAL_STR(name, val) \ |
209 | | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \ |
210 | | ObjHeader::Global(TypeTag::BigStr), \ |
211 | | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \ |
212 | | BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj); |
213 | | |
214 | | // New style for SmallStr compatibility |
215 | | #define GLOBAL_STR2(name, val) \ |
216 | | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \ |
217 | | ObjHeader::Global(TypeTag::BigStr), \ |
218 | | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \ |
219 | | Str name(reinterpret_cast<BigStr*>(&_##name.obj)); |
220 | | |
221 | | // Helper function that's consistent with JSON definition of ASCII whitespace, |
222 | | // e.g. |
223 | | // {"age": \t 42} is OK |
224 | | // {"age": \v 42} is NOT OK |
225 | 7 | inline bool IsAsciiWhitespace(int ch) { |
226 | 7 | return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'; |
227 | 7 | } |
228 | | |
229 | | #endif // MYCPP_GC_STR_H |