OILS / cpp / data_lang.cc View on Github | oilshell.org

273 lines, 131 significant
1// data_lang.cc
2
3#include "cpp/data_lang.h"
4
5#include "data_lang/j8.h"
6#include "data_lang/utf8.h"
7
8// TODO: remove duplication
9#define LOSSY_JSON (1 << 3)
10
11namespace {
12
13void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
14 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
15 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
16
17 buf->WriteConst("b'");
18
19 // Set up pointers after writing opening quote
20 uint8_t* out = buf->LengthPointer(); // mutated
21 uint8_t* out_end = buf->CapacityPointer();
22
23 while (true) {
24 J8EncodeChunk(&in, in_end, &out, out_end, true); // Fill as much as we can
25 buf->SetLengthFrom(out);
26
27 if (in >= in_end) {
28 break;
29 }
30
31 // Same growth policy as below
32 capacity = capacity * 3 / 2;
33 // printf("[2] new capacity %d\n", capacity);
34 buf->EnsureMoreSpace(capacity);
35
36 // Recompute pointers
37 out = buf->LengthPointer();
38 out_end = buf->CapacityPointer();
39 }
40
41 buf->WriteConst("'");
42}
43
44void WriteBashDollarString(BigStr* s, mylib::BufWriter* buf, int capacity) {
45 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
46 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
47
48 buf->WriteConst("$'");
49
50 // Set up pointers after writing opening quote
51 uint8_t* out = buf->LengthPointer(); // mutated
52 uint8_t* out_end = buf->CapacityPointer();
53
54 while (true) {
55 BashDollarEncodeChunk(&in, in_end, &out,
56 out_end); // Fill as much as we can
57 buf->SetLengthFrom(out);
58
59 if (in >= in_end) {
60 break;
61 }
62
63 // Same growth policy as below
64 capacity = capacity * 3 / 2;
65 // printf("[2] new capacity %d\n", capacity);
66 buf->EnsureMoreSpace(capacity);
67
68 // Recompute pointers
69 out = buf->LengthPointer();
70 out_end = buf->CapacityPointer();
71 }
72
73 buf->WriteConst("'");
74}
75
76// Style is COPIED from pyj8::WriteString()
77// Functionality is like j8_libc.c ShellEncodeString, that is:
78//
79// call BourneShellEncodeChunk()
80// then either
81// WriteBString()
82// WriteBashDollarString()
83
84void ShellEncodeString(BigStr* s, int ysh_fallback, mylib::BufWriter* buf) {
85 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
86 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
87
88 // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
89 int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
90 if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
91 capacity = J8_MIN_CAPACITY;
92 }
93 // printf("[1] capacity %d\n", capacity);
94
95 buf->EnsureMoreSpace(capacity);
96
97 int begin = buf->Length(); // maybe Truncate to this position
98 buf->WriteConst("'");
99
100 // Set up pointers after writing opening quote
101 uint8_t* out = buf->LengthPointer(); // mutated
102 uint8_t* out_end = buf->CapacityPointer();
103
104 while (true) {
105 // Fill in as much as we can
106 int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
107 if (cannot_encode) {
108 buf->Truncate(begin);
109 if (ysh_fallback) {
110 WriteBString(s, buf, capacity); // fall back to b''
111 } else {
112 WriteBashDollarString(s, buf, capacity); // fall back to $''
113 }
114 return;
115 }
116 buf->SetLengthFrom(out);
117
118 // printf("[1] len %d\n", out_buf->len);
119
120 if (in >= in_end) {
121 break;
122 }
123
124 // Growth policy: every time through the loop, increase 1.5x
125 //
126 // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
127 // This seems like a reasonable tradeoff between over-allocating and too
128 // many realloc().
129 capacity = capacity * 3 / 2;
130 // printf("[1] new capacity %d\n", capacity);
131 buf->EnsureMoreSpace(capacity);
132
133 // Recompute pointers
134 out = buf->LengthPointer(); // mutated
135 out_end = buf->CapacityPointer();
136 // printf("[1] out %p out_end %p\n", out, out_end);
137 }
138
139 buf->WriteConst("'");
140}
141
142} // namespace
143
144namespace fastfunc {
145
146bool CanOmitQuotes(BigStr* s) {
147 return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
148}
149
150BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
151 auto buf = Alloc<mylib::BufWriter>();
152 int options = j8_fallback ? 0 : LOSSY_JSON;
153 pyj8::WriteString(s, options, buf);
154 return buf->getvalue();
155}
156
157BigStr* ShellEncodeString(BigStr* s, int ysh_fallback) {
158 auto buf = Alloc<mylib::BufWriter>();
159 ::ShellEncodeString(s, ysh_fallback, buf);
160 return buf->getvalue();
161}
162
163Tuple2<int, int> Utf8DecodeOne(BigStr* s, int start) {
164 // Bounds check for safety
165 DCHECK(0 <= start && start < len(s));
166
167 const unsigned char* string = reinterpret_cast<unsigned char*>(s->data());
168
169 Utf8Result_t decode_result;
170 utf8_decode(string + start, &decode_result);
171 int32_t codepoint_or_error;
172 if (decode_result.error) {
173 codepoint_or_error = -decode_result.error;
174 } else {
175 codepoint_or_error = decode_result.codepoint;
176 }
177
178 return Tuple2<int, int>(codepoint_or_error, decode_result.bytes_read);
179}
180
181} // namespace fastfunc
182
183namespace pyj8 {
184
185bool PartIsUtf8(BigStr* s, int start, int end) {
186 Utf8Result result;
187
188 for (int i = start; i < end;) {
189 utf8_decode(reinterpret_cast<unsigned char*>(s->data_ + i), &result);
190 if (result.error) {
191 return false;
192 }
193
194 i += result.bytes_read;
195 }
196
197 return true;
198}
199
200void WriteString(BigStr* s, int options, mylib::BufWriter* buf) {
201 bool j8_fallback = !(options & LOSSY_JSON);
202
203 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
204 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
205
206 // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
207 int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
208 if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
209 capacity = J8_MIN_CAPACITY;
210 }
211 // printf("[1] capacity %d\n", capacity);
212
213 buf->EnsureMoreSpace(capacity);
214
215 int begin = buf->Length(); // maybe Truncate to this position
216 buf->WriteConst("\"");
217
218 // Set up pointers after writing opening quote
219 uint8_t* out = buf->LengthPointer(); // mutated
220 uint8_t* out_end = buf->CapacityPointer();
221
222 while (true) {
223 // Fill in as much as we can
224 int invalid_utf8 = J8EncodeChunk(&in, in_end, &out, out_end, false);
225 if (invalid_utf8 && j8_fallback) {
226 buf->Truncate(begin);
227 WriteBString(s, buf, capacity); // fall back to b''
228 return;
229 }
230 buf->SetLengthFrom(out);
231
232 // printf("[1] len %d\n", out_buf->len);
233
234 if (in >= in_end) {
235 break;
236 }
237
238 // Growth policy: every time through the loop, increase 1.5x
239 //
240 // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
241 // This seems like a reasonable tradeoff between over-allocating and too
242 // many realloc().
243 capacity = capacity * 3 / 2;
244 // printf("[1] new capacity %d\n", capacity);
245 buf->EnsureMoreSpace(capacity);
246
247 // Recompute pointers
248 out = buf->LengthPointer(); // mutated
249 out_end = buf->CapacityPointer();
250 // printf("[1] out %p out_end %p\n", out, out_end);
251 }
252
253 buf->WriteConst("\"");
254}
255
256} // namespace pyj8
257
258namespace j8 {
259
260int HeapValueId(value_asdl::value_t* val) {
261#ifndef OPTIMIZED
262 // ASDL generates headers with HeapTag::Scanned, but HeapTag::FixedSize would
263 // also be valid.
264 ObjHeader* h = ObjHeader::FromObject(val);
265 // Note: value::Stdin is a HeapTag::Global singleton, but we avoid calling it
266 // on that. Could return -1 for the HeapValueId instead of this assertion?
267 DCHECK(h->heap_tag == HeapTag::Scanned || h->heap_tag == HeapTag::FixedSize);
268#endif
269
270 return ObjectId(val);
271}
272
273} // namespace j8