1 | // data_lang.cc
|
2 |
|
3 | #include "cpp/data_lang.h"
|
4 |
|
5 | #include "data_lang/j8.h"
|
6 | #include "data_lang/utf8.h"
|
7 |
|
// TODO: remove duplication
// NOTE(review): presumably this flag is also defined somewhere else and the
// two definitions should be unified -- confirm where the other copy lives.
//
// Option bit for pyj8::WriteString().  When the bit is set in `options`,
// WriteString() does not switch to the b'' form on invalid UTF-8;
// fastfunc::J8EncodeString() sets it when its `j8_fallback` argument is 0.
#define LOSSY_JSON (1 << 3)
|
10 |
|
11 | namespace {
|
12 |
|
13 | void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
|
14 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
15 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
16 |
|
17 | buf->WriteConst("b'");
|
18 |
|
19 | // Set up pointers after writing opening quote
|
20 | uint8_t* out = buf->LengthPointer(); // mutated
|
21 | uint8_t* out_end = buf->CapacityPointer();
|
22 |
|
23 | while (true) {
|
24 | J8EncodeChunk(&in, in_end, &out, out_end, true); // Fill as much as we can
|
25 | buf->SetLengthFrom(out);
|
26 |
|
27 | if (in >= in_end) {
|
28 | break;
|
29 | }
|
30 |
|
31 | // Same growth policy as below
|
32 | capacity = capacity * 3 / 2;
|
33 | // printf("[2] new capacity %d\n", capacity);
|
34 | buf->EnsureMoreSpace(capacity);
|
35 |
|
36 | // Recompute pointers
|
37 | out = buf->LengthPointer();
|
38 | out_end = buf->CapacityPointer();
|
39 | }
|
40 |
|
41 | buf->WriteConst("'");
|
42 | }
|
43 |
|
44 | void WriteBashDollarString(BigStr* s, mylib::BufWriter* buf, int capacity) {
|
45 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
46 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
47 |
|
48 | buf->WriteConst("$'");
|
49 |
|
50 | // Set up pointers after writing opening quote
|
51 | uint8_t* out = buf->LengthPointer(); // mutated
|
52 | uint8_t* out_end = buf->CapacityPointer();
|
53 |
|
54 | while (true) {
|
55 | BashDollarEncodeChunk(&in, in_end, &out,
|
56 | out_end); // Fill as much as we can
|
57 | buf->SetLengthFrom(out);
|
58 |
|
59 | if (in >= in_end) {
|
60 | break;
|
61 | }
|
62 |
|
63 | // Same growth policy as below
|
64 | capacity = capacity * 3 / 2;
|
65 | // printf("[2] new capacity %d\n", capacity);
|
66 | buf->EnsureMoreSpace(capacity);
|
67 |
|
68 | // Recompute pointers
|
69 | out = buf->LengthPointer();
|
70 | out_end = buf->CapacityPointer();
|
71 | }
|
72 |
|
73 | buf->WriteConst("'");
|
74 | }
|
75 |
|
76 | // Style is COPIED from pyj8::WriteString()
|
77 | // Functionality is like j8_libc.c ShellEncodeString, that is:
|
78 | //
|
79 | // call BourneShellEncodeChunk()
|
80 | // then either
|
81 | // WriteBString()
|
82 | // WriteBashDollarString()
|
83 |
|
84 | void ShellEncodeString(BigStr* s, int ysh_fallback, mylib::BufWriter* buf) {
|
85 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
86 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
87 |
|
88 | // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
|
89 | int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
|
90 | if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
|
91 | capacity = J8_MIN_CAPACITY;
|
92 | }
|
93 | // printf("[1] capacity %d\n", capacity);
|
94 |
|
95 | buf->EnsureMoreSpace(capacity);
|
96 |
|
97 | int begin = buf->Length(); // maybe Truncate to this position
|
98 | buf->WriteConst("'");
|
99 |
|
100 | // Set up pointers after writing opening quote
|
101 | uint8_t* out = buf->LengthPointer(); // mutated
|
102 | uint8_t* out_end = buf->CapacityPointer();
|
103 |
|
104 | while (true) {
|
105 | // Fill in as much as we can
|
106 | int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
|
107 | if (cannot_encode) {
|
108 | buf->Truncate(begin);
|
109 | if (ysh_fallback) {
|
110 | WriteBString(s, buf, capacity); // fall back to b''
|
111 | } else {
|
112 | WriteBashDollarString(s, buf, capacity); // fall back to $''
|
113 | }
|
114 | return;
|
115 | }
|
116 | buf->SetLengthFrom(out);
|
117 |
|
118 | // printf("[1] len %d\n", out_buf->len);
|
119 |
|
120 | if (in >= in_end) {
|
121 | break;
|
122 | }
|
123 |
|
124 | // Growth policy: every time through the loop, increase 1.5x
|
125 | //
|
126 | // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
|
127 | // This seems like a reasonable tradeoff between over-allocating and too
|
128 | // many realloc().
|
129 | capacity = capacity * 3 / 2;
|
130 | // printf("[1] new capacity %d\n", capacity);
|
131 | buf->EnsureMoreSpace(capacity);
|
132 |
|
133 | // Recompute pointers
|
134 | out = buf->LengthPointer(); // mutated
|
135 | out_end = buf->CapacityPointer();
|
136 | // printf("[1] out %p out_end %p\n", out, out_end);
|
137 | }
|
138 |
|
139 | buf->WriteConst("'");
|
140 | }
|
141 |
|
142 | } // namespace
|
143 |
|
144 | namespace fastfunc {
|
145 |
|
146 | bool CanOmitQuotes(BigStr* s) {
|
147 | return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
|
148 | }
|
149 |
|
150 | BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
|
151 | auto buf = Alloc<mylib::BufWriter>();
|
152 | int options = j8_fallback ? 0 : LOSSY_JSON;
|
153 | pyj8::WriteString(s, options, buf);
|
154 | return buf->getvalue();
|
155 | }
|
156 |
|
157 | BigStr* ShellEncodeString(BigStr* s, int ysh_fallback) {
|
158 | auto buf = Alloc<mylib::BufWriter>();
|
159 | ::ShellEncodeString(s, ysh_fallback, buf);
|
160 | return buf->getvalue();
|
161 | }
|
162 |
|
163 | Tuple2<int, int> Utf8DecodeOne(BigStr* s, int start) {
|
164 | // Bounds check for safety
|
165 | DCHECK(0 <= start && start < len(s));
|
166 |
|
167 | const unsigned char* string = reinterpret_cast<unsigned char*>(s->data());
|
168 |
|
169 | Utf8Result_t decode_result;
|
170 | utf8_decode(string + start, &decode_result);
|
171 | int32_t codepoint_or_error;
|
172 | if (decode_result.error) {
|
173 | codepoint_or_error = -decode_result.error;
|
174 | } else {
|
175 | codepoint_or_error = decode_result.codepoint;
|
176 | }
|
177 |
|
178 | return Tuple2<int, int>(codepoint_or_error, decode_result.bytes_read);
|
179 | }
|
180 |
|
181 | } // namespace fastfunc
|
182 |
|
183 | namespace pyj8 {
|
184 |
|
185 | bool PartIsUtf8(BigStr* s, int start, int end) {
|
186 | Utf8Result result;
|
187 |
|
188 | for (int i = start; i < end;) {
|
189 | utf8_decode(reinterpret_cast<unsigned char*>(s->data_ + i), &result);
|
190 | if (result.error) {
|
191 | return false;
|
192 | }
|
193 |
|
194 | i += result.bytes_read;
|
195 | }
|
196 |
|
197 | return true;
|
198 | }
|
199 |
|
200 | void WriteString(BigStr* s, int options, mylib::BufWriter* buf) {
|
201 | bool j8_fallback = !(options & LOSSY_JSON);
|
202 |
|
203 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
204 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
205 |
|
206 | // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
|
207 | int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
|
208 | if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
|
209 | capacity = J8_MIN_CAPACITY;
|
210 | }
|
211 | // printf("[1] capacity %d\n", capacity);
|
212 |
|
213 | buf->EnsureMoreSpace(capacity);
|
214 |
|
215 | int begin = buf->Length(); // maybe Truncate to this position
|
216 | buf->WriteConst("\"");
|
217 |
|
218 | // Set up pointers after writing opening quote
|
219 | uint8_t* out = buf->LengthPointer(); // mutated
|
220 | uint8_t* out_end = buf->CapacityPointer();
|
221 |
|
222 | while (true) {
|
223 | // Fill in as much as we can
|
224 | int invalid_utf8 = J8EncodeChunk(&in, in_end, &out, out_end, false);
|
225 | if (invalid_utf8 && j8_fallback) {
|
226 | buf->Truncate(begin);
|
227 | WriteBString(s, buf, capacity); // fall back to b''
|
228 | return;
|
229 | }
|
230 | buf->SetLengthFrom(out);
|
231 |
|
232 | // printf("[1] len %d\n", out_buf->len);
|
233 |
|
234 | if (in >= in_end) {
|
235 | break;
|
236 | }
|
237 |
|
238 | // Growth policy: every time through the loop, increase 1.5x
|
239 | //
|
240 | // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
|
241 | // This seems like a reasonable tradeoff between over-allocating and too
|
242 | // many realloc().
|
243 | capacity = capacity * 3 / 2;
|
244 | // printf("[1] new capacity %d\n", capacity);
|
245 | buf->EnsureMoreSpace(capacity);
|
246 |
|
247 | // Recompute pointers
|
248 | out = buf->LengthPointer(); // mutated
|
249 | out_end = buf->CapacityPointer();
|
250 | // printf("[1] out %p out_end %p\n", out, out_end);
|
251 | }
|
252 |
|
253 | buf->WriteConst("\"");
|
254 | }
|
255 |
|
256 | } // namespace pyj8
|
257 |
|
258 | namespace j8 {
|
259 |
|
260 | int HeapValueId(value_asdl::value_t* val) {
|
261 | #ifndef OPTIMIZED
|
262 | // ASDL generates headers with HeapTag::Scanned, but HeapTag::FixedSize would
|
263 | // also be valid.
|
264 | ObjHeader* h = ObjHeader::FromObject(val);
|
265 | // Note: value::Stdin is a HeapTag::Global singleton, but we avoid calling it
|
266 | // on that. Could return -1 for the HeapValueId instead of this assertion?
|
267 | DCHECK(h->heap_tag == HeapTag::Scanned || h->heap_tag == HeapTag::FixedSize);
|
268 | #endif
|
269 |
|
270 | return ObjectId(val);
|
271 | }
|
272 |
|
273 | } // namespace j8
|