#!/usr/bin/env python2
from __future__ import print_function
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from core.error import p_die
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic as below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_YHex, Id.Char_Hex):  # \yff, \xff is legacy
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)


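# An illustrative sketch, not part of Oils: _DemoUBracedDecode is a
# hypothetical helper showing the arithmetic of the Id.Char_UBraced arm
# above, with a plain string standing in for lexer.TokenSlice(tok, 3, -1).
def _DemoUBracedDecode():
    # type: () -> None
    s = r'\u{2620}'  # token text of a braced escape
    code_point = int(s[3:-1], 16)  # drop the leading \u{ and trailing }
    assert code_point == 0x2620
    # EvalCharLiteralForRegex() wraps this in CharCode(tok, 0x2620, True),
    # where True means u_braced.

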
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """All types of C-style backslash-escaped strings use this function:

    - echo -e and printf at runtime
    - $'' and b'' u'' at parse time
    """
    code_point = -1

    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt --unset parse_backslash detects Unknown_Backslash at PARSE
        # time in YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377'
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    # Note: we're not doing the surrogate range and max code point checks for
    # echo -e and printf:
    #
    # 1. It's not compatible with bash
    # 2. We don't have good error locations anyway

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    else:
        raise AssertionError(Id_str(id_))


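# An illustrative sketch, not part of Oils: exercises a few arms of
# EvalCStringToken() with hand-written (id, value) pairs.  The first assert
# assumes consts.LookupCharC('n') maps to a newline, and the \u{48} case
# assumes j8.Utf8Encode() returns the UTF-8 encoding of the code point.
def _DemoEvalCStringToken():
    # type: () -> None
    assert EvalCStringToken(Id.Char_OneChar, r'\n') == '\n'
    assert EvalCStringToken(Id.Char_Hex, r'\xff') == '\xff'
    assert EvalCStringToken(Id.Char_Octal3, r'\377') == '\xff'  # 255 in octal
    assert EvalCStringToken(Id.Char_UBraced, r'\u{48}') == 'H'
    assert EvalCStringToken(Id.Char_Stop, r'\c') is None  # the stop sentinel

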
def EvalSingleQuoted(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """ Done at parse time """
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = []
        for t in tokens:
            # More parse time validation for code points.
            # EvalCStringToken() redoes some of this work, but right now it's
            # shared with dynamic echo -e / printf, which don't have tokens.

            # Only check J8 style strings, not Char_Unicode4 and Char_Unicode8,
            # which are in OSH
            if t.id == Id.Char_UBraced:
                s = lexer.TokenSlice(t, 3, -1)
                code_point = int(s, 16)
                if code_point > 0x10ffff:
                    p_die("Code point can't be greater than U+10ffff", t)
                if 0xD800 <= code_point and code_point < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(t), t)

            strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

    else:
        raise AssertionError(id_)
    return ''.join(strs)


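# An illustrative sketch, not part of Oils: _DemoCheckCodePoint is a
# hypothetical helper restating the parse-time validation that
# EvalSingleQuoted() applies to \u{...} escapes, without needing a Token.
# It returns an error message, where the real code calls p_die() with the
# offending token.
def _DemoCheckCodePoint(code_point):
    # type: (int) -> Optional[str]
    if code_point > 0x10ffff:
        return "Code point can't be greater than U+10ffff"
    if 0xD800 <= code_point and code_point < 0xE000:
        return "escape is illegal because it's in the surrogate range"
    return None  # valid - EvalCStringToken() will UTF-8 encode it

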
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the token before ''' etc. is space to trim """
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the space/newlines after ''' should be trimmed

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')


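# An illustrative sketch, not part of Oils: a plain-Python equivalent of
# _TokenConsistsOf() over an ordinary string, without Token or the mycpp
# byte APIs.  _IsLeadingSpace() corresponds to byte_set = ' \t', and
# _IsTrailingSpace() to byte_set = ' \n\r\t'.
def _DemoConsistsOf(s, byte_set):
    # type: (str, str) -> bool
    for ch in s:
        if ch not in byte_set:
            return False
    return True

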
# Whitespace trimming algorithms:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to
#      maintain the lossless invariant


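# Worked example (illustration): in
#
#     var x = '''
#       hello
#       '''
#
# the token after the opening ''' is just '\n' (trimmed by step 1), and the
# token before the closing ''' is the 6 spaces of indentation (step 2).
# Step 3 bumps col by 6 and shrinks length by 6 on each line that starts
# with those 6 spaces, so x == "hello\n".

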
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix


def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens, mutating the list in place.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh.
    No new tokens are created; we bump tok.col / tok.length and change
    Id.Lit_Chars into Id.Lit_CharsWithoutPrefix, so every token still points
    into the original line.
    """
    if 0:  # disabled debug dump of the token list
        import sys
        from asdl import format as fmt

        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 tokens to strip anything
        return

    # var x = ''' # strip initial newline/whitespace
    # x
    # '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first token

    # Figure out what to strip, based on the last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last token

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping the
        # start column
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)


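# An illustrative sketch, not part of Oils: a hypothetical string-level
# version of the stripping loops in RemoveLeadingSpaceDQ() and
# RemoveLeadingSpaceSQ().  The real code never copies text - it bumps
# tok.col / tok.length so tokens still point into the original source line
# (the lossless invariant).
def _DemoStripLines(lines, to_strip):
    # type: (List[str], str) -> List[str]
    out = []  # type: List[str]
    for line in lines:
        if line.startswith(to_strip):
            out.append(line[len(to_strip):])  # like tok.col = n; tok.length -= n
        else:
            out.append(line)
    return out

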
# vim: sw=4