#!/usr/bin/env python2
from __future__ import print_function
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from core.error import p_die
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic as below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)
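
# Illustrative sketch of the results above (not part of the original file):
# a token spelled \u{3bc} takes the Char_UBraced branch, which slices out
# '3bc' and yields CharCode(tok, 0x3bc, True), with True meaning u_braced.
# A token spelled \n takes Char_OneChar, where consts.LookupCharC('n') is
# assumed to map the C escape to a newline, yielding CharCode(tok, 10, False).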


def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """All types of C-style backslash-escaped strings use this function:

    - echo -e and printf at runtime
    - $'' and b'' u'' at parse time
    """
    code_point = -1

    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377'
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    # Note: we're not doing the surrogate range and max code point checks for
    # echo -e and printf:
    #
    # 1. It's not compatible with bash
    # 2. We don't have good error locations anyway

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    else:
        raise AssertionError(Id_str(id_))
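
# Worked examples (illustrative):
#
#   Id.Char_OneChar  r'\n'       -> consts.LookupCharC('n')  == '\n'
#   Id.Char_Octal3   r'\377'     -> chr(int('377', 8))       == '\xff'
#   Id.Char_Hex      r'\x41'     -> chr(int('41', 16))       == 'A'
#   Id.Char_UBraced  r'\u{3bc}'  -> j8.Utf8Encode(0x3bc), the UTF-8 bytes
#                                   '\xce\xbc' for U+03BC
#   Id.Char_Stop     r'\c'       -> None, the sentinel that tells echo -e /
#                                   printf to stop producing output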


def EvalSingleQuoted(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """Done at parse time."""
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = []
        for t in tokens:
            # More parse time validation for code points.
            # EvalCStringToken() redoes some of this work, but right now it's
            # shared with dynamic echo -e / printf, which don't have tokens.

            # Only check J8 style strings, not Char_Unicode4 and Char_Unicode8,
            # which are in OSH
            if t.id == Id.Char_UBraced:
                s = lexer.TokenSlice(t, 3, -1)
                code_point = int(s, 16)
                if code_point > 0x10ffff:
                    p_die("Code point can't be greater than U+10ffff", t)
                if 0xD800 <= code_point and code_point < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(t), t)

            strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

    else:
        raise AssertionError(id_)
    return ''.join(strs)
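
# Example (illustrative): at parse time, u'\u{3bc}' lexes to a
# Left_USingleQuote token plus a Char_UBraced token.  EvalSingleQuoted()
# first validates the code point (the range and surrogate checks above),
# then delegates to EvalCStringToken(), yielding the UTF-8 string '\xce\xbc'.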


def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """True if every byte in the token's span is a member of byte_set."""
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """Determine if the token before ''' etc. is space to trim."""
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """Determine if the space/newlines after ''' should be trimmed.

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')
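
# Illustrative: for a Token spanning the text '  \t', _IsLeadingSpace() is
# True.  For '  \n', _IsTrailingSpace() is True but _IsLeadingSpace() is
# False, since the leading-space set is only ' \t'.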


# Whitespace trimming algorithms:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to
#      maintain the lossless invariant
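#
# Worked example (illustrative), for a multi-line string like:
#
#     var x = '''
#         hello
#         '''
#
# Step 1 removes the newline token after the opening '''.  Step 2 takes the
# 8 spaces before the closing ''' as the prefix to strip.  Step 3 bumps each
# matching line token's col by 8, so the resulting value is "hello\n".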


def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first_part)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix
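
# Note: unlike the SQ variant below, this operates on word_part_t, because a
# double-quoted string may contain non-Literal parts like ${x}.  Those are
# skipped above, and only Literal tokens starting at column 0 are stripped.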


def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens.

    Mutates the token list in place: tokens may be popped, and the remaining
    ones are narrowed and have their IDs rewritten.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh
    """
    if 0:
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 tokens to strip anything
        return

    # var x = '''    # strip initial newline/whitespace
    #     x
    #     '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first part

    # Figure out what to strip, based on the last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)
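
# Minimal usage sketch (illustrative, not from this file): a caller would
# strip a ''' string's tokens before evaluating them, e.g.
#
#     RemoveLeadingSpaceSQ(tokens)
#     s = EvalSingleQuoted(left_tok.id, tokens)
#
# where left_tok is assumed to be the opening token, e.g. Left_TSingleQuote.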


# vim: sw=4