OILS / frontend / lexer.py

515 lines, 227 significant
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str
from mycpp.mylib import log
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader

#
# Optimized Token functions that use str.find(substr, start, end) to avoid
# allocation of a temporary slice
#
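
# A minimal sketch of the trick, not part of the real module: bounding
# str.find() to a window avoids allocating content[start:end] first.
# Plain strings stand in for the ASDL-generated Token/SourceLine types.
#
#   line = 'echo foo+= bar'
#   col, length = 5, 5                  # token spans 'foo+='
#   slow = line[col:col + length].endswith('+=')   # allocates a slice
#   fast = line.find('+=', col + length - 2, col + length) != -1  # no slice
#   assert slow and fast
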
def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+="""
    i = tok.col + tok.length - 2  # 'foo+='[-2] is '+'
    return tok.line.content.find('+', i, i + 1) != -1

def TokenContains(tok, substr):
    # type: (Token, str) -> bool
    return tok.line.content.find(substr, tok.col, tok.col + tok.length) != -1


def TokenEquals(tok, s):
    # type: (Token, str) -> bool
    if len(s) != tok.length:
        return False
    return TokenContains(tok, s)


def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool
    return tok.line.content.find(s, tok.col, tok.col + len(s)) != -1


def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool
    end = tok.col + tok.length
    return tok.line.content.find(s, end - len(s), end) != -1


# Also: IsWhitespace, IsLeadingSpace

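# How the bounded find() calls above behave, sketched with plain strings
# (stand-ins for Token fields, not part of the real module): a window exactly
# len(s) wide means a hit can only start at the window's start, which turns
# find() into a positional prefix/suffix test.
#
#   content = 'xx foo=bar yy'
#   col, length = 3, 7                  # token spans 'foo=bar'
#   assert content.find('foo', col, col + 3) != -1       # TokenStartsWith
#   end = col + length
#   assert content.find('bar', end - 3, end) != -1       # TokenEndsWith
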
def TokenVal(tok):
    # type: (Token) -> str
    """Compute string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert left_index > 0
    start = tok.col + left_index
    return tok.line.content[start:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert right_index < 0
    end = tok.col + tok.length + right_index
    return tok.line.content[tok.col:end]


def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert left > 0, left
    assert right < 0, right
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]

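# The slicing convention, sketched with plain strings (hypothetical values,
# not part of the real module): left is a positive offset from the token
# start, right is a negative offset from the token end, mirroring s[1:-1].
#
#   content = "echo 'hi' x"
#   col, length = 5, 4                  # token spans 'hi' with its quotes
#   start = col + 1                     # like TokenSliceLeft(tok, 1)
#   end = col + length + (-1)           # like TokenSliceRight(tok, -1)
#   assert content[start:end] == 'hi'
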
def LazyStr(tok):
    # type: (Token) -> str
    """Materialize the tval on demand, with a special case for $myvar.

    Most tokens do NOT need strings. We avoid allocating them in the lexer.

    Note: SingleQuoted could have a lazy sval, NOT at the token level.
    """
    if 0:
        LAZY_ID_HIST[tok.id] += 1

    if tok.tval is None:
        if tok.id in (Id.VSub_DollarName, Id.VSub_Number):  # $x or $2
            # Special case for SimpleVarSub - completion also relies on this
            tok.tval = TokenSliceLeft(tok, 1)
        else:
            tok.tval = TokenVal(tok)

    return tok.tval

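# The caching pattern in miniature (a toy class; the real Token is generated
# from ASDL, and this module is translated to C++ by mycpp):
#
#   class Tok(object):
#       def __init__(self, line, col, length):
#           self.line, self.col, self.length = line, col, length
#           self.tval = None              # no string allocated yet
#
#   def lazy_str(t):
#       if t.tval is None:                # first access pays for the slice
#           t.tval = t.line[t.col:t.col + t.length]
#       return t.tval                     # later accesses reuse it
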
def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, length, col, None, val)


class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is subtly
        # incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True

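    # One-byte unread in miniature: because the last token was exactly one
    # character, stepping line_pos back by 1 re-exposes it, and
    # replace_last_token makes the next Read() reuse the last arena slot
    # (via arena.UnreadOne) instead of appending a new token.
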
    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now. Add a dummy line 0 so the Token has a
            # source to display errors.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line)

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for the next non-space token, using
        the given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at end of line, because
                # it would involve interacting with the line reader, and we
                # never need it. In lex_mode_e.ShCommand, there is an explicit
                # newline token, but lex_mode_e.Arith doesn't have one.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these tokens, we could pass them in:
            #   LookPastSpace(lex_mode, past_token_type)
            # - WS_Space is only emitted in lex_mode_e.ShCommand
            # - Id.Ignored_Space is emitted in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type

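    # The skip loop in miniature (a toy token stream stands in for repeated
    # match.OneToken calls; the names here are illustrative only):
    #
    #   def look_past_space(tok_types):
    #       for tok_type in tok_types:
    #           if tok_type not in ('WS_Space', 'Ignored_Space'):
    #               return tok_type       # first interesting token
    #       return 'Unknown_Tok'          # hit end of line
    #
    #   assert look_past_space(['WS_Space', 'Lit_Chars']) == 'Lit_Chars'
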
    def LookAheadDParens(self, num_bytes_back):
        # type: (int) -> bool
        """Assuming we see ((, is there a matching )) for arithmetic?

        Or is it two subshells with commands?

        Returns True for arithmetic like:
            ((expr))
            ((expr || (expr) && expr))

        Returns False for commands like:
            ((command) )
            ((command); (command))
            (( (command) ); (command))

        Args:
          num_bytes_back: subtract this amount from the current line position
        """
        original_pos = self.line_pos
        first_rparen = False  # Is the previous token a ')' ?
        parens_counter = 2

        pos = self.line_pos
        while True:
            tok_type, end_pos = match.OneToken(lex_mode_e.Arith,
                                               self.src_line.content, pos)
            if end_pos == pos:  # end of line
                break
            pos = end_pos

            if tok_type in (Id.Arith_LParen, Id.Left_DollarParen):  # ( or $(
                parens_counter += 1

            if (tok_type == Id.Arith_RParen and first_rparen and
                    parens_counter == 1):
                return True  # saw the closing ))

            if tok_type == Id.Arith_RParen:
                parens_counter -= 1
                first_rparen = True
            elif first_rparen and parens_counter == 1:
                # We hit a case like this:
                #     ((command); (command))
                #
                # The preceding ) isn't immediately followed by another one,
                # and we haven't seen the matching opening (.
                # So this is not an arithmetic expression.
                break
            else:
                first_rparen = False

        # We may have to go back one byte for arithmetic
        self.line_pos -= num_bytes_back
        return False

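    # The paren counting in miniature (a character-level sketch that ignores
    # quoting and $(); the real method sees Arith-mode tokens instead):
    #
    #   def is_arith(s):  # s is everything after the leading (( on one line
    #       depth, prev_rparen = 2, False
    #       for ch in s:
    #           if ch == '(':
    #               depth += 1
    #           elif ch == ')':
    #               if prev_rparen and depth == 1:
    #                   return True       # found the closing ))
    #               depth -= 1
    #               prev_rparen = True
    #               continue
    #           elif prev_rparen and depth == 1:
    #               return False          # lone ) that isn't part of ))
    #           prev_rparen = False
    #       return False
    #
    #   assert is_arith('1 + 2))')
    #   assert not is_arith('echo); (echo))')
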
    def LookAheadFuncParens(self, num_bytes_back):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          num_bytes_back: either 0 or 1, for the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case. This is not recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - num_bytes_back
        assert pos > 0, pos
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens

    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        # line_pos is already past the current one-byte token, so -2 is the
        # byte that precedes it
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # NOTE: We're putting the arena hook in LineLexer and not Lexer because
        # we want it to be "low level". The only thing fabricated here is a
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line)

        self.line_pos = end_pos
        return t


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with
    line_lexer, and return them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadDParens(self, shift_back):
        # type: (int) -> bool
        return self.line_lexer.LookAheadDParens(shift_back)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Use cases: Id.Op_RParen -> Id.Right_Subshell -- disambiguate
        Id.Op_RParen -> Id.Eof_RParen.

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('  PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))

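    # The hint stack in miniature: the parser pushes a pending one-shot
    # retranslation, and _Read() rewrites the next matching token id.
    # (Toy string ids; the real ones are integers from id_kind_asdl.)
    #
    #   stack = [('Op_RParen', 'Eof_RParen')]   # pushed on entering $(
    #   tok_id = 'Op_RParen'                    # lexer just produced ')'
    #   if len(stack) and tok_id == stack[-1][0]:
    #       tok_id = stack.pop()[1]             # parser sees Eof_RParen
    #   assert tok_id == 'Eof_RParen'
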
    def MoveToNextLine(self):
        # type: () -> bool
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line. For
        example, take this case:

            case (x) {
                     ^--- We are here

              (else) {
              ^--- We want lookahead to here

                echo test
              }
            }

        But, without `MoveToNextLine`, it is impossible to peek the '(' without
        consuming it. And consuming it would be a problem once we want to hand
        off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        if src_line is None:
            return False  # EOF, so we failed at moving to the next line

        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
        return True

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

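    # The refill protocol in miniature: the line lexer returns the Eol_Tok
    # sentinel at end of line, and this method responds by fetching a new
    # line and retrying. (Toy stand-ins for LineLexer and _Reader.)
    #
    #   def read(line_lexer, line_reader):
    #       t = line_lexer.read()
    #       if t == 'Eol_Tok':
    #           line = line_reader.get_line()
    #           if line is None:
    #               return 'Eof_Real'     # no more input
    #           line_lexer.reset(line)
    #           t = line_lexer.read()
    #       return t
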
    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            if t.id != Id.Ignored_LineCont:
                break

        if 0:
            ID_HIST[t.id] += 1
        return t

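# Why Lexer.Read() loops: a backslash at the end of a line lexes as
# Id.Ignored_LineCont, so filtering it out makes
#
#   echo one \
#        two
#
# tokenize the same as 'echo one two' would, apart from the line boundary.
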
if 0:  # mylib.PYTHON: note: breaks tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
    LAZY_ID_HIST = collections.Counter()  # type: Counter[Id_t]