OILS / frontend / lexer.py

515 lines, 227 significant
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str
from mycpp.mylib import log
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader

#
# Optimized Token functions that use str.find(substr, start, end) to avoid
# allocation of a temporary slice
#
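
# A minimal sketch of the trick, not part of the real module: bounding
# str.find() to a window avoids allocating content[start:end] first.
# Plain strings stand in for the ASDL-generated Token/SourceLine types.
#
#   line = 'echo foo+= bar'
#   col, length = 5, 5                  # token spans 'foo+='
#   slow = line[col:col + length].endswith('+=')   # allocates a slice
#   fast = line.find('+=', col + length - 2, col + length) != -1  # no slice
#   assert slow and fast
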
def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+="""
    i = tok.col + tok.length - 2  # 'foo+='[-2] is '+'
    return tok.line.content.find('+', i, i + 1) != -1

def TokenContains(tok, substr):
    # type: (Token, str) -> bool
    return tok.line.content.find(substr, tok.col, tok.col + tok.length) != -1


def TokenEquals(tok, s):
    # type: (Token, str) -> bool
    if len(s) != tok.length:
        return False
    return TokenContains(tok, s)


def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool
    return tok.line.content.find(s, tok.col, tok.col + len(s)) != -1


def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool
    end = tok.col + tok.length
    return tok.line.content.find(s, end - len(s), end) != -1


# Also: IsWhitespace, IsLeadingSpace

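# How the bounded find() calls above behave, sketched with plain strings
# (stand-ins for Token fields, not part of the real module): a window exactly
# len(s) wide means a hit can only start at the window's start, which turns
# find() into a positional prefix/suffix test.
#
#   content = 'xx foo=bar yy'
#   col, length = 3, 7                  # token spans 'foo=bar'
#   assert content.find('foo', col, col + 3) != -1       # TokenStartsWith
#   end = col + length
#   assert content.find('bar', end - 3, end) != -1       # TokenEndsWith
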
def TokenVal(tok):
    # type: (Token) -> str
    """Compute string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert left_index > 0
    start = tok.col + left_index
    return tok.line.content[start:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert right_index < 0
    end = tok.col + tok.length + right_index
    return tok.line.content[tok.col:end]


def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice token directly, without creating an intermediate string."""
    assert left > 0, left
    assert right < 0, right
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]

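# The slicing convention, sketched with plain strings (hypothetical values,
# not part of the real module): left is a positive offset from the token
# start, right is a negative offset from the token end, mirroring s[1:-1].
#
#   content = "echo 'hi' x"
#   col, length = 5, 4                  # token spans 'hi' with its quotes
#   start = col + 1                     # like TokenSliceLeft(tok, 1)
#   end = col + length + (-1)           # like TokenSliceRight(tok, -1)
#   assert content[start:end] == 'hi'
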
def LazyStr(tok):
    # type: (Token) -> str
    """Materialize the tval on demand, with a special case for $myvar.

    Most tokens do NOT need strings. We avoid allocating them in the lexer.

    Note: SingleQuoted could have a lazy sval, NOT at the token level.
    """
    if 0:
        LAZY_ID_HIST[tok.id] += 1

    if tok.tval is None:
        if tok.id in (Id.VSub_DollarName, Id.VSub_Number):  # $x or $2
            # Special case for SimpleVarSub - completion also relies on this
            tok.tval = TokenSliceLeft(tok, 1)
        else:
            tok.tval = TokenVal(tok)

    return tok.tval

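# The caching pattern in miniature (a toy class; the real Token is generated
# from ASDL, and this module is translated to C++ by mycpp):
#
#   class Tok(object):
#       def __init__(self, line, col, length):
#           self.line, self.col, self.length = line, col, length
#           self.tval = None              # no string allocated yet
#
#   def lazy_str(t):
#       if t.tval is None:                # first access pays for the slice
#           t.tval = t.line[t.col:t.col + t.length]
#       return t.tval                     # later accesses reuse it
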
def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, length, col, None, val)


class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is subtly
        # incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True

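    # One-byte unread in miniature: because the last token was exactly one
    # character, stepping line_pos back by 1 re-exposes it, and
    # replace_last_token makes the next Read() reuse the last arena slot
    # (via arena.UnreadOne) instead of appending a new token.
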
    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now. Add a dummy line 0 so the Token has a
            # source to display errors.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line)

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for the next non-space token, using
        the given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at end of line, because
                # it would involve interacting with the line reader, and we
                # never need it. In lex_mode_e.ShCommand, there is an explicit
                # newline token, but lex_mode_e.Arith doesn't have one.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these tokens, we could pass them in:
            #   LookPastSpace(lex_mode, past_token_type)
            # - WS_Space is only emitted in lex_mode_e.ShCommand
            # - Id.Ignored_Space is emitted in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type

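    # The skip loop in miniature (a toy token stream stands in for repeated
    # match.OneToken calls; the names here are illustrative only):
    #
    #   def look_past_space(tok_types):
    #       for tok_type in tok_types:
    #           if tok_type not in ('WS_Space', 'Ignored_Space'):
    #               return tok_type       # first interesting token
    #       return 'Unknown_Tok'          # hit end of line
    #
    #   assert look_past_space(['WS_Space', 'Lit_Chars']) == 'Lit_Chars'
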
    def LookAheadDParens(self, num_bytes_back):
        # type: (int) -> bool
        """Assuming we see ((, is there a matching )) for arithmetic?

        Or is it two subshells with commands?

        Returns True for arithmetic like:
            ((expr))
            ((expr || (expr) && expr))

        Returns False for commands like:
            ((command) )
            ((command); (command))
            (( (command) ); (command))

        Args:
          num_bytes_back: subtract this amount from the current line position
        """
        original_pos = self.line_pos
        first_rparen = False  # Is the previous token a ')' ?
        parens_counter = 2

        pos = self.line_pos
        while True:
            tok_type, end_pos = match.OneToken(lex_mode_e.Arith,
                                               self.src_line.content, pos)
            if end_pos == pos:  # end of line
                break
            pos = end_pos

            if tok_type in (Id.Arith_LParen, Id.Left_DollarParen):  # ( or $(
                parens_counter += 1

            if (tok_type == Id.Arith_RParen and first_rparen and
                    parens_counter == 1):
                return True  # saw the closing ))

            if tok_type == Id.Arith_RParen:
                parens_counter -= 1
                first_rparen = True
            elif first_rparen and parens_counter == 1:
                # We hit a case like this:
                #     ((command); (command))
                #
                # The preceding ) isn't immediately followed by another one,
                # and we haven't seen the matching opening (.
                # So this is not an arithmetic expression.
                break
            else:
                first_rparen = False

        # We may have to go back one byte for arithmetic
        self.line_pos -= num_bytes_back
        return False

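    # The paren counting in miniature (a character-level sketch that ignores
    # quoting and $(); the real method sees Arith-mode tokens instead):
    #
    #   def is_arith(s):  # s is everything after the leading (( on one line
    #       depth, prev_rparen = 2, False
    #       for ch in s:
    #           if ch == '(':
    #               depth += 1
    #           elif ch == ')':
    #               if prev_rparen and depth == 1:
    #                   return True       # found the closing ))
    #               depth -= 1
    #               prev_rparen = True
    #               continue
    #           elif prev_rparen and depth == 1:
    #               return False          # lone ) that isn't part of ))
    #           prev_rparen = False
    #       return False
    #
    #   assert is_arith('1 + 2))')
    #   assert not is_arith('echo); (echo))')
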
    def LookAheadFuncParens(self, num_bytes_back):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          num_bytes_back: either 0 or 1, for the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case. This is not recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - num_bytes_back
        assert pos > 0, pos
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens

    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        # line_pos is already past the current one-byte token, so -2 is the
        # byte that precedes it
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # NOTE: We're putting the arena hook in LineLexer and not Lexer because
        # we want it to be "low level". The only thing fabricated here is a
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line)

        self.line_pos = end_pos
        return t


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with
    line_lexer, and return them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadDParens(self, shift_back):
        # type: (int) -> bool
        return self.line_lexer.LookAheadDParens(shift_back)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Use cases: Id.Op_RParen -> Id.Right_Subshell -- disambiguate
        Id.Op_RParen -> Id.Eof_RParen.

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('  PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))

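    # The hint stack in miniature: the parser pushes a pending one-shot
    # retranslation, and _Read() rewrites the next matching token id.
    # (Toy string ids; the real ones are integers from id_kind_asdl.)
    #
    #   stack = [('Op_RParen', 'Eof_RParen')]   # pushed on entering $(
    #   tok_id = 'Op_RParen'                    # lexer just produced ')'
    #   if len(stack) and tok_id == stack[-1][0]:
    #       tok_id = stack.pop()[1]             # parser sees Eof_RParen
    #   assert tok_id == 'Eof_RParen'
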
    def MoveToNextLine(self):
        # type: () -> bool
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line. For
        example, take this case:

            case (x) {
                     ^--- We are here

              (else) {
              ^--- We want lookahead to here

                echo test
              }
            }

        But, without `MoveToNextLine`, it is impossible to peek the '(' without
        consuming it. And consuming it would be a problem once we want to hand
        off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        if src_line is None:
            return False  # EOF, so we failed at moving to the next line

        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
        return True

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

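    # The refill protocol in miniature: the line lexer returns the Eol_Tok
    # sentinel at end of line, and this method responds by fetching a new
    # line and retrying. (Toy stand-ins for LineLexer and _Reader.)
    #
    #   def read(line_lexer, line_reader):
    #       t = line_lexer.read()
    #       if t == 'Eol_Tok':
    #           line = line_reader.get_line()
    #           if line is None:
    #               return 'Eof_Real'     # no more input
    #           line_lexer.reset(line)
    #           t = line_lexer.read()
    #       return t
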
    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            if t.id != Id.Ignored_LineCont:
                break

        if 0:
            ID_HIST[t.id] += 1
        return t

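# Why Lexer.Read() loops: a backslash at the end of a line lexes as
# Id.Ignored_LineCont, so filtering it out makes
#
#   echo one \
#        two
#
# tokenize the same as 'echo one two' would, apart from the line boundary.
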
if 0:  # mylib.PYTHON: note: breaks tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
    LAZY_ID_HIST = collections.Counter()  # type: Counter[Id_t]