builtin/printf

OILS / builtin / printf_osh.py View on Github | oilshell.org

552 lines, 369 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import time as time_ # avoid name conflict
5
6	from _devbuild.gen import arg_types
7	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
8	from _devbuild.gen.runtime_asdl import cmd_value
9	from _devbuild.gen.syntax_asdl import (
10	loc,
11	loc_e,
12	loc_t,
13	source,
14	Token,
15	CompoundWord,
16	printf_part,
17	printf_part_e,
18	printf_part_t,
19	)
20	from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
21	from _devbuild.gen.value_asdl import (value, value_e)
22
23	from core import alloc
24	from core import error
25	from core.error import p_die
26	from core import state
27	from core import vm
28	from frontend import flag_util
29	from frontend import consts
30	from frontend import lexer
31	from frontend import match
32	from frontend import reader
33	from mycpp import mops
34	from mycpp import mylib
35	from mycpp.mylib import log
36	from osh import sh_expr_eval
37	from osh import string_ops
38	from osh import word_compile
39	from data_lang import j8_lite
40
41	import posix_ as posix
42
43	from typing import Dict, List, Optional, TYPE_CHECKING, cast
44
45	if TYPE_CHECKING:
46	from display import ui
47	from frontend import parse_lib
48
49	_ = log
50
51
class _FormatStringParser(object):
    """Turns the tokens of a printf format string into printf_part_t nodes.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    The Time alternative is the token for %(strftime)T conversions.
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # Placeholders; the first _Next() call fills these in.
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read one token in the given lexer mode and make it current."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """fmt = Percent (Flag | Zero)* width? precision? (Type | Time)

        Expects the Percent token to be current.  On return the Type/Time
        token is still current; Parse() advances past it.

        Unsupported flags ('#', ' ', '+'), unsupported conversions (floating
        point, %c) and a missing/invalid conversion character are reported
        with p_die().
        """
        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        self._Next(lex_mode_e.PrintfPercent)  # move past %

        # (Flag | Zero)*
        while self.token_type in (Id.Format_Flag, Id.Format_Zero):
            # space and + could be implemented
            flag_char = lexer.TokenVal(
                self.cur_token)  # allocation will be cached
            if flag_char in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_char,
                      self.cur_token)

            percent.flags.append(self.cur_token)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if self.token_type in (Id.Format_Num, Id.Format_Star):
            percent.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?
        if self.token_type == Id.Format_Dot:
            # A bare '.' is kept as the precision token unless a number or
            # star follows it.
            percent.precision = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot

            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)

        # (Type | Time) is mandatory
        if self.token_type not in (Id.Format_Type, Id.Format_Time):
            if self.token_type == Id.Unknown_Tok:
                p_die('Invalid printf format character', self.cur_token)
            else:
                p_die('Expected a printf format character', self.cur_token)

        percent.type = self.cur_token

        # ADDITIONAL VALIDATION outside the "grammar".
        conv = lexer.TokenVal(percent.type)  # allocation will be cached
        if conv in 'eEfFgG':
            p_die("osh printf doesn't support floating point", percent.type)
        # These two could be implemented.  %c needs utf-8 decoding.
        if conv == 'c':
            p_die("osh printf doesn't support single characters (bytes)",
                  percent.type)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """printf_format = part* Eof_Real

        Returns the parts in source order: literal tokens are appended as-is,
        and each % conversion becomes the node built by _ParseFormatStr().
        """
        parts = []  # type: List[printf_part_t]

        self._Next(lex_mode_e.PrintfOuter)
        while True:
            tok_id = self.token_type

            # Note: like in echo -e, we don't fail with Unknown_Backslash here
            # when shopt -u parse_backslash because it's at runtime rather
            # than parse time.
            # Users should use $'' or the future static printf ${x %.3f}.
            is_literal = (self.token_kind in (Kind.Lit, Kind.Char) or
                          tok_id in (Id.Format_EscapedPercent,
                                     Id.Unknown_Backslash))

            if is_literal:
                parts.append(self.cur_token)

            elif tok_id == Id.Format_Percent:
                parts.append(self._ParseFormatStr())

            elif tok_id in (Id.Eof_Real, Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(Id_str(tok_id))

            self._Next(lex_mode_e.PrintfOuter)

        return parts
157
158
159	class _PrintfState(object):
160
161	def __init__(self):
162	# type: () -> None
163	self.arg_index = 0
164	self.backslash_c = False
165	self.status = 0 # set to 1 before returning
166
167
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]

    The format string is parsed by _FormatStringParser (parse results are
    cached per format string), then evaluated against the arguments.  The
    result is written to stdout, or assigned to a variable with -v.
    """

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        # Parsed parts, keyed by the raw format string
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # this object initialized in main()
        self.shell_start_time = time_.time()

    def _Percent(
            self,
            pr,  # type: _PrintfState
            part,  # type: printf_part.Percent
            varargs,  # type: List[str]
            locs,  # type: List[CompoundWord]
    ):
        # type: (...) -> Optional[str]
        """Format a single % conversion.

        Args:
          pr: state of this printf call.  pr.arg_index is advanced for every
            argument consumed (a '*' width, a '*' precision, and the value),
            and pr.backslash_c is set when a %b argument contains \\c.
          part: the parsed conversion
          varargs: all arguments after the format string
          locs: locations parallel to varargs, used to blame errors

        Returns:
          The formatted string.  On error, a message is printed with
          self.errfmt, pr.status is set to 1, and None is returned.
        """
        num_args = len(varargs)

        # TODO: Cache this?
        flags = []  # type: List[str]
        if len(part.flags) > 0:
            for flag_token in part.flags:
                flags.append(lexer.TokenVal(flag_token))

        width = -1  # nonexistent
        if part.width:
            if part.width.id in (Id.Format_Num, Id.Format_Zero):
                width_str = lexer.TokenVal(part.width)
                width_loc = part.width  # type: loc_t
            elif part.width.id == Id.Format_Star:  # depends on data
                if pr.arg_index < num_args:
                    width_str = varargs[pr.arg_index]
                    width_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    width_str = ''  # invalid
                    width_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                width = int(width_str)
            except ValueError:
                # No argument to blame, so blame the '*' in the format
                if width_loc.tag() == loc_e.Missing:
                    width_loc = part.width
                self.errfmt.Print_("printf got invalid width %r" % width_str,
                                   blame_loc=width_loc)
                pr.status = 1
                return None

        precision = -1  # nonexistent
        if part.precision:
            if part.precision.id == Id.Format_Dot:
                # A bare '.' means precision 0
                precision_str = '0'
                precision_loc = part.precision  # type: loc_t
            elif part.precision.id in (Id.Format_Num, Id.Format_Zero):
                precision_str = lexer.TokenVal(part.precision)
                precision_loc = part.precision
            elif part.precision.id == Id.Format_Star:
                if pr.arg_index < num_args:
                    precision_str = varargs[pr.arg_index]
                    precision_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    precision_str = ''
                    precision_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                precision = int(precision_str)
            except ValueError:
                # No argument to blame, so blame the '*' in the format
                if precision_loc.tag() == loc_e.Missing:
                    precision_loc = part.precision
                self.errfmt.Print_('printf got invalid precision %r' %
                                   precision_str,
                                   blame_loc=precision_loc)
                pr.status = 1
                return None

        # The value to format; a missing argument is treated as ''
        if pr.arg_index < num_args:
            s = varargs[pr.arg_index]
            word_loc = locs[pr.arg_index]  # type: loc_t
            pr.arg_index += 1
            has_arg = True
        else:
            s = ''
            word_loc = loc.Missing
            has_arg = False

        # Note: %s could be lexed into Id.Percent_S.  Although small string
        # optimization would remove the allocation as well.
        typ = lexer.TokenVal(part.type)
        if typ == 's':
            if precision >= 0:
                s = s[:precision]  # truncate

        elif typ == 'q':
            # Most shells give \' for single quote, while OSH gives
            # $'\'' this could matter when SSH'ing.
            # Ditto for $'\\' vs. '\'

            s = j8_lite.MaybeShellEncode(s)

        elif typ == 'b':
            # Process just like echo -e, except \c handling is simpler.

            c_parts = []  # type: List[str]
            lex = match.EchoLexer(s)
            while True:
                id_, tok_val = lex.Next()
                if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                    break

                p = word_compile.EvalCStringToken(id_, tok_val)

                # Unusual behavior: '\c' aborts processing!
                if p is None:
                    pr.backslash_c = True
                    break

                c_parts.append(p)
            s = ''.join(c_parts)

        elif part.type.id == Id.Format_Time or typ in 'diouxX':
            # %(...)T and %d share this complex integer conversion logic

            if match.LooksLikeInteger(s):
                # Note: spaces like ' -42 ' accepted and normalized
                ok, d = mops.FromStr2(s)
                if not ok:
                    self.errfmt.Print_("Integer too big: %s" % s, word_loc)
                    pr.status = 1
                    return None

            else:
                # Check for 'a and "a
                # These are interpreted as the numeric ASCII value of 'a'
                num_bytes = len(s)
                if num_bytes > 0 and s[0] in '\'"':
                    if num_bytes == 1:
                        # NUL after quote
                        d = mops.ZERO
                    elif num_bytes == 2:
                        # Allow invalid UTF-8, because all shells do
                        d = mops.IntWiden(ord(s[1]))
                    else:
                        try:
                            small_i = string_ops.DecodeUtf8Char(s, 1)
                        except error.Expr as e:
                            # Take the numeric value of first char, ignoring
                            # the rest of the bytes.
                            # Something like strict_arith or strict_printf
                            # could throw an error in this case.
                            self.errfmt.Print_(
                                'Warning: %s' % e.UserErrorString(), word_loc)
                            small_i = ord(s[1])

                        d = mops.IntWiden(small_i)

                # No argument means -1 for %(...)T as in Bash Reference Manual
                # 4.2 - "If no argument is specified, conversion behaves as if
                # -1 had been given."
                elif not has_arg and part.type.id == Id.Format_Time:
                    d = mops.MINUS_ONE

                else:
                    if has_arg:
                        blame_loc = word_loc  # type: loc_t
                    else:
                        blame_loc = part.type
                    self.errfmt.Print_(
                        'printf expected an integer, got %r' % s, blame_loc)
                    pr.status = 1
                    return None

            if part.type.id == Id.Format_Time:
                # Initialize timezone:
                #   `localtime' uses the current timezone information
                #   initialized by `tzset'.  The function `tzset' refers to
                #   the environment variable `TZ'.  When the exported variable
                #   `TZ' is present, its value should be reflected in the real
                #   environment variable `TZ' before call of `tzset'.
                #
                # Note: unlike LANG, TZ doesn't seem to change behavior if it's
                # not exported.
                #
                # TODO: In YSH, provide an API that doesn't rely on libc's
                # global state.

                tzcell = self.mem.GetCell('TZ')
                if (tzcell and tzcell.exported and
                        tzcell.val.tag() == value_e.Str):
                    tzval = cast(value.Str, tzcell.val)
                    posix.putenv('TZ', tzval.s)

                time_.tzset()

                # Handle special values:
                #   User can specify two special values -1 and -2 as in Bash
                #   Reference Manual 4.2: "Two special argument values may be
                #   used: -1 represents the current time, and -2 represents the
                #   time the shell was invoked." from
                #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                if mops.Equal(d, mops.MINUS_ONE):  # -1 is current time
                    # TODO: 2038 problem
                    ts = time_.time()
                elif mops.Equal(d, mops.MINUS_TWO):  # -2 is shell start time
                    ts = self.shell_start_time
                else:
                    ts = mops.BigTruncate(d)

                # The token value looks like (FORMAT)T; the slice strips the
                # leading '(' and the trailing ')T'.
                s = time_.strftime(typ[1:-2], time_.localtime(ts))
                if precision >= 0:
                    s = s[:precision]  # truncate

            else:  # typ in 'diouxX'
                # Disallowed because it depends on 32- or 64- bit
                if mops.Greater(mops.ZERO, d) and typ in 'ouxX':
                    # TODO: Don't truncate it
                    self.errfmt.Print_(
                        "Can't format negative number with %%%s: %d" %
                        (typ, mops.BigTruncate(d)), part.type)
                    pr.status = 1
                    return None

                if typ == 'o':
                    s = mops.ToOctal(d)
                elif typ == 'x':
                    s = mops.ToHexLower(d)
                elif typ == 'X':
                    s = mops.ToHexUpper(d)
                else:  # diu
                    s = mops.ToStr(d)  # without spaces like ' -42 '

                # There are TWO different ways to ZERO PAD, and they differ on
                # the negative sign!  See spec/builtin-printf

                zero_pad = 0  # no zero padding
                if width >= 0 and '0' in flags:
                    zero_pad = 1  # style 1
                elif precision > 0:
                    # The precision is a minimum number of DIGITS, so the
                    # decision can't be made on len(s), which includes the
                    # sign: [%.3d] -42 must become [-042].  rjust() below is
                    # a no-op when there are already enough digits.
                    zero_pad = 2  # style 2

                if zero_pad:
                    negative = (s[0] == '-')
                    if negative:
                        digits = s[1:]
                        sign = '-'
                        if zero_pad == 1:
                            # [%06d] -42 becomes [-00042] (6 TOTAL)
                            n = width - 1
                        else:
                            # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                            n = precision
                    else:
                        digits = s
                        sign = ''
                        if zero_pad == 1:
                            n = width
                        else:
                            n = precision
                    s = sign + digits.rjust(n, '0')

        else:
            raise AssertionError()

        # Pad with spaces up to the field width; the '-' flag left-justifies
        if width >= 0:
            if '-' in flags:
                s = s.ljust(width, ' ')
            else:
                s = s.rjust(width, ' ')
        return s

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Evaluates the parsed format against the arguments, appending each
        piece of output to `out`.  The format is reused until all arguments
        are consumed ('arg recycling').

        Returns:
          0 on success, or the nonzero status set by _Percent() on the first
          conversion error.  Pieces appended before the error remain in
          `out`.
        """
        pr = _PrintfState()
        num_args = len(varargs)

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))

                elif part.tag() == printf_part_e.Percent:
                    part = cast(printf_part.Percent, UP_part)

                    s = self._Percent(pr, part, varargs, locs)
                    if pr.status != 0:
                        return pr.status

                else:
                    raise AssertionError()

                out.append(s)

                if pr.backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if pr.backslash_c:
                # \c ends ALL output: the rest of the format was skipped
                # above, and the remaining arguments must be ignored too
                # rather than recycled through the format.
                break

            if pr.arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if pr.arg_index >= num_args:
                # We printed all args
                break
            # If there are more args, keep going.  This implement 'arg recycling'
            # behavior
            #   printf '%s ' 1 2 3  => 1 2 3

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns:
          0 on success, 2 if the format string fails to parse, or the
          nonzero status from _Format() if a conversion fails (in which case
          nothing is written or assigned).
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named fmt_lexer so the imported 'lexer' module isn't shadowed
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.Dynamic('printf arg', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            # Only successfully parsed formats are cached
            self.parse_cache[fmt] = parts

        if 0:
            print()
            for part in parts:
                part.PrettyPrint()
                print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0