builtin/printf

OILS / builtin / printf_osh.py View on Github | oilshell.org

549 lines, 365 significant

1	#!/usr/bin/env python2
2	"""Builtin_printf.py."""
3	from __future__ import print_function
4
5	import time as time_ # avoid name conflict
6
7	from _devbuild.gen import arg_types
8	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
9	from _devbuild.gen.runtime_asdl import cmd_value
10	from _devbuild.gen.syntax_asdl import (
11	loc,
12	loc_e,
13	loc_t,
14	source,
15	Token,
16	CompoundWord,
17	printf_part,
18	printf_part_e,
19	printf_part_t,
20	)
21	from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22	from _devbuild.gen.value_asdl import (value, value_e)
23
24	from core import alloc
25	from core import error
26	from core.error import e_die, p_die
27	from core import state
28	from core import vm
29	from frontend import flag_util
30	from frontend import consts
31	from frontend import lexer
32	from frontend import match
33	from frontend import reader
34	from mycpp import mops
35	from mycpp import mylib
36	from mycpp.mylib import log
37	from osh import sh_expr_eval
38	from osh import string_ops
39	from osh import word_compile
40	from data_lang import j8_lite
41
42	import posix_ as posix
43
44	from typing import Dict, List, Optional, TYPE_CHECKING, cast
45
46	if TYPE_CHECKING:
47	from display import ui
48	from frontend import parse_lib
49
50	_ = log
51
52
class _FormatStringParser(object):
    """Parser that turns a printf format string into a list of parts.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    Tokens are pulled from the lexer one at a time; the current token, its Id
    and its Kind are kept on the instance as one-token lookahead.
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # Lookahead state.  Nothing has been read yet, so these hold
        # placeholder values until the first _Next() call.
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read one token in the given lexer mode and refresh the lookahead."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse one 'fmt' production; the current token is the leading %.

        On return the current token is still the conversion Type/Time token;
        the caller advances past it.
        """
        self._Next(lex_mode_e.PrintfPercent)  # move past %

        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while True:
            if self.token_type not in (Id.Format_Flag, Id.Format_Zero):
                break

            # space and + could be implemented
            flag_str = lexer.TokenVal(
                self.cur_token)  # allocation will be cached
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      self.cur_token)

            percent.flags.append(self.cur_token)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if self.token_type in (Id.Format_Num, Id.Format_Star):
            percent.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?  A bare dot is recorded as the precision token itself; a
        # number / star / zero following the dot replaces it.
        if self.token_type == Id.Format_Dot:
            dot_tok = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)
            else:
                percent.precision = dot_tok

        # (Type | Time)
        if self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        elif self.token_type not in (Id.Format_Type, Id.Format_Time):
            p_die('Expected a printf format character', self.cur_token)

        else:
            percent.type = self.cur_token

            # ADDITIONAL VALIDATION outside the "grammar".
            conv = lexer.TokenVal(percent.type)  # allocation will be cached
            if conv in 'eEfFgG':
                p_die("osh printf doesn't support floating point",
                      percent.type)
            # These two could be implemented.  %c needs utf-8 decoding.
            if conv == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      percent.type)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the whole format string.

        Returns:
          The parts in source order: literal tokens are kept as-is, and each
          % directive becomes a printf_part.Percent.
        """
        result = []  # type: List[printf_part_t]

        self._Next(lex_mode_e.PrintfOuter)
        while True:
            tok_id = self.token_type

            if tok_id in (Id.Eof_Real, Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            if tok_id == Id.Format_Percent:
                result.append(self._ParseFormatStr())

            elif (self.token_kind in (Kind.Lit, Kind.Char) or
                  tok_id in (Id.Format_EscapedPercent, Id.Unknown_Backslash)):
                # Note: like in echo -e, we don't fail with Unknown_Backslash
                # here when shopt -u parse_backslash because it's at runtime
                # rather than parse time.
                # Users should use $'' or the future static printf ${x %.3f}.
                result.append(self.cur_token)

            else:
                raise AssertionError(Id_str(tok_id))

            self._Next(lex_mode_e.PrintfOuter)

        return result
158
159
160	class _PrintfState(object):
161
162	def __init__(self):
163	# type: () -> None
164	self.arg_index = 0
165	self.backslash_c = False
166	self.status = 0 # set to 1 before returning
167
168
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]

    The format string is parsed once into a list of parts (cached per format
    text), and the parts are then applied to the arguments, re-using the
    format as long as arguments remain.
    """

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        # Parsed format strings keyed by the raw format text, so that a
        # format used repeatedly is only parsed once.  Never evicted.
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # this object initialized in main()
        # Used for the special %(...)T argument -2.
        self.shell_start_time = time_.time()

    def _ZeroPadInt(self, s, flags, width, precision):
        # type: (str, List[str], int, int) -> str
        """Apply zero padding to an already-converted integer string.

        There are TWO different ways to ZERO PAD, and they differ on the
        negative sign!  See spec/builtin-printf

          style 1 ('0' flag + width): width is the TOTAL length, sign included
            [%06d]  -42 becomes [-00042]
          style 2 (precision): precision is the minimum number of DIGITS
            [%6.6d] -42 becomes [-000042]  (1 for '-' + 6)

        Args:
          s: non-empty decimal/octal/hex string, optionally starting with '-'
          flags: the flag characters of the directive
          width: field width, or -1 if absent
          precision: precision, or -1 if absent

        Returns:
          s with zeros inserted after the sign, or s unchanged when neither
          style applies.
        """
        if s.startswith('-'):
            sign = '-'
            digits = s[1:]
        else:
            sign = ''
            digits = s

        if width >= 0 and '0' in flags:
            # style 1: the sign takes one column of the width
            n = width - len(sign)
        elif precision > 0 and len(digits) < precision:
            # style 2: compare DIGITS only, so that [%.3d] -42 becomes [-042]
            # rather than being left alone because len('-42') == 3.
            n = precision
        else:
            return s

        return sign + digits.rjust(n, '0')

    def _Percent(
            self,
            pr,  # type: _PrintfState
            part,  # type: printf_part.Percent
            varargs,  # type: List[str]
            locs,  # type: List[CompoundWord]
    ):
        # type: (...) -> Optional[str]
        """Format a single % directive, consuming arguments as needed.

        Args:
          pr: formatter state; arg_index is advanced for every argument
              consumed (including '*' width / precision), backslash_c is set
              when a %b argument contains \\c, and status is set to 1 on error
          part: the parsed directive
          varargs: all printf arguments after the format
          locs: source locations parallel to varargs, used to blame errors

        Returns:
          The formatted text, or None after an error has been printed and
          pr.status has been set to 1.

        Raises (via e_die):
          When an integer argument is too big, or a negative number is given
          to %o %u %x %X.
        """
        num_args = len(varargs)

        # TODO: Cache this?
        flags = []  # type: List[str]
        for flag_token in part.flags:
            flags.append(lexer.TokenVal(flag_token))

        width = -1  # nonexistent
        if part.width:
            if part.width.id in (Id.Format_Num, Id.Format_Zero):
                width_str = lexer.TokenVal(part.width)
                width_loc = part.width  # type: loc_t
            elif part.width.id == Id.Format_Star:  # depends on data
                if pr.arg_index < num_args:
                    width_str = varargs[pr.arg_index]
                    width_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    width_str = ''  # invalid
                    width_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                width = int(width_str)
            except ValueError:
                # No argument to blame, so blame the '*' in the format
                if width_loc.tag() == loc_e.Missing:
                    width_loc = part.width
                self.errfmt.Print_("printf got invalid width %r" % width_str,
                                   blame_loc=width_loc)
                pr.status = 1
                return None

        precision = -1  # nonexistent
        if part.precision:
            if part.precision.id == Id.Format_Dot:
                # A bare '.' means precision 0
                precision_str = '0'
                precision_loc = part.precision  # type: loc_t
            elif part.precision.id in (Id.Format_Num, Id.Format_Zero):
                precision_str = lexer.TokenVal(part.precision)
                precision_loc = part.precision
            elif part.precision.id == Id.Format_Star:
                if pr.arg_index < num_args:
                    precision_str = varargs[pr.arg_index]
                    precision_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    precision_str = ''
                    precision_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                precision = int(precision_str)
            except ValueError:
                if precision_loc.tag() == loc_e.Missing:
                    precision_loc = part.precision
                self.errfmt.Print_('printf got invalid precision %r' %
                                   precision_str,
                                   blame_loc=precision_loc)
                pr.status = 1
                return None

        # The argument being converted, if there is one left
        if pr.arg_index < num_args:
            s = varargs[pr.arg_index]
            word_loc = locs[pr.arg_index]  # type: loc_t
            pr.arg_index += 1
            has_arg = True
        else:
            s = ''
            word_loc = loc.Missing
            has_arg = False

        # Note: %s could be lexed into Id.Percent_S.  Although small string
        # optimization would remove the allocation as well.
        typ = lexer.TokenVal(part.type)
        if typ == 's':
            if precision >= 0:
                s = s[:precision]  # truncate

        elif typ == 'q':
            # Most shells give \' for single quote, while OSH gives
            # $'\''  this could matter when SSH'ing.
            # Ditto for $'\\' vs. '\'

            s = j8_lite.MaybeShellEncode(s)

        elif typ == 'b':
            # Process just like echo -e, except \c handling is simpler.

            c_parts = []  # type: List[str]
            lex = match.EchoLexer(s)
            while True:
                id_, tok_val = lex.Next()
                if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                    break

                p = word_compile.EvalCStringToken(id_, tok_val)

                # Unusual behavior: '\c' aborts processing!
                if p is None:
                    pr.backslash_c = True
                    break

                c_parts.append(p)
            s = ''.join(c_parts)

        elif part.type.id == Id.Format_Time or typ in 'diouxX':
            # %(...)T and %d share this complex integer conversion logic

            if match.LooksLikeInteger(s):
                # Note: spaces like ' -42 ' accepted and normalized
                ok, d = mops.FromStr2(s)
                if not ok:
                    e_die("Integer too big: %s" % s, word_loc)

            else:
                # Check for 'a and "a
                # These are interpreted as the numeric ASCII value of 'a'
                num_bytes = len(s)
                if num_bytes > 0 and s[0] in '\'"':
                    if num_bytes == 1:
                        # NUL after quote
                        d = mops.ZERO
                    elif num_bytes == 2:
                        # Allow invalid UTF-8, because all shells do
                        d = mops.IntWiden(ord(s[1]))
                    else:
                        try:
                            small_i = string_ops.DecodeUtf8Char(s, 1)
                        except error.Expr as e:
                            # Take the numeric value of first char, ignoring
                            # the rest of the bytes.
                            # Something like strict_arith or strict_printf
                            # could throw an error in this case.
                            self.errfmt.Print_(
                                'Warning: %s' % e.UserErrorString(), word_loc)
                            small_i = ord(s[1])

                        d = mops.IntWiden(small_i)

                # No argument means -1 for %(...)T as in Bash Reference Manual
                # 4.2 - "If no argument is specified, conversion behaves as if
                # -1 had been given."
                elif not has_arg and part.type.id == Id.Format_Time:
                    d = mops.MINUS_ONE

                else:
                    if has_arg:
                        blame_loc = word_loc  # type: loc_t
                    else:
                        blame_loc = part.type
                    self.errfmt.Print_(
                        'printf expected an integer, got %r' % s, blame_loc)
                    pr.status = 1
                    return None

            if part.type.id == Id.Format_Time:
                # Initialize timezone:
                #   `localtime' uses the current timezone information
                #   initialized by `tzset'.  The function `tzset' refers to
                #   the environment variable `TZ'.  When the exported
                #   variable `TZ' is present, its value should be reflected
                #   in the real environment variable `TZ' before call of
                #   `tzset'.
                #
                # Note: unlike LANG, TZ doesn't seem to change behavior if
                # it's not exported.
                #
                # TODO: In YSH, provide an API that doesn't rely on libc's
                # global state.

                tzcell = self.mem.GetCell('TZ')
                if (tzcell and tzcell.exported and
                        tzcell.val.tag() == value_e.Str):
                    tzval = cast(value.Str, tzcell.val)
                    posix.putenv('TZ', tzval.s)

                time_.tzset()

                # Handle special values:
                #   User can specify two special values -1 and -2 as in Bash
                #   Reference Manual 4.2: "Two special argument values may be
                #   used: -1 represents the current time, and -2 represents
                #   the time the shell was invoked." from
                #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                if mops.Equal(d, mops.MINUS_ONE):  # -1 is current time
                    # TODO: 2038 problem
                    ts = time_.time()
                elif mops.Equal(d, mops.MINUS_TWO):  # -2 is shell start time
                    ts = self.shell_start_time
                else:
                    ts = mops.BigTruncate(d)

                # typ looks like '(FORMAT)T'; slice off '(' and ')T'
                s = time_.strftime(typ[1:-2], time_.localtime(ts))
                if precision >= 0:
                    s = s[:precision]  # truncate

            else:  # typ in 'diouxX'
                # Disallowed because it depends on 32- or 64- bit
                if mops.Greater(mops.ZERO, d) and typ in 'ouxX':
                    # TODO: Don't truncate it
                    e_die(
                        "Can't format negative number with %%%s: %d" %
                        (typ, mops.BigTruncate(d)), part.type)

                if typ == 'o':
                    s = mops.ToOctal(d)
                elif typ == 'x':
                    s = mops.ToHexLower(d)
                elif typ == 'X':
                    s = mops.ToHexUpper(d)
                else:  # diu
                    s = mops.ToStr(d)  # without spaces like ' -42 '

                s = self._ZeroPadInt(s, flags, width, precision)

        else:
            raise AssertionError()

        # Space padding to the field width; '-' flag means left-justify
        if width >= 0:
            if '-' in flags:
                s = s.ljust(width, ' ')
            else:
                s = s.rjust(width, ' ')
        return s

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Apply the parsed format to the arguments, appending text to out.

        The format is re-applied while unconsumed arguments remain ("arg
        recycling"):

            printf '%s ' 1 2 3  =>  1 2 3

        Args:
          parts: parsed format string
          varargs: arguments after the format
          locs: source locations parallel to varargs
          out: list that receives the output pieces, in order

        Returns:
          0 on success, or the non-zero status set by a failed directive.
          Pieces produced before the failure remain in out.
        """
        pr = _PrintfState()
        num_args = len(varargs)

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))

                elif part.tag() == printf_part_e.Percent:
                    part = cast(printf_part.Percent, UP_part)

                    s = self._Percent(pr, part, varargs, locs)
                    if pr.status != 0:
                        return pr.status

                else:
                    raise AssertionError()

                out.append(s)

                if pr.backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if pr.backslash_c:
                # \c ends ALL output, not just this pass over the format:
                # the remaining arguments must not be recycled.
                break
            if pr.arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if pr.arg_index >= num_args:
                # We printed all args
                break
            # If there are more args, keep going.  This implement 'arg
            # recycling' behavior
            #   printf '%s ' 1 2 3 => 1 2 3

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns:
          0 on success, 2 if the format string fails to parse, or the
          non-zero status of a failed conversion.  On failure nothing is
          written or assigned.
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named fmt_lexer so the imported 'lexer' module isn't shadowed
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.Dynamic('printf arg', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            self.parse_cache[fmt] = parts

        if 0:  # flip to dump the parsed format when debugging
            print()
            for part in parts:
                part.PrettyPrint()
                print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # printf -v var: assign instead of printing
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0