OILS / builtin / printf_osh.py View on Github | oilshell.org

549 lines, 365 significant
1#!/usr/bin/env python2
2"""Builtin_printf.py."""
3from __future__ import print_function
4
5import time as time_ # avoid name conflict
6
7from _devbuild.gen import arg_types
8from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
9from _devbuild.gen.runtime_asdl import cmd_value
10from _devbuild.gen.syntax_asdl import (
11 loc,
12 loc_e,
13 loc_t,
14 source,
15 Token,
16 CompoundWord,
17 printf_part,
18 printf_part_e,
19 printf_part_t,
20)
21from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22from _devbuild.gen.value_asdl import (value, value_e)
23
24from core import alloc
25from core import error
26from core.error import e_die, p_die
27from core import state
28from core import vm
29from frontend import flag_util
30from frontend import consts
31from frontend import lexer
32from frontend import match
33from frontend import reader
34from mycpp import mops
35from mycpp import mylib
36from mycpp.mylib import log
37from osh import sh_expr_eval
38from osh import string_ops
39from osh import word_compile
40from data_lang import j8_lite
41
42import posix_ as posix
43
44from typing import Dict, List, Optional, TYPE_CHECKING, cast
45
46if TYPE_CHECKING:
47 from display import ui
48 from frontend import parse_lib
49
50_ = log
51
52
class _FormatStringParser(object):
    """Parser that turns a printf format string into a list of parts.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer

    'Time' is the token for the %(strftime)T conversion.
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # One token of lookahead.  These are placeholders until the first
        # call to _Next().
        self.token_kind = Kind.Undefined  # type: Kind_t
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.cur_token = None  # type: Token

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read the next token in lex_mode and cache its id and kind."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """Parse a single conversion: fmt = Percent ... (Type | Time).

        Called with the current token on the '%'.  On return the current
        token is still the Type/Time token; the caller advances past it.

        Raises a parse error (via p_die) for unsupported flags, unsupported
        conversion types, and a missing or invalid format character.
        """
        self._Next(lex_mode_e.PrintfPercent)  # move past %

        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while (self.token_type == Id.Format_Flag or
               self.token_type == Id.Format_Zero):
            # allocation will be cached
            flag_str = lexer.TokenVal(self.cur_token)
            if flag_str in '# +':
                # space and + could be implemented
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      self.cur_token)

            percent.flags.append(self.cur_token)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if (self.token_type == Id.Format_Num or
                self.token_type == Id.Format_Star):
            percent.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?
        if self.token_type == Id.Format_Dot:
            dot_tok = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)
            else:
                # A bare '.' is recorded as the Dot token itself.
                percent.precision = dot_tok

        # (Type | Time) is mandatory.
        if self.token_type not in (Id.Format_Type, Id.Format_Time):
            if self.token_type == Id.Unknown_Tok:
                p_die('Invalid printf format character', self.cur_token)
            p_die('Expected a printf format character', self.cur_token)

        percent.type = self.cur_token

        # ADDITIONAL VALIDATION outside the "grammar".
        conv = lexer.TokenVal(percent.type)  # allocation will be cached
        if conv in 'eEfFgG':
            p_die("osh printf doesn't support floating point", percent.type)
        # These two could be implemented.  %c needs utf-8 decoding.
        if conv == 'c':
            p_die("osh printf doesn't support single characters (bytes)",
                  percent.type)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """Parse the whole format string: printf_format = part* Eof_Real.

        Returns the parts in order: literal tokens are appended as-is, and
        each %-conversion becomes the result of _ParseFormatStr().
        """
        result = []  # type: List[printf_part_t]

        self._Next(lex_mode_e.PrintfOuter)
        while True:
            tok_id = self.token_type
            is_literal = (self.token_kind in (Kind.Lit, Kind.Char) or
                          tok_id in (Id.Format_EscapedPercent,
                                     Id.Unknown_Backslash))

            if is_literal:
                # Note: like in echo -e, we don't fail with Unknown_Backslash
                # here when shopt -u parse_backslash because it's at runtime
                # rather than parse time.
                # Users should use $'' or the future static printf ${x %.3f}.
                result.append(self.cur_token)

            elif tok_id == Id.Format_Percent:
                result.append(self._ParseFormatStr())

            elif tok_id in (Id.Eof_Real, Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(Id_str(tok_id))

            self._Next(lex_mode_e.PrintfOuter)

        return result
158
159
160class _PrintfState(object):
161
162 def __init__(self):
163 # type: () -> None
164 self.arg_index = 0
165 self.backslash_c = False
166 self.status = 0 # set to 1 before returning
167
168
class Printf(vm._Builtin):
    """The 'printf' builtin: printf [-v var] format [argument ...]

    Format strings are parsed with _FormatStringParser and the parsed parts
    are cached per format string in self.parse_cache.
    """

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # this object initialized in main()
        # Used for the special value -2 of %(...)T.
        self.shell_start_time = time_.time()

    def _Percent(
            self,
            pr,  # type: _PrintfState
            part,  # type: printf_part.Percent
            varargs,  # type: List[str]
            locs,  # type: List[CompoundWord]
    ):
        # type: (...) -> Optional[str]
        """Format one %-conversion, consuming arguments as needed.

        Args:
          pr: shared state; arg_index is advanced for every consumed argument,
              backslash_c is set when a %b argument contains \\c, and status
              is set to 1 on error.
          part: the parsed conversion (flags, width, precision, type).
          varargs: all arguments after the format string.
          locs: locations parallel to varargs, used for error messages.

        Returns:
          The formatted string, or None after printing an error and setting
          pr.status = 1.

        Fatal errors (integer too big, negative number with %o %u %x %X) are
        raised through e_die().
        """
        num_args = len(varargs)

        # TODO: Cache this?
        flags = []  # type: List[str]
        if len(part.flags) > 0:
            for flag_token in part.flags:
                flags.append(lexer.TokenVal(flag_token))

        width = -1  # nonexistent
        if part.width:
            if part.width.id in (Id.Format_Num, Id.Format_Zero):
                width_str = lexer.TokenVal(part.width)
                width_loc = part.width  # type: loc_t
            elif part.width.id == Id.Format_Star:  # depends on data
                if pr.arg_index < num_args:
                    width_str = varargs[pr.arg_index]
                    width_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    width_str = ''  # invalid
                    width_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                width = int(width_str)
            except ValueError:
                if width_loc.tag() == loc_e.Missing:
                    width_loc = part.width
                self.errfmt.Print_("printf got invalid width %r" % width_str,
                                   blame_loc=width_loc)
                pr.status = 1
                return None

        precision = -1  # nonexistent
        if part.precision:
            if part.precision.id == Id.Format_Dot:
                # A bare '.' means precision 0
                precision_str = '0'
                precision_loc = part.precision  # type: loc_t
            elif part.precision.id in (Id.Format_Num, Id.Format_Zero):
                precision_str = lexer.TokenVal(part.precision)
                precision_loc = part.precision
            elif part.precision.id == Id.Format_Star:
                if pr.arg_index < num_args:
                    precision_str = varargs[pr.arg_index]
                    precision_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    precision_str = ''
                    precision_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                precision = int(precision_str)
            except ValueError:
                if precision_loc.tag() == loc_e.Missing:
                    precision_loc = part.precision
                self.errfmt.Print_('printf got invalid precision %r' %
                                   precision_str,
                                   blame_loc=precision_loc)
                pr.status = 1
                return None

        # The argument to be formatted; a missing argument behaves like ''
        if pr.arg_index < num_args:
            s = varargs[pr.arg_index]
            word_loc = locs[pr.arg_index]  # type: loc_t
            pr.arg_index += 1
            has_arg = True
        else:
            s = ''
            word_loc = loc.Missing
            has_arg = False

        # Note: %s could be lexed into Id.Percent_S.  Although small string
        # optimization would remove the allocation as well.
        typ = lexer.TokenVal(part.type)
        if typ == 's':
            if precision >= 0:
                s = s[:precision]  # truncate

        elif typ == 'q':
            # Most shells give \' for single quote, while OSH gives
            # $'\'' this could matter when SSH'ing.
            # Ditto for $'\\' vs. '\'

            s = j8_lite.MaybeShellEncode(s)

        elif typ == 'b':
            # Process just like echo -e, except \c handling is simpler.

            c_parts = []  # type: List[str]
            lex = match.EchoLexer(s)
            while True:
                id_, tok_val = lex.Next()
                if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                    break

                p = word_compile.EvalCStringToken(id_, tok_val)

                # Unusual behavior: '\c' aborts processing!
                if p is None:
                    pr.backslash_c = True
                    break

                c_parts.append(p)
            s = ''.join(c_parts)

        elif part.type.id == Id.Format_Time or typ in 'diouxX':
            # %(...)T and %d share this complex integer conversion logic

            if match.LooksLikeInteger(s):
                # Note: spaces like ' -42 ' accepted and normalized
                ok, d = mops.FromStr2(s)
                if not ok:
                    e_die("Integer too big: %s" % s, word_loc)

            else:
                # Check for 'a and "a
                # These are interpreted as the numeric ASCII value of 'a'
                num_bytes = len(s)
                if num_bytes > 0 and s[0] in '\'"':
                    if num_bytes == 1:
                        # NUL after quote
                        d = mops.ZERO
                    elif num_bytes == 2:
                        # Allow invalid UTF-8, because all shells do
                        d = mops.IntWiden(ord(s[1]))
                    else:
                        try:
                            small_i = string_ops.DecodeUtf8Char(s, 1)
                        except error.Expr as e:
                            # Take the numeric value of first char, ignoring
                            # the rest of the bytes.
                            # Something like strict_arith or strict_printf
                            # could throw an error in this case.
                            self.errfmt.Print_(
                                'Warning: %s' % e.UserErrorString(), word_loc)
                            small_i = ord(s[1])

                        d = mops.IntWiden(small_i)

                # No argument means -1 for %(...)T as in Bash Reference Manual
                # 4.2 - "If no argument is specified, conversion behaves as if
                # -1 had been given."
                elif not has_arg and part.type.id == Id.Format_Time:
                    d = mops.MINUS_ONE

                else:
                    if has_arg:
                        blame_loc = word_loc  # type: loc_t
                    else:
                        blame_loc = part.type
                    self.errfmt.Print_(
                        'printf expected an integer, got %r' % s, blame_loc)
                    pr.status = 1
                    return None

            if part.type.id == Id.Format_Time:
                # Initialize timezone:
                #   `localtime' uses the current timezone information
                #   initialized by `tzset'.  The function `tzset' refers to
                #   the environment variable `TZ'.  When the exported
                #   variable `TZ' is present, its value should be reflected
                #   in the real environment variable `TZ' before call of
                #   `tzset'.
                #
                # Note: unlike LANG, TZ doesn't seem to change behavior if
                # it's not exported.
                #
                # TODO: In YSH, provide an API that doesn't rely on libc's
                # global state.

                tzcell = self.mem.GetCell('TZ')
                if (tzcell and tzcell.exported and
                        tzcell.val.tag() == value_e.Str):
                    tzval = cast(value.Str, tzcell.val)
                    posix.putenv('TZ', tzval.s)

                time_.tzset()

                # Handle special values:
                #   User can specify two special values -1 and -2 as in Bash
                #   Reference Manual 4.2: "Two special argument values may be
                #   used: -1 represents the current time, and -2 represents
                #   the time the shell was invoked." from
                #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                if mops.Equal(d, mops.MINUS_ONE):  # -1 is current time
                    # TODO: 2038 problem
                    ts = time_.time()
                elif mops.Equal(d, mops.MINUS_TWO):  # -2 is shell start time
                    ts = self.shell_start_time
                else:
                    ts = mops.BigTruncate(d)

                # typ looks like '(FORMAT)T'; strip the '(' and the ')T'
                s = time_.strftime(typ[1:-2], time_.localtime(ts))
                if precision >= 0:
                    s = s[:precision]  # truncate

            else:  # typ in 'diouxX'
                # Disallowed because it depends on 32- or 64- bit
                if mops.Greater(mops.ZERO, d) and typ in 'ouxX':
                    # TODO: Don't truncate it
                    e_die(
                        "Can't format negative number with %%%s: %d" %
                        (typ, mops.BigTruncate(d)), part.type)

                if typ == 'o':
                    s = mops.ToOctal(d)
                elif typ == 'x':
                    s = mops.ToHexLower(d)
                elif typ == 'X':
                    s = mops.ToHexUpper(d)
                else:  # diu
                    s = mops.ToStr(d)  # without spaces like ' -42 '

                # There are TWO different ways to ZERO PAD, and they differ on
                # the negative sign!  See spec/builtin-printf
                #
                # They follow the rules of C printf:
                #
                # - A precision is the minimum number of DIGITS, so the sign
                #   is not counted:
                #     [%6.6d] -42 becomes [-000042]  (1 for '-' + 6)
                #     [%.3d]  -42 becomes [-042]
                # - The '0' flag pads up to the field width, which does
                #   include the sign:
                #     [%06d]  -42 becomes [-00042]   (6 TOTAL)
                # - The '0' flag is ignored when the '-' flag or a precision
                #   is also given:
                #     [%-05d]  42 becomes [42   ]
                #     [%05.3d] 42 becomes [  042]
                if s.startswith('-'):
                    sign = '-'
                    digits = s[1:]
                else:
                    sign = ''
                    digits = s

                if precision >= 0:
                    if len(digits) < precision:
                        s = sign + digits.rjust(precision, '0')
                elif width >= 0 and '0' in flags and '-' not in flags:
                    s = sign + digits.rjust(width - len(sign), '0')

        else:
            raise AssertionError()

        # Pad with spaces up to the field width; '-' means left-justify
        if width >= 0:
            if '-' in flags:
                s = s.ljust(width, ' ')
            else:
                s = s.rjust(width, ' ')
        return s

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Appends the formatted pieces to 'out', applying the parsed format
        repeatedly until all arguments are consumed ('arg recycling').

        Returns:
          0 on success, or the non-zero pr.status set by _Percent() on the
          first failing conversion.  Pieces appended before the failure stay
          in 'out'.
        """
        pr = _PrintfState()
        num_args = len(varargs)

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))

                elif part.tag() == printf_part_e.Percent:
                    part = cast(printf_part.Percent, UP_part)

                    s = self._Percent(pr, part, varargs, locs)
                    if pr.status != 0:
                        return pr.status

                else:
                    raise AssertionError()

                out.append(s)

                if pr.backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if pr.backslash_c:
                # \c also discards the remaining arguments, so the format
                # must not be recycled for them.  Without this check, the
                # 'xx' in the example above would still be printed.
                break

            if pr.arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if pr.arg_index >= num_args:
                # We printed all args
                break
            # If there are more args, keep going.  This implement 'arg
            # recycling' behavior
            #   printf '%s ' 1 2 3 => 1 2 3

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns:
          0 on success, 2 if the format string fails to parse, or the
          non-zero status from _Format() when an argument can't be formatted.
          Nothing is written or assigned on failure.
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named 'lx' so that it doesn't shadow the 'lexer' module.
            lx = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(lx)

            with alloc.ctx_SourceCode(arena,
                                      source.Dynamic('printf arg', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            self.parse_cache[fmt] = parts

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0