OILS / builtin / printf_osh.py View on Github | oilshell.org

547 lines, 363 significant
1#!/usr/bin/env python2
2"""Builtin_printf.py."""
3from __future__ import print_function
4
5import time as time_ # avoid name conflict
6
7from _devbuild.gen import arg_types
8from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
9from _devbuild.gen.runtime_asdl import cmd_value
10from _devbuild.gen.syntax_asdl import (
11 loc,
12 loc_e,
13 loc_t,
14 source,
15 Token,
16 CompoundWord,
17 printf_part,
18 printf_part_e,
19 printf_part_t,
20)
21from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
22from _devbuild.gen.value_asdl import (value, value_e)
23
24from core import alloc
25from core import error
26from core.error import e_die, p_die
27from core import state
28from core import vm
29from frontend import flag_util
30from frontend import consts
31from frontend import lexer
32from frontend import match
33from frontend import reader
34from mycpp import mops
35from mycpp import mylib
36from mycpp.mylib import log
37from osh import sh_expr_eval
38from osh import string_ops
39from osh import word_compile
40from data_lang import j8_lite
41
42import posix_ as posix
43
44from typing import Dict, List, Optional, TYPE_CHECKING, cast
45
46if TYPE_CHECKING:
47 from display import ui
48 from frontend import parse_lib
49
50_ = log
51
52
class _FormatStringParser(object):
    """Parser that turns a printf format string into a list of parts.

    Grammar:

      width         = Num | Star
      precision     = Dot (Num | Star | Zero)?
      fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
      part          = Char_* | Format_EscapedPercent | fmt
      printf_format = part* Eof_Real   # we're using the main lexer
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        self.lexer = lexer

        # Placeholders; the real values are filled in by the first _Next()
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Read one token in lex_mode and record its id and kind."""
        tok = self.lexer.Read(lex_mode)
        self.cur_token = tok
        self.token_type = tok.id
        self.token_kind = consts.GetKind(tok.id)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """fmt = Percent (Flag | Zero)* width? precision? (Type | Time)

        Called with the current token on Format_Percent.  On return the
        current token is the type token; the caller advances past it.
        """
        self._Next(lex_mode_e.PrintfPercent)  # step over the %

        percent = printf_part.Percent.CreateNull(alloc_lists=True)

        # (Flag | Zero)*
        while (self.token_type == Id.Format_Flag or
               self.token_type == Id.Format_Zero):
            # space and + could be implemented
            flag_str = lexer.TokenVal(
                self.cur_token)  # allocation will be cached
            if flag_str in '# +':
                p_die("osh printf doesn't support the %r flag" % flag_str,
                      self.cur_token)

            percent.flags.append(self.cur_token)
            self._Next(lex_mode_e.PrintfPercent)

        # width?
        if (self.token_type == Id.Format_Num or
                self.token_type == Id.Format_Star):
            percent.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        # precision?
        if self.token_type == Id.Format_Dot:
            # The dot token itself stands for the precision unless a
            # Num / Star / Zero token follows and replaces it.
            percent.precision = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                percent.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)

        # (Type | Time)
        if self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        elif self.token_type in (Id.Format_Type, Id.Format_Time):
            percent.type = self.cur_token

            # Extra validation that the grammar itself doesn't express.
            conv = lexer.TokenVal(percent.type)  # allocation will be cached
            if conv in 'eEfFgG':
                p_die("osh printf doesn't support floating point",
                      percent.type)
            # These two could be implemented.  %c needs utf-8 decoding.
            if conv == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      percent.type)

        else:
            p_die('Expected a printf format character', self.cur_token)

        return percent

    def Parse(self):
        # type: () -> List[printf_part_t]
        """printf_format = part* Eof_Real

        Returns the parts in order: literal tokens are appended as-is, and
        each % conversion becomes a printf_part.Percent.
        """
        result = []  # type: List[printf_part_t]
        self._Next(lex_mode_e.PrintfOuter)

        while True:
            tok_id = self.token_type

            # Note: like in echo -e, we don't fail with Unknown_Backslash
            # here when shopt -u parse_backslash because it's at runtime
            # rather than parse time.
            # Users should use $'' or the future static printf ${x %.3f}.
            is_literal = (self.token_kind == Kind.Lit or
                          self.token_kind == Kind.Char or
                          tok_id == Id.Format_EscapedPercent or
                          tok_id == Id.Unknown_Backslash)

            if is_literal:
                result.append(self.cur_token)

            elif tok_id == Id.Format_Percent:
                result.append(self._ParseFormatStr())

            elif tok_id == Id.Eof_Real or tok_id == Id.Eol_Tok:
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(Id_str(tok_id))

            self._Next(lex_mode_e.PrintfOuter)

        return result
158
159
160class _PrintfState(object):
161
162 def __init__(self):
163 # type: () -> None
164 self.arg_index = 0
165 self.backslash_c = False
166 self.status = 0 # set to 1 before returning
167
168
class Printf(vm._Builtin):
    """The printf builtin: printf [-v var] format [argument ...]

    Parsed format strings are cached per instance, keyed by the format
    string itself.
    """

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # this object initialized in main()
        self.shell_start_time = time_.time()

    def _Percent(
            self,
            pr,  # type: _PrintfState
            part,  # type: printf_part.Percent
            varargs,  # type: List[str]
            locs,  # type: List[CompoundWord]
    ):
        # type: (...) -> Optional[str]
        """Format one % conversion, consuming arguments as needed.

        Args:
          pr: per-invocation state; arg_index is advanced for every argument
            consumed ('*' width, '*' precision, and the value itself), and
            backslash_c is set when a %b argument contains \\c.
          part: the parsed conversion (flags, width, precision, type).
          varargs: the arguments that follow the format string.
          locs: locations parallel to varargs, used for error messages.

        Returns:
          The formatted string, or None after printing an error and setting
          pr.status to 1.
        """
        num_args = len(varargs)

        # TODO: Cache this?
        flags = []  # type: List[str]
        for flag_token in part.flags:
            flags.append(lexer.TokenVal(flag_token))

        width = -1  # nonexistent
        if part.width:
            if part.width.id in (Id.Format_Num, Id.Format_Zero):
                width_str = lexer.TokenVal(part.width)
                width_loc = part.width  # type: loc_t
            elif part.width.id == Id.Format_Star:  # depends on data
                if pr.arg_index < num_args:
                    width_str = varargs[pr.arg_index]
                    width_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    width_str = ''  # invalid
                    width_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                width = int(width_str)
            except ValueError:
                if width_loc.tag() == loc_e.Missing:
                    width_loc = part.width
                self.errfmt.Print_("printf got invalid width %r" % width_str,
                                   blame_loc=width_loc)
                pr.status = 1
                return None

            if width < 0:
                # As in C printf, a negative width (which can only arrive
                # through '*') means the '-' flag plus the absolute width.
                # Without this it would collide with the -1 sentinel and be
                # silently ignored.
                flags.append('-')
                width = -width

        # A negative precision is treated as if no precision were given,
        # because every use below tests precision >= 0 or precision > 0.
        precision = -1  # nonexistent
        if part.precision:
            if part.precision.id == Id.Format_Dot:
                # A dot with nothing after it
                precision_str = '0'
                precision_loc = part.precision  # type: loc_t
            elif part.precision.id in (Id.Format_Num, Id.Format_Zero):
                precision_str = lexer.TokenVal(part.precision)
                precision_loc = part.precision
            elif part.precision.id == Id.Format_Star:
                if pr.arg_index < num_args:
                    precision_str = varargs[pr.arg_index]
                    precision_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    precision_str = ''
                    precision_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                precision = int(precision_str)
            except ValueError:
                if precision_loc.tag() == loc_e.Missing:
                    precision_loc = part.precision
                self.errfmt.Print_('printf got invalid precision %r' %
                                   precision_str,
                                   blame_loc=precision_loc)
                pr.status = 1
                return None

        # The value to format; a missing argument behaves like ''
        if pr.arg_index < num_args:
            s = varargs[pr.arg_index]
            word_loc = locs[pr.arg_index]  # type: loc_t
            pr.arg_index += 1
            has_arg = True
        else:
            s = ''
            word_loc = loc.Missing
            has_arg = False

        # Note: %s could be lexed into Id.Percent_S.  Although small string
        # optimization would remove the allocation as well.
        typ = lexer.TokenVal(part.type)
        if typ == 's':
            if precision >= 0:
                s = s[:precision]  # truncate

        elif typ == 'q':
            # Most shells give \' for single quote, while OSH gives
            # $'\'' this could matter when SSH'ing.
            # Ditto for $'\\' vs. '\'

            s = j8_lite.MaybeShellEncode(s)

        elif typ == 'b':
            # Process just like echo -e, except \c handling is simpler.

            c_parts = []  # type: List[str]
            lex = match.EchoLexer(s)
            while True:
                id_, tok_val = lex.Next()
                if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                    break

                p = word_compile.EvalCStringToken(id_, tok_val)

                # Unusual behavior: '\c' aborts processing!
                if p is None:
                    pr.backslash_c = True
                    break

                c_parts.append(p)
            s = ''.join(c_parts)

        elif part.type.id == Id.Format_Time or typ in 'diouxX':
            # %(...)T and %d share this complex integer conversion logic

            if match.LooksLikeInteger(s):
                # Note: spaces like ' -42 ' accepted and normalized
                d = mops.FromStr(s)

            else:
                # Check for 'a and "a
                # These are interpreted as the numeric ASCII value of 'a'
                num_bytes = len(s)
                if num_bytes > 0 and s[0] in '\'"':
                    if num_bytes == 1:
                        # NUL after quote
                        d = mops.ZERO
                    elif num_bytes == 2:
                        # Allow invalid UTF-8, because all shells do
                        d = mops.IntWiden(ord(s[1]))
                    else:
                        try:
                            small_i = string_ops.DecodeUtf8Char(s, 1)
                        except error.Expr as e:
                            # Take the numeric value of first char, ignoring
                            # the rest of the bytes.
                            # Something like strict_arith or strict_printf
                            # could throw an error in this case.
                            self.errfmt.Print_(
                                'Warning: %s' % e.UserErrorString(), word_loc)
                            small_i = ord(s[1])

                        d = mops.IntWiden(small_i)

                # No argument means -1 for %(...)T as in Bash Reference Manual
                # 4.2 - "If no argument is specified, conversion behaves as if
                # -1 had been given."
                elif not has_arg and part.type.id == Id.Format_Time:
                    d = mops.MINUS_ONE

                else:
                    if has_arg:
                        blame_loc = word_loc  # type: loc_t
                    else:
                        blame_loc = part.type
                    self.errfmt.Print_(
                        'printf expected an integer, got %r' % s, blame_loc)
                    pr.status = 1
                    return None

            if part.type.id == Id.Format_Time:
                # Initialize timezone:
                #   `localtime' uses the current timezone information
                #   initialized by `tzset'.  The function `tzset' refers to
                #   the environment variable `TZ'.  When the exported
                #   variable `TZ' is present, its value should be reflected
                #   in the real environment variable `TZ' before call of
                #   `tzset'.
                #
                # Note: unlike LANG, TZ doesn't seem to change behavior if
                # it's not exported.
                #
                # TODO: In YSH, provide an API that doesn't rely on libc's
                # global state.

                tzcell = self.mem.GetCell('TZ')
                if (tzcell and tzcell.exported and
                        tzcell.val.tag() == value_e.Str):
                    tzval = cast(value.Str, tzcell.val)
                    posix.putenv('TZ', tzval.s)

                time_.tzset()

                # Handle special values:
                #   User can specify two special values -1 and -2 as in Bash
                #   Reference Manual 4.2: "Two special argument values may be
                #   used: -1 represents the current time, and -2 represents
                #   the time the shell was invoked." from
                #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                if mops.Equal(d, mops.MINUS_ONE):  # -1 is current time
                    # TODO: 2038 problem
                    ts = time_.time()
                elif mops.Equal(d, mops.MINUS_TWO):  # -2 is shell start time
                    ts = self.shell_start_time
                else:
                    ts = mops.BigTruncate(d)

                # typ looks like '(FORMAT)T'; strip the '(' and the ')T'
                s = time_.strftime(typ[1:-2], time_.localtime(ts))
                if precision >= 0:
                    s = s[:precision]  # truncate

            else:  # typ in 'diouxX'
                # Disallowed because it depends on 32- or 64- bit
                if mops.Greater(mops.ZERO, d) and typ in 'ouxX':
                    # TODO: Don't truncate it
                    e_die(
                        "Can't format negative number with %%%s: %d" %
                        (typ, mops.BigTruncate(d)), part.type)

                if typ == 'o':
                    s = mops.ToOctal(d)
                elif typ == 'x':
                    s = mops.ToHexLower(d)
                elif typ == 'X':
                    s = mops.ToHexUpper(d)
                else:  # diu
                    s = mops.ToStr(d)  # without spaces like ' -42 '

                # Split off the sign so that padding is computed on the
                # digits only.
                if s.startswith('-'):
                    sign = '-'
                    digits = s[1:]
                else:
                    sign = ''
                    digits = s

                # There are TWO different ways to ZERO PAD, and they differ
                # on the negative sign!  See spec/builtin-printf

                zero_pad = 0  # no zero padding
                if width >= 0 and '0' in flags and '-' not in flags:
                    # As in C printf, the '-' flag overrides the '0' flag:
                    # [%-05d] 42 is [42   ], not [00042]
                    zero_pad = 1  # style 1
                elif precision > 0 and len(digits) < precision:
                    # Precision is a minimum number of DIGITS, so the sign
                    # doesn't count: [%.3d] -42 is [-042]
                    zero_pad = 2  # style 2

                if zero_pad == 1:
                    # [%06d] -42 becomes [-00042] (6 TOTAL)
                    s = sign + digits.rjust(width - len(sign), '0')
                elif zero_pad == 2:
                    # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                    s = sign + digits.rjust(precision, '0')

        else:
            raise AssertionError()

        if width >= 0:
            if '-' in flags:
                s = s.ljust(width, ' ')
            else:
                s = s.rjust(width, ' ')
        return s

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Apply the parsed format to the arguments, appending to out.

        The format is re-applied until all arguments are consumed ('arg
        recycling').

        Returns:
          0 on success, or the nonzero status set by a failed conversion.
        """
        pr = _PrintfState()
        num_args = len(varargs)

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))

                elif part.tag() == printf_part_e.Percent:
                    part = cast(printf_part.Percent, UP_part)

                    s = self._Percent(pr, part, varargs, locs)
                    if pr.status != 0:
                        return pr.status

                else:
                    raise AssertionError()

                out.append(s)

                if pr.backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if pr.backslash_c:
                # The break above only leaves the loop over parts.  Stop
                # recycling too, or the remaining args (xx in the example
                # above) would still be printed.
                break

            if pr.arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if pr.arg_index >= num_args:
                # We printed all args
                break
            # If there are more args, keep going.  This implement 'arg recycling'
            # behavior
            #    printf '%s ' 1 2 3 => 1 2 3

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Returns 0 on success, 2 if the format string fails to parse, or the
        nonzero status from a failed conversion.  With -v, the result is
        assigned to the named variable instead of being written to stdout.
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Not named 'lexer', to avoid shadowing the frontend.lexer module
            fmt_lexer = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(fmt_lexer)

            with alloc.ctx_SourceCode(arena,
                                      source.Dynamic('printf arg', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            self.parse_cache[fmt] = parts

        if 0:
            print()
            for part in parts:
                part.PrettyPrint()
                print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0