OILS / builtin / printf_osh.py View on Github | oilshell.org

552 lines, 369 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3
4import time as time_ # avoid name conflict
5
6from _devbuild.gen import arg_types
7from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind, Kind_t
8from _devbuild.gen.runtime_asdl import cmd_value
9from _devbuild.gen.syntax_asdl import (
10 loc,
11 loc_e,
12 loc_t,
13 source,
14 Token,
15 CompoundWord,
16 printf_part,
17 printf_part_e,
18 printf_part_t,
19)
20from _devbuild.gen.types_asdl import lex_mode_e, lex_mode_t
21from _devbuild.gen.value_asdl import (value, value_e)
22
23from core import alloc
24from core import error
25from core.error import p_die
26from core import state
27from core import vm
28from frontend import flag_util
29from frontend import consts
30from frontend import lexer
31from frontend import match
32from frontend import reader
33from mycpp import mops
34from mycpp import mylib
35from mycpp.mylib import log
36from osh import sh_expr_eval
37from osh import string_ops
38from osh import word_compile
39from data_lang import j8_lite
40
41import posix_ as posix
42
43from typing import Dict, List, Optional, TYPE_CHECKING, cast
44
45if TYPE_CHECKING:
46 from display import ui
47 from frontend import parse_lib
48
49_ = log
50
51
class _FormatStringParser(object):
    """Parse a printf format string into a list of printf_part_t.

    Grammar:

        width         = Num | Star
        precision     = Dot (Num | Star | Zero)?
        fmt           = Percent (Flag | Zero)* width? precision? (Type | Time)
        part          = Char_* | Format_EscapedPercent | fmt
        printf_format = part* Eof_Real   # we're using the main lexer

    'Time' is the Id.Format_Time token, i.e. the %(strftime)T conversion.
    """

    def __init__(self, lexer):
        # type: (lexer.Lexer) -> None
        """
        Args:
          lexer: token source; it is read in the PrintfOuter and PrintfPercent
                 lex modes.
        """
        self.lexer = lexer

        # uninitialized values; filled in by the first call to _Next()
        self.cur_token = None  # type: Token
        self.token_type = Id.Undefined_Tok  # type: Id_t
        self.token_kind = Kind.Undefined  # type: Kind_t

    def _Next(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Advance a token, updating cur_token, token_type and token_kind."""
        self.cur_token = self.lexer.Read(lex_mode)
        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

    def _ParseFormatStr(self):
        # type: () -> printf_part_t
        """fmt = Percent (Flag | Zero)* width? precision? (Type | Time)

        Called with the current token on the '%'.  On return, the current
        token is the conversion type token, which the caller steps past.

        Unsupported flags/conversions and malformed formats are reported with
        p_die(), blaming the offending token.
        """
        self._Next(lex_mode_e.PrintfPercent)  # move past %

        part = printf_part.Percent.CreateNull(alloc_lists=True)
        while self.token_type in (Id.Format_Flag, Id.Format_Zero):
            # space and + could be implemented
            flag = lexer.TokenVal(self.cur_token)  # allocation will be cached
            if flag in '# +':
                p_die("osh printf doesn't support the %r flag" % flag,
                      self.cur_token)

            part.flags.append(self.cur_token)
            self._Next(lex_mode_e.PrintfPercent)

        if self.token_type in (Id.Format_Num, Id.Format_Star):
            part.width = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)

        if self.token_type == Id.Format_Dot:
            # A bare '.' is recorded as the precision; it's replaced below if
            # a number or '*' follows.
            part.precision = self.cur_token
            self._Next(lex_mode_e.PrintfPercent)  # past dot
            if self.token_type in (Id.Format_Num, Id.Format_Star,
                                   Id.Format_Zero):
                part.precision = self.cur_token
                self._Next(lex_mode_e.PrintfPercent)

        if self.token_type in (Id.Format_Type, Id.Format_Time):
            part.type = self.cur_token

            # ADDITIONAL VALIDATION outside the "grammar".
            type_val = lexer.TokenVal(part.type)  # allocation will be cached
            if type_val in 'eEfFgG':
                p_die("osh printf doesn't support floating point", part.type)
            # These two could be implemented.  %c needs utf-8 decoding.
            if type_val == 'c':
                p_die("osh printf doesn't support single characters (bytes)",
                      part.type)

        elif self.token_type == Id.Unknown_Tok:
            p_die('Invalid printf format character', self.cur_token)

        else:
            p_die('Expected a printf format character', self.cur_token)

        return part

    def Parse(self):
        # type: () -> List[printf_part_t]
        """printf_format = part* Eof_Real

        Returns:
          The parts in order: literal/escape tokens are appended as-is, and
          each '%...' directive becomes the result of _ParseFormatStr().
        """
        self._Next(lex_mode_e.PrintfOuter)
        parts = []  # type: List[printf_part_t]
        while True:
            if (self.token_kind in (Kind.Lit, Kind.Char) or self.token_type
                    in (Id.Format_EscapedPercent, Id.Unknown_Backslash)):

                # Note: like in echo -e, we don't fail with Unknown_Backslash here
                # when shopt -u parse_backslash because it's at runtime rather than
                # parse time.
                # Users should use $'' or the future static printf ${x %.3f}.

                parts.append(self.cur_token)

            elif self.token_type == Id.Format_Percent:
                parts.append(self._ParseFormatStr())

            elif self.token_type in (Id.Eof_Real, Id.Eol_Tok):
                # Id.Eol_Tok: special case for format string of '\x00'.
                break

            else:
                raise AssertionError(Id_str(self.token_type))

            self._Next(lex_mode_e.PrintfOuter)

        return parts
157
158
class _PrintfState(object):
    """Mutable state shared across the directives of one printf invocation."""

    def __init__(self):
        # type: () -> None

        # Index of the next argument to consume.
        self.arg_index = 0
        # Set to True when a %b argument contained \c.
        self.backslash_c = False
        # Exit status of the formatting so far.
        self.status = 0  # set to 1 before returning
166
167
class Printf(vm._Builtin):
    """The 'printf' builtin: printf [-v var] format [argument ...]"""

    def __init__(
            self,
            mem,  # type: state.Mem
            parse_ctx,  # type: parse_lib.ParseContext
            unsafe_arith,  # type: sh_expr_eval.UnsafeArith
            errfmt,  # type: ui.ErrorFormatter
    ):
        # type: (...) -> None
        """
        Args:
          mem: shell memory; read for TZ, and written by 'printf -v'
          parse_ctx: supplies the arena and the lexer for format strings
          unsafe_arith: parses the 'printf -v' destination into an lvalue
          errfmt: used to print parse errors and runtime errors
        """
        self.mem = mem
        self.parse_ctx = parse_ctx
        self.unsafe_arith = unsafe_arith
        self.errfmt = errfmt
        # Format string -> parsed parts, so each distinct format is parsed once.
        self.parse_cache = {}  # type: Dict[str, List[printf_part_t]]

        # this object initialized in main()
        self.shell_start_time = time_.time()

    def _Percent(
            self,
            pr,  # type: _PrintfState
            part,  # type: printf_part.Percent
            varargs,  # type: List[str]
            locs,  # type: List[CompoundWord]
    ):
        # type: (...) -> Optional[str]
        """Format a single %-directive.

        Consumes arguments from varargs starting at pr.arg_index: one for a
        '*' width, one for a '*' precision, and one for the value itself
        (when available).

        Args:
          pr: per-invocation state; arg_index is advanced, backslash_c is set
              when a %b argument contains \\c, and status is set to 1 on error
          part: the parsed directive
          varargs: the arguments after the format string
          locs: locations parallel to varargs, used to blame errors

        Returns:
          The formatted string, or None after printing an error (in which
          case pr.status is 1).
        """
        num_args = len(varargs)

        # TODO: Cache this?
        flags = []  # type: List[str]
        if len(part.flags) > 0:
            for flag_token in part.flags:
                flags.append(lexer.TokenVal(flag_token))

        width = -1  # nonexistent
        if part.width:
            if part.width.id in (Id.Format_Num, Id.Format_Zero):
                width_str = lexer.TokenVal(part.width)
                width_loc = part.width  # type: loc_t
            elif part.width.id == Id.Format_Star:  # depends on data
                if pr.arg_index < num_args:
                    width_str = varargs[pr.arg_index]
                    width_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    width_str = ''  # invalid
                    width_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                width = int(width_str)
            except ValueError:
                # No argument to blame, so blame the '*' in the format.
                if width_loc.tag() == loc_e.Missing:
                    width_loc = part.width
                self.errfmt.Print_("printf got invalid width %r" % width_str,
                                   blame_loc=width_loc)
                pr.status = 1
                return None

        precision = -1  # nonexistent
        if part.precision:
            if part.precision.id == Id.Format_Dot:
                # A bare '.' means precision 0
                precision_str = '0'
                precision_loc = part.precision  # type: loc_t
            elif part.precision.id in (Id.Format_Num, Id.Format_Zero):
                precision_str = lexer.TokenVal(part.precision)
                precision_loc = part.precision
            elif part.precision.id == Id.Format_Star:
                if pr.arg_index < num_args:
                    precision_str = varargs[pr.arg_index]
                    precision_loc = locs[pr.arg_index]
                    pr.arg_index += 1
                else:
                    precision_str = ''
                    precision_loc = loc.Missing
            else:
                raise AssertionError()

            try:
                precision = int(precision_str)
            except ValueError:
                if precision_loc.tag() == loc_e.Missing:
                    precision_loc = part.precision
                self.errfmt.Print_('printf got invalid precision %r' %
                                   precision_str,
                                   blame_loc=precision_loc)
                pr.status = 1
                return None

        # The value to format.  A missing argument is treated as ''.
        if pr.arg_index < num_args:
            s = varargs[pr.arg_index]
            word_loc = locs[pr.arg_index]  # type: loc_t
            pr.arg_index += 1
            has_arg = True
        else:
            s = ''
            word_loc = loc.Missing
            has_arg = False

        # Note: %s could be lexed into Id.Percent_S.  Although small string
        # optimization would remove the allocation as well.
        typ = lexer.TokenVal(part.type)
        if typ == 's':
            if precision >= 0:
                s = s[:precision]  # truncate

        elif typ == 'q':
            # Most shells give \' for single quote, while OSH gives
            # $'\'' this could matter when SSH'ing.
            # Ditto for $'\\' vs. '\'

            s = j8_lite.MaybeShellEncode(s)

        elif typ == 'b':
            # Process just like echo -e, except \c handling is simpler.

            c_parts = []  # type: List[str]
            lex = match.EchoLexer(s)
            while True:
                id_, tok_val = lex.Next()
                if id_ == Id.Eol_Tok:  # Note: This is really a NUL terminator
                    break

                p = word_compile.EvalCStringToken(id_, tok_val)

                # Unusual behavior: '\c' aborts processing!
                if p is None:
                    pr.backslash_c = True
                    break

                c_parts.append(p)
            s = ''.join(c_parts)

        elif part.type.id == Id.Format_Time or typ in 'diouxX':
            # %(...)T and %d share this complex integer conversion logic

            if match.LooksLikeInteger(s):
                # Note: spaces like ' -42 ' accepted and normalized
                ok, d = mops.FromStr2(s)
                if not ok:
                    self.errfmt.Print_("Integer too big: %s" % s, word_loc)
                    pr.status = 1
                    return None

            else:
                # Check for 'a and "a
                # These are interpreted as the numeric ASCII value of 'a'
                num_bytes = len(s)
                if num_bytes > 0 and s[0] in '\'"':
                    if num_bytes == 1:
                        # NUL after quote
                        d = mops.ZERO
                    elif num_bytes == 2:
                        # Allow invalid UTF-8, because all shells do
                        d = mops.IntWiden(ord(s[1]))
                    else:
                        try:
                            small_i = string_ops.DecodeUtf8Char(s, 1)
                        except error.Expr as e:
                            # Take the numeric value of first char, ignoring
                            # the rest of the bytes.
                            # Something like strict_arith or strict_printf
                            # could throw an error in this case.
                            self.errfmt.Print_(
                                'Warning: %s' % e.UserErrorString(), word_loc)
                            small_i = ord(s[1])

                        d = mops.IntWiden(small_i)

                # No argument means -1 for %(...)T as in Bash Reference Manual
                # 4.2 - "If no argument is specified, conversion behaves as if
                # -1 had been given."
                elif not has_arg and part.type.id == Id.Format_Time:
                    d = mops.MINUS_ONE

                else:
                    if has_arg:
                        blame_loc = word_loc  # type: loc_t
                    else:
                        blame_loc = part.type
                    self.errfmt.Print_(
                        'printf expected an integer, got %r' % s, blame_loc)
                    pr.status = 1
                    return None

            if part.type.id == Id.Format_Time:
                # Initialize timezone:
                #   `localtime' uses the current timezone information initialized
                #   by `tzset'.  The function `tzset' refers to the environment
                #   variable `TZ'.  When the exported variable `TZ' is present,
                #   its value should be reflected in the real environment
                #   variable `TZ' before call of `tzset'.
                #
                # Note: unlike LANG, TZ doesn't seem to change behavior if it's
                # not exported.
                #
                # TODO: In YSH, provide an API that doesn't rely on libc's global
                # state.

                tzcell = self.mem.GetCell('TZ')
                if (tzcell and tzcell.exported and
                        tzcell.val.tag() == value_e.Str):
                    tzval = cast(value.Str, tzcell.val)
                    posix.putenv('TZ', tzval.s)

                time_.tzset()

                # Handle special values:
                #   User can specify two special values -1 and -2 as in Bash
                #   Reference Manual 4.2: "Two special argument values may be
                #   used: -1 represents the current time, and -2 represents the
                #   time the shell was invoked." from
                #   https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-printf
                if mops.Equal(d, mops.MINUS_ONE):  # -1 is current time
                    # TODO: 2038 problem
                    ts = time_.time()
                elif mops.Equal(d, mops.MINUS_TWO):  # -2 is shell start time
                    ts = self.shell_start_time
                else:
                    ts = mops.BigTruncate(d)

                # typ looks like '(FORMAT)T': drop the leading '(' and the
                # trailing ')T' to get the strftime format.
                s = time_.strftime(typ[1:-2], time_.localtime(ts))
                if precision >= 0:
                    s = s[:precision]  # truncate

            else:  # typ in 'diouxX'
                # Disallowed because it depends on 32- or 64- bit
                if mops.Greater(mops.ZERO, d) and typ in 'ouxX':
                    # TODO: Don't truncate it
                    self.errfmt.Print_(
                        "Can't format negative number with %%%s: %d" %
                        (typ, mops.BigTruncate(d)), part.type)
                    pr.status = 1
                    return None

                if typ == 'o':
                    s = mops.ToOctal(d)
                elif typ == 'x':
                    s = mops.ToHexLower(d)
                elif typ == 'X':
                    s = mops.ToHexUpper(d)
                else:  # diu
                    s = mops.ToStr(d)  # without spaces like ' -42 '

                # There are TWO different ways to ZERO PAD, and they differ on
                # the negative sign!  See spec/builtin-printf

                zero_pad = 0  # no zero padding
                if width >= 0 and '0' in flags:
                    zero_pad = 1  # style 1
                elif precision > 0 and len(s) < precision:
                    zero_pad = 2  # style 2

                if zero_pad:
                    negative = (s[0] == '-')
                    if negative:
                        digits = s[1:]
                        sign = '-'
                        if zero_pad == 1:
                            # [%06d] -42 becomes [-00042] (6 TOTAL)
                            n = width - 1
                        else:
                            # [%6.6d] -42 becomes [-000042] (1 for '-' + 6)
                            n = precision
                    else:
                        digits = s
                        sign = ''
                        if zero_pad == 1:
                            n = width
                        else:
                            n = precision
                    s = sign + digits.rjust(n, '0')

        else:
            raise AssertionError()

        # Space padding applies to every conversion; '-' means left-justify.
        if width >= 0:
            if '-' in flags:
                s = s.ljust(width, ' ')
            else:
                s = s.rjust(width, ' ')
        return s

    def _Format(self, parts, varargs, locs, out):
        # type: (List[printf_part_t], List[str], List[CompoundWord], List[str]) -> int
        """Hairy printf formatting logic.

        Applies the parsed format to the arguments, re-using the format until
        all arguments are consumed ('arg recycling').

        Args:
          parts: parsed format string
          varargs: the arguments after the format string
          locs: locations parallel to varargs
          out: formatted pieces are appended here

        Returns:
          0 on success, or the non-zero pr.status set by _Percent() on the
          first failing directive.
        """
        pr = _PrintfState()
        num_args = len(varargs)

        while True:  # loop over arguments
            for part in parts:  # loop over parsed format string
                UP_part = part
                if part.tag() == printf_part_e.Literal:
                    part = cast(Token, UP_part)
                    if part.id == Id.Format_EscapedPercent:
                        s = '%'
                    else:
                        s = word_compile.EvalCStringToken(
                            part.id, lexer.LazyStr(part))

                elif part.tag() == printf_part_e.Percent:
                    part = cast(printf_part.Percent, UP_part)

                    s = self._Percent(pr, part, varargs, locs)
                    if pr.status != 0:
                        return pr.status

                else:
                    raise AssertionError()

                out.append(s)

                if pr.backslash_c:  # 'printf %b a\cb xx' - \c terminates processing!
                    break

            if pr.backslash_c:
                # \c must also stop the recycling loop.  Otherwise the
                # remaining arguments ('xx' in the example above) would still
                # be formatted and printed.  POSIX: \c causes printf to ignore
                # any remaining string operands and format characters.
                break

            if pr.arg_index == 0:
                # We went through ALL parts and didn't consume ANY arg.
                # Example: print x y
                break
            if pr.arg_index >= num_args:
                # We printed all args
                break
            # If there are more args, keep going.  This implement 'arg recycling'
            # behavior
            #    printf '%s ' 1 2 3 => 1 2 3

        return 0

    def Run(self, cmd_val):
        # type: (cmd_value.Argv) -> int
        """
        printf: printf [-v var] format [argument ...]

        Parses the format string (cached per distinct string), formats the
        arguments, then either assigns the result to the -v variable or writes
        it to stdout.

        Returns:
          0 on success, 2 if the format string fails to parse, or the status
          from _Format() if formatting fails (nothing is written then).
        """
        attrs, arg_r = flag_util.ParseCmdVal('printf', cmd_val)
        arg = arg_types.printf(attrs.attrs)

        fmt, fmt_loc = arg_r.ReadRequired2('requires a format string')
        varargs, locs = arg_r.Rest2()

        #log('fmt %s', fmt)
        #log('vals %s', vals)

        arena = self.parse_ctx.arena
        if fmt in self.parse_cache:
            parts = self.parse_cache[fmt]
        else:
            line_reader = reader.StringLineReader(fmt, arena)
            # TODO: Make public
            # Named 'lx' so it doesn't shadow the imported 'lexer' module.
            lx = self.parse_ctx.MakeLexer(line_reader)
            parser = _FormatStringParser(lx)

            with alloc.ctx_SourceCode(arena,
                                      source.Dynamic('printf arg', fmt_loc)):
                try:
                    parts = parser.Parse()
                except error.Parse as e:
                    self.errfmt.PrettyPrintError(e)
                    return 2  # parse error

            # Only successfully parsed formats are cached.
            self.parse_cache[fmt] = parts

        if 0:
            print()
            for part in parts:
                part.PrettyPrint()
                print()

        out = []  # type: List[str]
        status = self._Format(parts, varargs, locs, out)
        if status != 0:
            return status  # failure

        result = ''.join(out)
        if arg.v is not None:
            # TODO: get the location for arg.v!
            v_loc = loc.Missing
            lval = self.unsafe_arith.ParseLValue(arg.v, v_loc)
            state.BuiltinSetValue(self.mem, lval, value.Str(result))
        else:
            mylib.Stdout().write(result)
        return 0