diff --git a/NEWS.rst b/NEWS.rst
index 7c66102..cccb8eb 100644
--- a/NEWS.rst
+++ b/NEWS.rst
@@ -1,5 +1,13 @@
 .. default-role:: code
 
+Unreleased
+==============================
+
+New Features
+------------------------------
+* Format strings with embedded Hy code (e.g., `f"The sum is {(+ x y)}"`)
+  are now supported, even on Pythons earlier than 3.6.
+
 0.16.0
 ==============================
 
diff --git a/docs/language/syntax.rst b/docs/language/syntax.rst
index 32ebec9..2414499 100644
--- a/docs/language/syntax.rst
+++ b/docs/language/syntax.rst
@@ -42,7 +42,8 @@ string literal called a "bracket string" similar to Lua's long brackets.
 Bracket strings have customizable delimiters, like the here-documents of other
 languages. A bracket string begins with ``#[FOO[`` and ends with ``]FOO]``,
 where ``FOO`` is any string not containing ``[`` or ``]``, including the empty
-string. For example::
+string. (If ``FOO`` is exactly ``f`` or begins with ``f-``, the bracket string
+is interpreted as a :ref:`format string <syntax-fstrings>`.) For example::
 
    => (print #[["That's very kind of yuo [sic]" Tom wrote back.]])
    "That's very kind of yuo [sic]" Tom wrote back.
@@ -69,6 +70,43 @@ of bytes. So when running under Python 3, Hy translates ``"foo"`` and
 
 Unlike Python, Hy only recognizes string prefixes (``r``, etc.) in lowercase.
 
+.. _syntax-fstrings:
+
+format strings
+--------------
+
+A format string (or "f-string", or "formatted string literal") is a string
+literal with embedded code, possibly accompanied by formatting commands. Hy
+f-strings work much like :ref:`Python f-strings <py:f-strings>` except that the
+embedded code is in Hy rather than Python, and they're supported on all
+versions of Python.
+
+::
+
+   => (print f"The sum is {(+ 1 1)}.")
+   The sum is 2.
+
+Since ``!`` and ``:`` are identifier characters in Hy, Hy decides where the
+code in a replacement field ends, and any conversion or format specifier
+begins, by parsing exactly one form. You can use ``do`` to combine several
+forms into one, as usual. Whitespace may be necessary to terminate the form::
+
+   => (setv foo "a")
+   => (print f"{foo:x<5}")
+   …
+   NameError: name 'hyx_fooXcolonXxXlessHthan_signX5' is not defined
+   => (print f"{foo :x<5}")
+   axxxx
+
+Unlike Python, whitespace is allowed between a conversion and a format
+specifier.
+
+Also unlike Python, comments and backslashes are allowed in replacement fields.
+Hy's lexer will still process the whole format string normally, like any other
+string, before any replacement fields are considered, so you may need to
+backslash your backslashes, and you can't comment out a closing brace or the
+string delimiter.
+
 .. _syntax-keywords:
 
 keywords
diff --git a/hy/compiler.py b/hy/compiler.py
index 08e0c98..8bd2aae 100755
--- a/hy/compiler.py
+++ b/hy/compiler.py
@@ -12,14 +12,15 @@ from funcparserlib.parser import some, many, oneplus, maybe, NoParseError
 
 from hy.errors import (HyCompileError, HyTypeError, HyLanguageError,
                        HySyntaxError, HyEvalError, HyInternalError)
-from hy.lex import mangle, unmangle
+from hy.lex import mangle, unmangle, hy_parse, parse_one_thing, LexException
 
 from hy._compat import (string_types, str_type, bytes_type, long_type, PY3,
-                        PY35, reraise)
+                        PY35, PY36, reraise)
 from hy.macros import require, load_macros, macroexpand, tag_macroexpand
 
 import hy.core
 
+import re
 import pkgutil
 import traceback
 import importlib
@@ -31,6 +32,7 @@ import copy
 import __future__
 
 from collections import defaultdict
+from functools import reduce
 
 if PY3:
     import builtins
@@ -629,8 +631,11 @@ class HyASTCompiler(object):
         elif isinstance(form, HyKeyword):
             body = [HyString(form.name)]
 
-        elif isinstance(form, HyString) and form.brackets is not None:
-            body.extend([HyKeyword("brackets"), form.brackets])
+        elif isinstance(form, HyString):
+            if form.is_format:
+                body.extend([HyKeyword("is_format"), form.is_format])
+            if form.brackets is not None:
+                body.extend([HyKeyword("brackets"), form.brackets])
 
         ret = HyExpression([HySymbol(name)] + body).replace(form)
         return imports, ret, False
@@ -1798,10 +1803,112 @@ class HyASTCompiler(object):
 
     @builds_model(HyString, HyBytes)
     def compile_string(self, string):
+        if type(string) is HyString and string.is_format:
+            # This is a format string (a.k.a. an f-string).
+            return self._format_string(string, str_type(string))
         node = asty.Bytes if PY3 and type(string) is HyBytes else asty.Str
         f = bytes_type if type(string) is HyBytes else str_type
         return node(string, s=f(string))
 
+    def _format_string(self, string, rest, allow_recursion=True):
+        values = []
+        ret = Result()
+
+        while True:
+            # Look for the next replacement field, and get the
+            # plain text before it.
+            match = re.search(r'\{\{?|\}\}?', rest)
+            if match:
+                literal_chars = rest[: match.start()]
+                if match.group() == '}':
+                    raise self._syntax_error(string,
+                        "f-string: single '}' is not allowed")
+                if match.group() in ('{{', '}}'):
+                    # Doubled braces just add a single brace to the text.
+                    literal_chars += match.group()[0]
+                rest = rest[match.end() :]
+            else:
+                literal_chars = rest
+                rest = ""
+            if literal_chars:
+                values.append(asty.Str(string, s = literal_chars))
+            if not rest:
+                break
+            if match.group() != '{':
+                continue
+
+            # Look for the end of the replacement field, allowing
+            # one more level of matched braces, but no deeper, and only
+            # if we can recurse.
+            match = re.match(
+                r'(?: \{ [^{}]* \} | [^{}]+ )* \}'
+                    if allow_recursion
+                    else r'[^{}]* \}',
+                rest, re.VERBOSE)
+            if not match:
+                raise self._syntax_error(string, 'f-string: mismatched braces')
+            item = rest[: match.end() - 1]
+            rest = rest[match.end() :]
+
+            # Parse the first form.
+            try:
+                model, item = parse_one_thing(item)
+            except (ValueError, LexException) as e:
+                raise self._syntax_error(string, "f-string: " + str_type(e))
+
+            # Look for a conversion character.
+            item = item.lstrip()
+            conversion = None
+            if item.startswith('!'):
+                conversion = item[1]
+                item = item[2:].lstrip()
+
+            # Look for a format specifier.
+            format_spec = asty.Str(string, s = "")
+            if item.startswith(':'):
+                if allow_recursion:
+                    ret += self._format_string(string,
+                        item[1:],
+                        allow_recursion=False)
+                    format_spec = ret.force_expr
+                else:
+                    format_spec = asty.Str(string, s=item[1:])
+            elif item:
+                raise self._syntax_error(string,
+                    "f-string: trailing junk in field")
+
+            # Now, having finished compiling any recursively included
+            # forms, we can compile the first form that we parsed.
+            ret += self.compile(model)
+
+            if PY36:
+                values.append(asty.FormattedValue(
+                    string,
+                    conversion = -1 if conversion is None else ord(conversion),
+                    format_spec = format_spec,
+                    value = ret.force_expr))
+            else:
+                # Make an expression like:
+                #    "{!r:{}}".format(value, format_spec)
+                values.append(asty.Call(string,
+                    func = asty.Attribute(
+                        string,
+                        value = asty.Str(string, s =
+                            '{' +
+                            ('!' + conversion if conversion else '') +
+                            ':{}}'),
+                        attr = 'format', ctx = ast.Load()),
+                    args = [ret.force_expr, format_spec],
+                    keywords = [], starargs = None, kwargs = None))
+
+        return ret + (
+            asty.JoinedStr(string, values = values)
+            if PY36
+            else reduce(
+                lambda x, y:
+                    asty.BinOp(string, left = x, op = ast.Add(), right = y),
+                values))
+
     @builds_model(HyList, HySet)
     def compile_list(self, expression):
         elts, ret, _ = self._compile_collect(expression)
diff --git a/hy/lex/__init__.py b/hy/lex/__init__.py
index eb3ac41..d133f5f 100644
--- a/hy/lex/__init__.py
+++ b/hy/lex/__init__.py
@@ -74,6 +74,33 @@ def tokenize(source, filename=None):
         raise e
 
 
+def parse_one_thing(src_string):
+    """Parse the first form from the string. Return it and the
+    remainder of the string."""
+    import re
+    from hy.lex.lexer import lexer
+    from hy.lex.parser import parser
+    from rply.errors import LexingError
+    tokens = []
+    err = None
+    for token in lexer.lex(src_string):
+        tokens.append(token)
+        try:
+            model, = parser.parse(
+                iter(tokens),
+                state=ParserState(src_string, filename=None))
+        except (LexingError, LexException) as e:
+            err = e
+        else:
+            return model, src_string[re.match(
+                r'.+\n' * (model.end_line - 1)
+                + '.' * model.end_column,
+                src_string).end():]
+    if err:
+        raise err
+    raise ValueError("No form found")
+
+
 mangle_delim = 'X'
 
 
diff --git a/hy/lex/lexer.py b/hy/lex/lexer.py
index 14b7c88..f202d94 100755
--- a/hy/lex/lexer.py
+++ b/hy/lex/lexer.py
@@ -38,7 +38,7 @@ lg.add('HASHOTHER', r'#%s' % identifier)
 # A regexp which matches incomplete strings, used to support
 # multi-line strings in the interpreter
 partial_string = r'''(?x)
-    (?:u|r|ur|ru|b|br|rb)? # prefix
+    (?:u|r|ur|ru|b|br|rb|f|fr|rf)? # prefix
     " # start string
     (?:
        | [^"\\]             # non-quote or backslash
diff --git a/hy/lex/parser.py b/hy/lex/parser.py
index c4df2a5..6a1acfb 100755
--- a/hy/lex/parser.py
+++ b/hy/lex/parser.py
@@ -31,8 +31,11 @@ def set_boundaries(fun):
             ret.end_line = end.lineno
             ret.end_column = end.colno
         else:
-            ret.end_line = start.lineno
-            ret.end_column = start.colno + len(p[0].value)
+            v = p[0].value
+            ret.end_line = start.lineno + v.count('\n')
+            ret.end_column = (len(v) - v.rindex('\n') - 1
+                              if '\n' in v
+                              else start.colno + len(v) - 1)
         return ret
     return wrapped
 
@@ -197,14 +200,22 @@ def t_empty_list(state, p):
 @pg.production("string : STRING")
 @set_boundaries
 def t_string(state, p):
+    s = p[0].value
+    # Detect and remove any "f" prefix.
+    is_format = False
+    if s.startswith('f') or s.startswith('rf'):
+        is_format = True
+        s = s.replace('f', '', 1)
     # Replace the single double quotes with triple double quotes to allow
     # embedded newlines.
     try:
-        s = eval(p[0].value.replace('"', '"""', 1)[:-1] + '"""')
+        s = eval(s.replace('"', '"""', 1)[:-1] + '"""')
     except SyntaxError:
         raise LexException.from_lexer("Can't convert {} to a HyString".format(p[0].value),
                                       state, p[0])
-    return (HyString if isinstance(s, str_type) else HyBytes)(s)
+    return (HyString(s, is_format = is_format)
+            if isinstance(s, str_type)
+            else HyBytes(s))
 
 
 @pg.production("string : PARTIAL_STRING")
@@ -219,7 +230,10 @@ bracket_string_re = next(r.re for r in lexer.rules if r.name == 'BRACKETSTRING')
 def t_bracket_string(state, p):
     m = bracket_string_re.match(p[0].value)
     delim, content = m.groups()
-    return HyString(content, brackets=delim)
+    return HyString(
+        content,
+        is_format = delim == 'f' or delim.startswith('f-'),
+        brackets = delim)
 
 
 @pg.production("identifier : IDENTIFIER")
diff --git a/hy/models.py b/hy/models.py
index 478c691..458d615 100644
--- a/hy/models.py
+++ b/hy/models.py
@@ -33,6 +33,11 @@ class HyObject(object):
     """
     Generic Hy Object model. This is helpful to inject things into all
     the Hy lexing Objects at once.
+
+    The position properties (`start_line`, `end_line`, `start_column`,
+    `end_column`) are each 1-based and inclusive. For example, a symbol
+    `abc` starting at the first column would have `start_column` 1 and
+    `end_column` 3.
     """
     __properties__ = ["module", "start_line", "end_line", "start_column",
                       "end_column"]
@@ -89,8 +94,9 @@ class HyString(HyObject, str_type):
     scripts. It's either a ``str`` or a ``unicode``, depending on the Python
     version.
     """
-    def __new__(cls, s=None, brackets=None):
+    def __new__(cls, s=None, is_format=False, brackets=None):
         value = super(HyString, cls).__new__(cls, s)
+        value.is_format = bool(is_format)
         value.brackets = brackets
         return value
 
diff --git a/tests/compilers/test_ast.py b/tests/compilers/test_ast.py
index 9311eef..9d004da 100644
--- a/tests/compilers/test_ast.py
+++ b/tests/compilers/test_ast.py
@@ -10,7 +10,7 @@ from hy.compiler import hy_compile, hy_eval
 from hy.errors import HyCompileError, HyLanguageError, HyError
 from hy.lex import hy_parse
 from hy.lex.exceptions import LexException, PrematureEndOfInput
-from hy._compat import PY3
+from hy._compat import PY3, PY36
 
 import ast
 import pytest
@@ -511,6 +511,18 @@ def test_ast_unicode_vs_bytes():
     assert s('b"\\xa0"') == (bytes([160]) if PY3 else chr(160))
 
 
+@pytest.mark.skipif(not PY36, reason='f-strings require Python 3.6+')
+def test_format_string():
+    assert can_compile('f"hello world"')
+    assert can_compile('f"hello {(+ 1 1)} world"')
+    assert can_compile('f"hello world {(+ 1 1)}"')
+    assert cant_compile('f"hello {(+ 1 1) world"')
+    assert cant_compile('f"hello (+ 1 1)} world"')
+    assert cant_compile('f"hello {(+ 1 1} world"')
+    assert can_compile(r'f"hello {\"n\"} world"')
+    assert can_compile(r'f"hello {\"\\n\"} world"')
+
+
 def test_ast_bracket_string():
     assert s(r'#[[empty delims]]') == 'empty delims'
     assert s(r'#[my delim[fizzle]my delim]') == 'fizzle'
diff --git a/tests/native_tests/language.hy b/tests/native_tests/language.hy
index 65c629a..04936dd 100644
--- a/tests/native_tests/language.hy
+++ b/tests/native_tests/language.hy
@@ -1217,6 +1217,72 @@
'"squid" brackets)))) +(defn test-format-strings [] + (assert (= f"hello world" "hello world")) + (assert (= f"hello {(+ 1 1)} world" "hello 2 world")) + (assert (= f"a{ (.upper (+ \"g\" \"k\")) }z" "aGKz")) + + ; Referring to a variable + (setv p "xyzzy") + (assert (= f"h{p}j" "hxyzzyj")) + + ; Including a statement and setting a variable + (assert (= f"a{(do (setv floop 4) (* floop 2))}z" "a8z")) + (assert (= floop 4)) + + ; Comments + (assert (= f"a{(+ 1 + 2 ; This is a comment. + 3)}z" "a6z")) + + ; Newlines in replacement fields + (assert (= f"ey {\"bee +cee\"} dee" "ey bee\ncee dee")) + + ; Conversion characters and format specifiers + (setv p:9 "other") + (setv !r "bar") + (defn u [s] + ; Add a "u" prefix for Python 2. + (if PY3 + s + (.replace (.replace s "'" "u'" 1) " " " " 1))) + (assert (= f"a{p !r}" (u "a'xyzzy'"))) + (assert (= f"a{p :9}" "axyzzy ")) + (assert (= f"a{p:9}" "aother")) + (assert (= f"a{p !r :9}" (u "a'xyzzy' "))) + (assert (= f"a{p !r:9}" (u "a'xyzzy' "))) + (assert (= f"a{p:9 :9}" "aother ")) + (assert (= f"a{!r}" "abar")) + (assert (= f"a{!r !r}" (u "a'bar'"))) + + ; Fun with `r` + (assert (= f"hello {r\"\\n\"}" r"hello \n")) + (assert (= f"hello {r\"\n\"}" "hello \n")) + ; The `r` applies too late to avoid interpreting a backslash. + + ; Braces escaped via doubling + (assert (= f"ab{{cde" "ab{cde")) + (assert (= f"ab{{cde}}}}fg{{{{{{" "ab{cde}}fg{{{")) + (assert (= f"ab{{{(+ 1 1)}}}" "ab{2}")) + + ; Nested replacement fields + (assert (= f"{2 :{(+ 2 2)}}" " 2")) + (setv value 12.34 width 10 precision 4) + (assert (= f"result: {value :{width}.{precision}}" "result: 12.34")) + + ; Nested replacement fields with ! and : + (defclass C [object] + (defn __format__ [self format-spec] + (+ "C[" format-spec "]"))) + (assert (= f"{(C) : {(str (+ 1 1)) !r :x<5}}" "C[ '2'xx]")) + + ; Format bracket strings + (assert (= #[f[a{p !r :9}]f] (u "a'xyzzy' "))) + (assert (= #[f-string[result: {value :{width}.{precision}}]f-string] + "result: 12.34"))) + + (defn test-import-syntax [] "NATIVE: test the import syntax." diff --git a/tests/test_lex.py b/tests/test_lex.py index f709719..304f69a 100644 --- a/tests/test_lex.py +++ b/tests/test_lex.py @@ -240,22 +240,38 @@ def test_lex_bad_attrs(): with lexe(): tokenize(":hello.foo") -def test_lex_line_counting(): - """ Make sure we can count lines / columns """ +def test_lex_column_counting(): entry = tokenize("(foo (one two))")[0] - assert entry.start_line == 1 assert entry.start_column == 1 - assert entry.end_line == 1 assert entry.end_column == 15 - entry = entry[1] - assert entry.start_line == 1 - assert entry.start_column == 6 + symbol = entry[0] + assert symbol.start_line == 1 + assert symbol.start_column == 2 + assert symbol.end_line == 1 + assert symbol.end_column == 4 - assert entry.end_line == 1 - assert entry.end_column == 14 + inner_expr = entry[1] + assert inner_expr.start_line == 1 + assert inner_expr.start_column == 6 + assert inner_expr.end_line == 1 + assert inner_expr.end_column == 14 + + +def test_lex_column_counting_with_literal_newline(): + string, symbol = tokenize('"apple\nblueberry" abc') + + assert string.start_line == 1 + assert string.start_column == 1 + assert string.end_line == 2 + assert string.end_column == 10 + + assert symbol.start_line == 2 + assert symbol.start_column == 12 + assert symbol.end_line == 2 + assert symbol.end_column == 14 def test_lex_line_counting_multi():