From 45b7a4ac9d93bfd800653e14cc51c4e3aacf84c1 Mon Sep 17 00:00:00 2001 From: Kodi Arfer Date: Sat, 18 Feb 2017 16:15:58 -0800 Subject: [PATCH] Add bytestring literals --- NEWS | 2 ++ docs/language/api.rst | 13 ++++++++++++ docs/language/internals.rst | 6 ++++++ hy/_compat.py | 5 +++++ hy/compiler.py | 12 +++++++++-- hy/lex/lexer.py | 2 +- hy/lex/parser.py | 32 ++++++++++++++++++++--------- hy/models.py | 12 ++++++++++- tests/compilers/test_ast.py | 9 ++++++++ tests/native_tests/native_macros.hy | 3 +++ 10 files changed, 82 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index 86f8588..1481c02 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,8 @@ Changes from 0.12.1 [ Language Changes ] * `let` has been removed. Python's scoping rules do not make a proper implementation of it possible. Use `setv` instead. + * Added bytestring literals, which create `bytes` objects under Python 3 + and `str` objects under Python 2 * Commas and underscores are allowed in numeric literals * xor: If exactly one argument is true, return it diff --git a/docs/language/api.rst b/docs/language/api.rst index ba1fe5f..f9d0fc9 100644 --- a/docs/language/api.rst +++ b/docs/language/api.rst @@ -52,6 +52,19 @@ digits. (print 10,000,000,000 10_000_000_000) +string literals +--------------- + +Unlike Python, Hy allows only double-quoted strings. The single-quote character +is reserved for preventing the evaluation of a form, as in most Lisps. + +Whether running under Python 2 or Python 3, Hy treats string literals as +sequences of Unicode characters by default, and allows you to prefix a literal +with ``b`` to treat it as a sequence of bytes. So when running under Python 3, +Hy translates ``"foo"`` and ``b"foo"`` to the identical Python literals ``"foo"`` and ``b"foo"``, but when +running under Python 2, ``"foo"`` is translated to ``u"foo"`` and ``b"foo"`` is +translated to ``"foo"``. 
+ Built-Ins ========= diff --git a/docs/language/internals.rst b/docs/language/internals.rst index 7d8e4a2..07b65bd 100644 --- a/docs/language/internals.rst +++ b/docs/language/internals.rst @@ -113,6 +113,12 @@ Hy literal strings can span multiple lines, and are considered by the parser as a single unit, respecting the Python escapes for unicode strings. +HyBytes +~~~~~~~ + +``hy.models.HyBytes`` is like ``HyString``, but for sequences of bytes. +It inherits from ``bytes`` on Python 3 and ``str`` on Python 2. + .. _hy_numeric_models: Numeric Models diff --git a/hy/_compat.py b/hy/_compat.py index 157bad1..bdad6da 100644 --- a/hy/_compat.py +++ b/hy/_compat.py @@ -49,6 +49,11 @@ if PY3: else: str_type = unicode # NOQA +if PY3: + bytes_type = bytes +else: + bytes_type = str + if PY3: long_type = int else: diff --git a/hy/compiler.py b/hy/compiler.py index 3dafd9e..42d61c3 100644 --- a/hy/compiler.py +++ b/hy/compiler.py @@ -25,14 +25,15 @@ # DEALINGS IN THE SOFTWARE. from hy.models import (HyExpression, HyKeyword, HyInteger, HyComplex, HyString, - HySymbol, HyFloat, HyList, HySet, HyDict, HyCons) + HyBytes, HySymbol, HyFloat, HyList, HySet, HyDict, + HyCons) from hy.errors import HyCompileError, HyTypeError from hy.lex.parser import hy_symbol_mangle import hy.macros from hy._compat import ( - str_type, long_type, PY27, PY33, PY3, PY34, PY35, raise_empty) + str_type, bytes_type, long_type, PY27, PY33, PY3, PY34, PY35, raise_empty) from hy.macros import require, macroexpand, reader_macroexpand import hy.importer @@ -2641,6 +2642,13 @@ class HyASTCompiler(object): lineno=string.start_line, col_offset=string.start_column) + @builds(HyBytes) + def compile_bytes(self, bytestring): + f = ast.Bytes if PY3 else ast.Str + return f(s=bytes_type(bytestring), + lineno=bytestring.start_line, + col_offset=bytestring.start_column) + @builds(HyKeyword) def compile_keyword(self, keyword): return ast.Str(s=str_type(keyword), diff --git a/hy/lex/lexer.py b/hy/lex/lexer.py index 
4a770f5..4fe35e2 100644 --- a/hy/lex/lexer.py +++ b/hy/lex/lexer.py @@ -46,7 +46,7 @@ lg.add('HASHREADER', r'#[^{]') # A regexp which matches incomplete strings, used to support # multi-line strings in the interpreter partial_string = r'''(?x) - (?:u|r|ur|ru)? # prefix + (?:u|r|ur|ru|b|br|rb)? # prefix " # start string (?: | [^"\\] # non-quote or backslash diff --git a/hy/lex/parser.py b/hy/lex/parser.py index 30dd4f5..518caae 100644 --- a/hy/lex/parser.py +++ b/hy/lex/parser.py @@ -18,13 +18,15 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -import sys from functools import wraps +from ast import literal_eval from rply import ParserGenerator -from hy.models import (HyComplex, HyCons, HyDict, HyExpression, HyFloat, - HyInteger, HyKeyword, HyList, HySet, HyString, HySymbol) +from hy._compat import PY3, str_type +from hy.models import (HyBytes, HyComplex, HyCons, HyDict, HyExpression, + HyFloat, HyInteger, HyKeyword, HyList, HySet, HyString, + HySymbol) from .lexer import lexer from .exceptions import LexException, PrematureEndOfInput @@ -55,8 +57,6 @@ def hy_symbol_unmangle(p): # hy_symbol_mangle is one-way, so this can't be perfect. # But it can be useful till we have a way to get the original # symbol (https://github.com/hylang/hy/issues/360). 
- - from hy._compat import str_type p = str_type(p) if p.endswith("_bang") and p != "_bang": @@ -258,12 +258,19 @@ def t_empty_list(p): return HyList([]) -if sys.version_info[0] >= 3: +if PY3: def uni_hystring(s): - return HyString(eval(s)) + return HyString(literal_eval(s)) + + def hybytes(s): + return HyBytes(literal_eval('b'+s)) + else: def uni_hystring(s): - return HyString(eval('u'+s)) + return HyString(literal_eval('u'+s)) + + def hybytes(s): + return HyBytes(literal_eval(s)) @pg.production("string : STRING") @@ -273,11 +280,16 @@ def t_string(p): s = p[0].value[:-1] # get the header header, s = s.split('"', 1) - # remove unicode marker + # remove unicode marker (the marker is redundant because Hy string + # literals already, by default, generate Unicode literals + # under Python 2) header = header.replace("u", "") + # remove bytes marker, since we'll need to exclude it for Python 2 + is_bytestring = "b" in header + header = header.replace("b", "") # build python string s = header + '"""' + s + '"""' - return uni_hystring(s) + return (hybytes if is_bytestring else uni_hystring)(s) @pg.production("string : PARTIAL_STRING") diff --git a/hy/models.py b/hy/models.py index df11247..d7c350a 100644 --- a/hy/models.py +++ b/hy/models.py @@ -19,7 +19,7 @@ # DEALINGS IN THE SOFTWARE. from __future__ import unicode_literals -from hy._compat import PY3, str_type, long_type, string_types +from hy._compat import PY3, str_type, bytes_type, long_type, string_types class HyObject(object): @@ -84,6 +84,16 @@ class HyString(HyObject, str_type): _wrappers[str_type] = HyString +class HyBytes(HyObject, bytes_type): + """ + Generic Hy Bytes object. It's either a ``bytes`` or a ``str``, depending + on the Python version. + """ + pass + +_wrappers[bytes_type] = HyBytes + + class HySymbol(HyString): """ Hy Symbol. Basically a String. 
diff --git a/tests/compilers/test_ast.py b/tests/compilers/test_ast.py index 6b4d3dc..f81371f 100644 --- a/tests/compilers/test_ast.py +++ b/tests/compilers/test_ast.py @@ -482,6 +482,15 @@ def test_ast_unicode_strings(): assert _compile_string("\xc3\xa9") == "\xc3\xa9" +def test_ast_unicode_vs_bytes(): + def f(x): return hy_compile(tokenize(x), "__main__").body[0].value.s + assert f('"hello"') == u"hello" + assert type(f('"hello"')) is (str if PY3 else unicode) # noqa + assert f('b"hello"') == (eval('b"hello"') if PY3 else "hello") + assert type(f('b"hello"')) == (bytes if PY3 else str) + assert f('b"\\xa0"') == (bytes([160]) if PY3 else chr(160)) + + def test_compile_error(): """Ensure we get compile error in tricky cases""" try: diff --git a/tests/native_tests/native_macros.hy b/tests/native_tests/native_macros.hy index 6d50899..dd09595 100644 --- a/tests/native_tests/native_macros.hy +++ b/tests/native_tests/native_macros.hy @@ -30,6 +30,9 @@ (defmacro a-string [] "foo") (assert (= (a-string) "foo")) +(defmacro a-bytes [] b"foo") +(assert (= (a-bytes) b"foo")) + (defmacro a-list [] [1 2]) (assert (= (a-list) [1 2]))