From 45b7a4ac9d93bfd800653e14cc51c4e3aacf84c1 Mon Sep 17 00:00:00 2001
From: Kodi Arfer <git@arfer.net>
Date: Sat, 18 Feb 2017 16:15:58 -0800
Subject: [PATCH] Add bytestring literals

---
 NEWS                                |  2 ++
 docs/language/api.rst               | 13 ++++++++++++
 docs/language/internals.rst         |  6 ++++++
 hy/_compat.py                       |  5 +++++
 hy/compiler.py                      | 12 +++++++++--
 hy/lex/lexer.py                     |  2 +-
 hy/lex/parser.py                    | 32 ++++++++++++++++++++---------
 hy/models.py                        | 12 ++++++++++-
 tests/compilers/test_ast.py         |  9 ++++++++
 tests/native_tests/native_macros.hy |  3 +++
 10 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/NEWS b/NEWS
index 86f8588..1481c02 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@ Changes from 0.12.1
    [ Language Changes ]
    * `let` has been removed. Python's scoping rules do not make a proper
      implementation of it possible. Use `setv` instead.
+   * Added bytestring literals, which create `bytes` objects under Python 3
+     and `str` objects under Python 2
    * Commas and underscores are allowed in numeric literals
    * xor: If exactly one argument is true, return it
 
diff --git a/docs/language/api.rst b/docs/language/api.rst
index ba1fe5f..f9d0fc9 100644
--- a/docs/language/api.rst
+++ b/docs/language/api.rst
@@ -52,6 +52,19 @@ digits.
 
     (print 10,000,000,000 10_000_000_000)
 
+string literals
+---------------
+
+Unlike Python, Hy allows only double-quoted strings. The single-quote character
+is reserved for preventing the evaluation of a form, as in most Lisps.
+
+Whether running under Python 2 or Python 3, Hy treats string literals as
+sequences of Unicode characters by default, and allows you to prefix a literal
+with ``b`` to treat it as a sequence of bytes. So when running under Python 3,
+Hy translates ``"foo"`` and ``b"foo"`` to the Python literals ``"foo"`` and
+``b"foo"`` respectively, but when running under Python 2, ``"foo"`` is
+translated to ``u"foo"`` and ``b"foo"`` is translated to ``"foo"``.
+
 Built-Ins
 =========
 
diff --git a/docs/language/internals.rst b/docs/language/internals.rst
index 7d8e4a2..07b65bd 100644
--- a/docs/language/internals.rst
+++ b/docs/language/internals.rst
@@ -113,6 +113,12 @@ Hy literal strings can span multiple lines, and are considered by the
 parser as a single unit, respecting the Python escapes for unicode
 strings.
 
+HyBytes
+~~~~~~~
+
+``hy.models.HyBytes`` is like ``HyString``, but for sequences of bytes.
+It inherits from ``bytes`` on Python 3 and ``str`` on Python 2.
+
 .. _hy_numeric_models:
 
 Numeric Models
diff --git a/hy/_compat.py b/hy/_compat.py
index 157bad1..bdad6da 100644
--- a/hy/_compat.py
+++ b/hy/_compat.py
@@ -49,6 +49,11 @@ if PY3:
 else:
     str_type = unicode  # NOQA
 
+if PY3:
+    bytes_type = bytes
+else:
+    bytes_type = str  # on Python 2 the native str type holds bytes
+
 if PY3:
     long_type = int
 else:
diff --git a/hy/compiler.py b/hy/compiler.py
index 3dafd9e..42d61c3 100644
--- a/hy/compiler.py
+++ b/hy/compiler.py
@@ -25,14 +25,15 @@
 # DEALINGS IN THE SOFTWARE.
 
 from hy.models import (HyExpression, HyKeyword, HyInteger, HyComplex, HyString,
-                       HySymbol, HyFloat, HyList, HySet, HyDict, HyCons)
+                       HyBytes, HySymbol, HyFloat, HyList, HySet, HyDict,
+                       HyCons)
 from hy.errors import HyCompileError, HyTypeError
 
 from hy.lex.parser import hy_symbol_mangle
 
 import hy.macros
 from hy._compat import (
-    str_type, long_type, PY27, PY33, PY3, PY34, PY35, raise_empty)
+    str_type, bytes_type, long_type, PY27, PY33, PY3, PY34, PY35, raise_empty)
 from hy.macros import require, macroexpand, reader_macroexpand
 import hy.importer
 
@@ -2641,6 +2642,13 @@ class HyASTCompiler(object):
                        lineno=string.start_line,
                        col_offset=string.start_column)
 
+    @builds(HyBytes)
+    def compile_bytes(self, bytestring):
+        node_type = ast.Bytes if PY3 else ast.Str  # no ast.Bytes on Python 2
+        return node_type(s=bytes_type(bytestring),
+                         lineno=bytestring.start_line,
+                         col_offset=bytestring.start_column)
+
     @builds(HyKeyword)
     def compile_keyword(self, keyword):
         return ast.Str(s=str_type(keyword),
diff --git a/hy/lex/lexer.py b/hy/lex/lexer.py
index 4a770f5..4fe35e2 100644
--- a/hy/lex/lexer.py
+++ b/hy/lex/lexer.py
@@ -46,7 +46,7 @@ lg.add('HASHREADER', r'#[^{]')
 # A regexp which matches incomplete strings, used to support
 # multi-line strings in the interpreter
 partial_string = r'''(?x)
-    (?:u|r|ur|ru)? # prefix
+    (?:u|r|ur|ru|b|br|rb)? # prefix
     "  # start string
     (?:
        | [^"\\]             # non-quote or backslash
diff --git a/hy/lex/parser.py b/hy/lex/parser.py
index 30dd4f5..518caae 100644
--- a/hy/lex/parser.py
+++ b/hy/lex/parser.py
@@ -18,13 +18,15 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-import sys
 from functools import wraps
+from ast import literal_eval
 
 from rply import ParserGenerator
 
-from hy.models import (HyComplex, HyCons, HyDict, HyExpression, HyFloat,
-                       HyInteger, HyKeyword, HyList, HySet, HyString, HySymbol)
+from hy._compat import PY3, str_type
+from hy.models import (HyBytes, HyComplex, HyCons, HyDict, HyExpression,
+                       HyFloat, HyInteger, HyKeyword, HyList, HySet, HyString,
+                       HySymbol)
 from .lexer import lexer
 from .exceptions import LexException, PrematureEndOfInput
 
@@ -55,8 +57,6 @@ def hy_symbol_unmangle(p):
     # hy_symbol_mangle is one-way, so this can't be perfect.
     # But it can be useful till we have a way to get the original
     # symbol (https://github.com/hylang/hy/issues/360).
-
-    from hy._compat import str_type
     p = str_type(p)
 
     if p.endswith("_bang") and p != "_bang":
@@ -258,12 +258,19 @@ def t_empty_list(p):
     return HyList([])
 
 
-if sys.version_info[0] >= 3:
+if PY3:  # s is the source text of a literal with any u/b prefix stripped
     def uni_hystring(s):
-        return HyString(eval(s))
+        return HyString(literal_eval(s))  # bare literal is unicode on PY3
+
+    def hybytes(s):
+        return HyBytes(literal_eval('b'+s))  # prepend the bytes prefix
+
 else:
     def uni_hystring(s):
-        return HyString(eval('u'+s))
+        return HyString(literal_eval('u'+s))  # prepend the unicode prefix
+
+    def hybytes(s):
+        return HyBytes(literal_eval(s))  # bare literal is bytes on PY2
 
 
 @pg.production("string : STRING")
@@ -273,11 +280,16 @@ def t_string(p):
     s = p[0].value[:-1]
     # get the header
     header, s = s.split('"', 1)
-    # remove unicode marker
+    # remove unicode marker (this is redundant because Hy string
+    # literals already, by default, generate Unicode literals
+    # under Python 2)
     header = header.replace("u", "")
+    # remove bytes marker, since we'll need to exclude it for Python 2
+    is_bytestring = "b" in header
+    header = header.replace("b", "")
     # build python string
     s = header + '"""' + s + '"""'
-    return uni_hystring(s)
+    return (hybytes if is_bytestring else uni_hystring)(s)
 
 
 @pg.production("string : PARTIAL_STRING")
diff --git a/hy/models.py b/hy/models.py
index df11247..d7c350a 100644
--- a/hy/models.py
+++ b/hy/models.py
@@ -19,7 +19,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 from __future__ import unicode_literals
-from hy._compat import PY3, str_type, long_type, string_types
+from hy._compat import PY3, str_type, bytes_type, long_type, string_types
 
 
 class HyObject(object):
@@ -84,6 +84,16 @@ class HyString(HyObject, str_type):
 _wrappers[str_type] = HyString
 
 
+class HyBytes(HyObject, bytes_type):
+    """
+    Hy model for a sequence of bytes. It subclasses ``bytes`` on Python 3
+    and ``str`` on Python 2 (whatever ``hy._compat.bytes_type`` is).
+    """
+    pass
+
+_wrappers[bytes_type] = HyBytes
+
+
 class HySymbol(HyString):
     """
     Hy Symbol. Basically a String.
diff --git a/tests/compilers/test_ast.py b/tests/compilers/test_ast.py
index 6b4d3dc..f81371f 100644
--- a/tests/compilers/test_ast.py
+++ b/tests/compilers/test_ast.py
@@ -482,6 +482,15 @@ def test_ast_unicode_strings():
     assert _compile_string("\xc3\xa9") == "\xc3\xa9"
 
 
+def test_ast_unicode_vs_bytes():
+    def f(x): return hy_compile(tokenize(x), "__main__").body[0].value.s
+    assert f('"hello"') == u"hello"
+    assert type(f('"hello"')) is (str if PY3 else unicode)  # noqa
+    assert f('b"hello"') == b"hello"  # same literal is valid on PY2 and PY3
+    assert type(f('b"hello"')) is (bytes if PY3 else str)  # exact type check
+    assert f('b"\\xa0"') == b"\xa0"
+
+
 def test_compile_error():
     """Ensure we get compile error in tricky cases"""
     try:
diff --git a/tests/native_tests/native_macros.hy b/tests/native_tests/native_macros.hy
index 6d50899..dd09595 100644
--- a/tests/native_tests/native_macros.hy
+++ b/tests/native_tests/native_macros.hy
@@ -30,6 +30,9 @@
 (defmacro a-string [] "foo")
 (assert (= (a-string) "foo"))
 
+(defmacro a-bytes [] b"foo")
+(assert (= (a-bytes) b"foo"))
+
 (defmacro a-list [] [1 2])
 (assert (= (a-list) [1 2]))