From 5ae6875e8899c7907ce524b5f43dbfe12b3e5c44 Mon Sep 17 00:00:00 2001
From: Yigong Wang
Date: Tue, 27 Mar 2018 20:04:43 -0400
Subject: [PATCH] Fix `mangle` for Pythons compiled with UCS-2

---
 hy/_compat.py    |  4 ++++
 hy/lex/parser.py | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/hy/_compat.py b/hy/_compat.py
index 3c5b7d9..c40e44d 100644
--- a/hy/_compat.py
+++ b/hy/_compat.py
@@ -25,6 +25,10 @@ PY35 = sys.version_info >= (3, 5)
 PY36 = sys.version_info >= (3, 6)
 PY37 = sys.version_info >= (3, 7)
 
+# The value of UCS4 indicates whether Unicode strings are stored as UCS-4.
+# It is always true on Pythons >= 3.3, which use UCS-4 on all systems.
+UCS4 = sys.maxunicode == 0x10FFFF
+
 str_type = str if PY3 else unicode  # NOQA
 bytes_type = bytes if PY3 else str  # NOQA
 long_type = int if PY3 else long  # NOQA
diff --git a/hy/lex/parser.py b/hy/lex/parser.py
index 238f57a..1287d91 100755
--- a/hy/lex/parser.py
+++ b/hy/lex/parser.py
@@ -10,7 +10,7 @@ import string, re, unicodedata
 
 from rply import ParserGenerator
 
-from hy._compat import PY3, str_type, isidentifier
+from hy._compat import PY3, str_type, isidentifier, UCS4
 from hy.models import (HyBytes, HyComplex, HyCons, HyDict, HyExpression,
                        HyFloat, HyInteger, HyKeyword, HyList, HySet, HyString,
                        HySymbol)
@@ -25,14 +25,40 @@ pg = ParserGenerator(
 
 mangle_delim = 'Δ' if PY3 else 'X'
 
+def unicode_to_ucs4iter(ustr):
+    # Convert a unicode string to an iterable object,
+    # elements in the object are single UCS-4 unicode characters
+    if UCS4:
+        return ustr
+    ucs4_list = []
+    for u in ustr:
+        # On a UCS-2 build a non-BMP character arrives as two code units:
+        # glue a low surrogate onto the high surrogate just before it.
+        # Unpaired surrogates stay separate elements (no IndexError).
+        if (ucs4_list and len(ucs4_list[-1]) == 1
+                and 0xD800 <= ord(ucs4_list[-1]) <= 0xDBFF
+                and 0xDC00 <= ord(u) <= 0xDFFF):
+            ucs4_list[-1] += u
+        else:
+            ucs4_list.append(u)
+    return ucs4_list
+
 def mangle(s):
     """Stringify the argument and convert it to a valid Python identifier
     according to Hy's mangling rules."""
+    def unicode_char_to_hex(uchr):
+        # Convert a unicode char to hex string, without prefix
+        if len(uchr) == 1:
+            return '{:x}'.format(ord(uchr))
+        # Otherwise uchr is a surrogate pair from a UCS-2 build, where
+        # ord() rejects two-unit strings: rebuild the code point by hand.
+        high, low = ord(uchr[0]), ord(uchr[1])
+        return '{:x}'.format(
+            0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00))
 
     assert s
 
     s = str_type(s)
-
     s = s.replace("-", "_")
     s2 = s.lstrip('_')
     leading_underscores = '_' * (len(s) - len(s2))
@@ -50,8 +76,8 @@ def mangle(s):
                  # allowed at the start of an identifier.
                else '{0}{1}{0}'.format(mangle_delim,
                    unicodedata.name(c, '').lower().replace('-', 'H').replace(' ', '_')
-                   or 'U{:x}'.format(ord(c)))
-            for c in s)
+                   or 'U{}'.format(unicode_char_to_hex(c)))
+            for c in unicode_to_ucs4iter(s))
 
     s = leading_underscores + s
     assert isidentifier(s)