Fix mangle for Pythons compiled with UCS-2

This commit is contained in:
Yigong Wang 2018-03-27 20:04:43 -04:00 committed by Kodi Arfer
parent af89fd68b0
commit 5ae6875e88
2 changed files with 22 additions and 4 deletions

View File

@ -25,6 +25,10 @@ PY35 = sys.version_info >= (3, 5)
PY36 = sys.version_info >= (3, 6) PY36 = sys.version_info >= (3, 6)
PY37 = sys.version_info >= (3, 7) PY37 = sys.version_info >= (3, 7)
# The value of UCS4 indicates whether Unicode strings are stored as UCS-4.
# It is always true on Pythons >= 3.3, where sys.maxunicode is 0x10FFFF on all systems.
UCS4 = sys.maxunicode == 0x10FFFF
str_type = str if PY3 else unicode # NOQA str_type = str if PY3 else unicode # NOQA
bytes_type = bytes if PY3 else str # NOQA bytes_type = bytes if PY3 else str # NOQA
long_type = int if PY3 else long # NOQA long_type = int if PY3 else long # NOQA

View File

@ -10,7 +10,7 @@ import string, re, unicodedata
from rply import ParserGenerator from rply import ParserGenerator
from hy._compat import PY3, str_type, isidentifier from hy._compat import PY3, str_type, isidentifier, UCS4
from hy.models import (HyBytes, HyComplex, HyCons, HyDict, HyExpression, from hy.models import (HyBytes, HyComplex, HyCons, HyDict, HyExpression,
HyFloat, HyInteger, HyKeyword, HyList, HySet, HyString, HyFloat, HyInteger, HyKeyword, HyList, HySet, HyString,
HySymbol) HySymbol)
@ -25,14 +25,28 @@ pg = ParserGenerator(
mangle_delim = 'Δ' if PY3 else 'X' mangle_delim = 'Δ' if PY3 else 'X'
def unicode_to_ucs4iter(ustr):
    """Return an iterable over the full code points of `ustr`.

    When `UCS4` is true, iterating the string already yields one whole
    character per step, so `ustr` is returned unchanged. Otherwise the
    string is iterated as UTF-16 code units, and each high surrogate is
    joined with the low surrogate that follows it, so that every element
    of the returned list is one character (a one- or two-code-unit
    string).

    Unpaired surrogates (a high surrogate at the end of the string, or
    one not followed by a low surrogate) are kept as separate elements
    instead of raising IndexError or being glued to an unrelated
    character.
    """
    if UCS4:
        return ustr
    ucs4_list = []
    # True while the last appended element is a high surrogate that is
    # still waiting for its low half.
    pending_high = False
    for u in ustr:
        code_unit = ord(u)
        if pending_high and 0xDC00 <= code_unit <= 0xDFFF:
            # Complete the surrogate pair started by the previous unit.
            ucs4_list[-1] += u
            pending_high = False
        else:
            ucs4_list.append(u)
            pending_high = 0xD800 <= code_unit <= 0xDBFF
    return ucs4_list
def mangle(s): def mangle(s):
"""Stringify the argument and convert it to a valid Python identifier """Stringify the argument and convert it to a valid Python identifier
according to Hy's mangling rules.""" according to Hy's mangling rules."""
def unicode_char_to_hex(uchr):
    # Convert a single Unicode character to the lowercase hex string of
    # its code point, without any prefix or zero padding (e.g. 'e000').
    #
    # `uchr` is either one code unit or a two-code-unit surrogate pair.
    # ord() cannot take a two-unit string, so the pair is decoded by
    # hand. Computing the number directly, rather than stripping
    # characters off the 'unicode-escape' form, also keeps code points
    # below U+0100 correct: those escape as '\xNN', '\t', '\n' or '\r',
    # which the stripping approach turned into 'x80', 't', and so on.
    if len(uchr) == 2:
        high, low = ord(uchr[0]), ord(uchr[1])
        code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
    else:
        code_point = ord(uchr)
    return '{:x}'.format(code_point)
assert s assert s
s = str_type(s) s = str_type(s)
s = s.replace("-", "_") s = s.replace("-", "_")
s2 = s.lstrip('_') s2 = s.lstrip('_')
leading_underscores = '_' * (len(s) - len(s2)) leading_underscores = '_' * (len(s) - len(s2))
@ -50,8 +64,8 @@ def mangle(s):
# allowed at the start of an identifier. # allowed at the start of an identifier.
else '{0}{1}{0}'.format(mangle_delim, else '{0}{1}{0}'.format(mangle_delim,
unicodedata.name(c, '').lower().replace('-', 'H').replace(' ', '_') unicodedata.name(c, '').lower().replace('-', 'H').replace(' ', '_')
or 'U{:x}'.format(ord(c))) or 'U{}'.format(unicode_char_to_hex(c)))
for c in s) for c in unicode_to_ucs4iter(s))
s = leading_underscores + s s = leading_underscores + s
assert isidentifier(s) assert isidentifier(s)