hy/hy/lex/lexer.py

# Copyright 2018 the authors.
# This file is part of Hy, which is free software licensed under the Expat
# license. See the LICENSE.

from rply import LexerGenerator


# The rply generator on which every token rule and ignore rule in this
# module is registered before the lexer is built.
lg = LexerGenerator()


# A negative lookahead that is appended to the quoting/unquoting operator
# patterns: the operator only matches when it is NOT immediately followed
# by whitespace or a closing paren/bracket/curly.
end_quote = r'(?![\s\)\]\}])'

# One or more characters that are not parens, brackets, curlies, quotes,
# whitespace, or a semicolon.
identifier = r'[^()\[\]{}\'"\s;]+'

# Token rules, listed in the exact order in which they are registered.
# The quoting operators get the `end_quote` lookahead appended, and the
# generic `#` rule reuses the `identifier` pattern; the more specific
# `#...` rules are listed ahead of it.
_token_rules = (
    ('LPAREN', r'\('),
    ('RPAREN', r'\)'),
    ('LBRACKET', r'\['),
    ('RBRACKET', r'\]'),
    ('LCURLY', r'\{'),
    ('RCURLY', r'\}'),
    ('HLCURLY', r'#\{'),
    ('QUOTE', r'\'%s' % end_quote),
    ('QUASIQUOTE', r'`%s' % end_quote),
    ('UNQUOTESPLICE', r'~@%s' % end_quote),
    ('UNQUOTE', r'~%s' % end_quote),
    ('DISCARD', r'#_'),
    ('HASHSTARS', r'#\*+'),
    ('BRACKETSTRING', r'''(?x)
    \# \[ ( [^\[\]]* ) \[    # Opening delimiter
    \n?                      # A single leading newline will be ignored
    ((?:\n|.)*?)             # Content of the string
    \] \1 \]                 # Closing delimiter
    '''),
    ('HASHOTHER', r'#%s' % identifier),
)

for _token_name, _token_regex in _token_rules:
    lg.add(_token_name, _token_regex)

# Keep the module namespace free of the registration scaffolding.
del _token_rules, _token_name, _token_regex

# A regexp which matches incomplete strings, used to support
# multi-line strings in the interpreter. It covers the optional prefix,
# the opening quote, and all string content up to (but not including) a
# closing quote; appending `"` yields the pattern for complete strings.
#
# The content group has exactly two alternatives that can never start on
# the same character, so matching does not backtrack between them. The
# generic backslash escape also covers \xNN, \uNNNN and \UNNNNNNNN: the
# backslash consumes the letter and the hex digits then match as ordinary
# characters.
partial_string = r'''(?x)
    (?:u|r|ur|ru|b|br|rb)? # prefix
    "  # start string
    (?:
         [^"\\]             # non-quote or backslash
       | \\(.|\n)           # or escaped single character or newline
    )* # zero or more times
'''

# Remaining token rules, registered in this order: complete strings,
# then incomplete strings, then plain identifiers.
for _token_name, _token_regex in (
    ('STRING', r'%s"' % partial_string),
    ('PARTIAL_STRING', partial_string),
    ('IDENTIFIER', identifier),
):
    lg.add(_token_name, _token_regex)
del _token_name, _token_regex

# Input that produces no token: `;` comments running up to a line break
# or the end of the input, and runs of whitespace.
for _skipped_regex in (r';.*(?=\r|\n|$)', r'\s+'):
    lg.ignore(_skipped_regex)
del _skipped_regex


# The finished lexer, built from all the rules registered on `lg`.
lexer = lg.build()
Update copyright years 2018-01-01 16:38:33 +01:00			`# Copyright 2018 the authors.`
Make all files comply with license-header policy 2017-04-27 23:16:57 +02:00			`# This file is part of Hy, which is free software licensed under the Expat`
			`# license. See the LICENSE.`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00
			`from rply import LexerGenerator`


			`lg = LexerGenerator()`


			`# A regexp for something that should end a quoting/unquoting operator`
			`# i.e. a space or a closing brace/paren/curly`
			`end_quote = r'(?![\s\)\]\}])'`

make sharp macros take arbitrary identifiers Previously, only a single character was allowed. 2017-05-10 03:54:32 +02:00			`identifier = r'[^()\[\]{}\'"\s;]+'`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00
			`lg.add('LPAREN', r'\(')`
			`lg.add('RPAREN', r'\)')`
			`lg.add('LBRACKET', r'\[')`
			`lg.add('RBRACKET', r'\]')`
			`lg.add('LCURLY', r'\{')`
			`lg.add('RCURLY', r'\}')`
Add set literals (closes #827) 2015-06-26 23:47:35 +02:00			`lg.add('HLCURLY', r'#\{')`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00			`lg.add('QUOTE', r'\'%s' % end_quote)`
			lg.add('QUASIQUOTE', r'`%s' % end_quote)
			`lg.add('UNQUOTESPLICE', r'~@%s' % end_quote)`
			`lg.add('UNQUOTE', r'~%s' % end_quote)`
add #_ discard syntax 2017-08-02 23:53:46 +02:00			`lg.add('DISCARD', r'#_')`
Implement #* and #** unpacking 2017-07-17 22:34:39 +02:00			`lg.add('HASHSTARS', r'#\*+')`
Add #[DELIM[ … ]DELIM] syntax for string literals 2017-09-08 20:22:31 +02:00			`lg.add('BRACKETSTRING', r'''(?x)`
			`\# \[ ( [^\[\]]* ) \[ # Opening delimiter`
			`\n? # A single leading newline will be ignored`
			`((?:\n\|.)*?) # Content of the string`
			`\] \1 \] # Closing delimiter`
			`''')`
make sharp macros take arbitrary identifiers Previously, only a single character was allowed. 2017-05-10 03:54:32 +02:00			`lg.add('HASHOTHER', r'#%s' % identifier)`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00
Add support for multi-line strings in interpreter 2014-12-29 00:36:31 +01:00			`# A regexp which matches incomplete strings, used to support`
			`# multi-line strings in the interpreter`
			`partial_string = r'''(?x)`
Add bytestring literals 2017-02-19 01:15:58 +01:00			`(?:u\|r\|ur\|ru\|b\|br\|rb)? # prefix`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00			`" # start string`
			`(?:`
			`\| [^"\\] # non-quote or backslash`
Fix #831 Blame dot not matching newlines 2015-08-22 22:13:46 +02:00			`\| \\(.\|\n) # or escaped single character or newline`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00			`\| \\x[0-9a-fA-F]{2} # or escaped raw character`
			`\| \\u[0-9a-fA-F]{4} # or unicode escape`
			`\| \\U[0-9a-fA-F]{8} # or long unicode escape`
			`)* # one or more times`
Add support for multi-line strings in interpreter 2014-12-29 00:36:31 +01:00			`'''`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00
Add support for multi-line strings in interpreter 2014-12-29 00:36:31 +01:00			`lg.add('STRING', r'%s"' % partial_string)`
			`lg.add('PARTIAL_STRING', partial_string)`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00
make sharp macros take arbitrary identifiers Previously, only a single character was allowed. 2017-05-10 03:54:32 +02:00			`lg.add('IDENTIFIER', identifier)`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00

Comments end when the input ends or a newline occurs This fixes #382, which occured because the REPL doesn't use trailing newlines. 2014-01-02 03:09:18 +01:00			`lg.ignore(r';.*(?=\r\|\n\|$)')`
Add an rply-based lexer 2013-04-08 09:56:04 +02:00			`lg.ignore(r'\s+')`


			`lexer = lg.build()`