Note
Go to the end to download the full example code
Custom lexer
Demonstrates using a custom lexer to parse a non-textual stream of data
You can use a custom lexer to tokenize text when the lexers offered by Lark are too slow, or not flexible enough.
You can also use it (as shown in this example) to tokenize streams of objects.
from lark import Lark, Transformer, v_args
from lark.lexer import Lexer, Token
class TypeLexer(Lexer):
def __init__(self, lexer_conf):
pass
def lex(self, data):
for obj in data:
if isinstance(obj, int):
yield Token('INT', obj)
elif isinstance(obj, (type(''), type(u''))):
yield Token('STR', obj)
else:
raise TypeError(obj)
parser = Lark("""
start: data_item+
data_item: STR INT*
%declare STR INT
""", parser='lalr', lexer=TypeLexer)
class ParseToDict(Transformer):
@v_args(inline=True)
def data_item(self, name, *numbers):
return name.value, [n.value for n in numbers]
start = dict
def test():
data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]
print(data)
tree = parser.parse(data)
res = ParseToDict().transform(tree)
print('-->')
print(res) # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}
if __name__ == '__main__':
test()
Total running time of the script: (0 minutes 0.000 seconds)