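"""Gradio app that tokenizes Python code with a hand-written DFA.

The DFA recognizes ordinary tokens, split characters (brackets, operators,
commas, newlines), and string literals ("...", '...', '''...''') with an
optional f prefix; the function returns each recognized piece as a token.
"""
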
import gradio as gr

def tokenize(buffer: str):
    # Characters that always split tokens and are emitted as tokens themselves.
    split = {'(', ')', '{', '}', '[', ']', ',', ':', '+', '-', '*', '/', '%', '=', '\n'}
    # Transition table: 'split' matches any character in `split`, 'any' is the fallback.
    # States 0-4 count leading spaces (four spaces form an indent token),
    # 5-6 scan an ordinary token and any trailing spaces, 7 handles an f-string prefix,
    # 8 a double-quoted string, 9-15 single- and triple-quoted strings,
    # 16 follows a closing quote, and -1 is the reject (dead) state.
    DFA_table = {
        -1: {'any': -1},
        0: {' ': 1, 'any': 5, 'split': 17, 'f': 7, '"': 8, "'": 9},
        1: {' ': 2, 'f': 7, 'any': 5},
        2: {' ': 3, 'f': 7, 'any': 5},
        3: {' ': 4, 'f': 7, 'any': 5},
        4: {'any': 18},
        5: {' ': 6, 'any': 5, 'split': 17},
        6: {' ': 6, 'any': 18, 'split': 17},
        7: {'any': 5, '"': 8, "'": 9},
        8: {'"': 16, 'any': 8},
        9: {"'": 11, 'any': 10},
        10: {"'": 16, 'any': 10},
        11: {' ': 16, "'": 12, 'any': -1, 'split': 17},
        12: {"'": 13, 'any': 12},
        13: {"'": 14, 'any': -1},
        14: {"'": 15, 'any': -1},
        15: {' ': 16, 'split': 17, 'any': -1},
        16: {' ': 16, 'any': -1, 'split': 17, '"': 18, "'": 18},
        17: {'any': -1},  # final: consume the split character as a token
        18: {'any': -1},  # final: do not consume the current character
    }
    finals = (17, 18)
    tokens = []
    cursor = 0
    while cursor < len(buffer):
        state = 0
        temp = ''
        # Advance the DFA one character at a time until a final state or end of input.
        while cursor < len(buffer):
            ch = buffer[cursor]
            if ch in split:
                ch = 'split'
            if ch not in DFA_table[state]:
                ch = 'any'
            state = DFA_table[state][ch]
            if state not in finals:
                temp += buffer[cursor]
            else:
                break
            cursor += 1
        if state not in finals and state != 5:
            raise RuntimeError(f"Rejected at state {state}")
        if temp != '':
            # Keep whitespace-only tokens (e.g. a 4-space indent) intact;
            # strip surrounding spaces from everything else.
            tokens.append(temp if temp.isspace() else temp.strip())
        if state == finals[0]:
            # State 17: the split character itself becomes a token.
            tokens.append(buffer[cursor])
            cursor += 1
    return tokens

interface = gr.Interface(
    fn=tokenize,
    title="Tokenizer",
    description="Tokenize Python code",
    theme="compact",
    inputs=gr.TextArea(label="Python code", value="print('Hello World!!')"),
    outputs=gr.TextArea(label="Tokenized output"),
)
interface.launch()