|
import gradio as gr |
|
|
|
# Characters that always delimit tokens.  Module-level constants so the
# tables are built once, not on every call to tokenize().
_SPLIT_CHARS = {'(', ')', '{', '}', '[', ']', ',', ':', '+', '-', '*', '/', '%', '=', '\n'}

# DFA transition table.  Keys are states; each value maps an input class
# (a literal character, 'split' for any character in _SPLIT_CHARS, or
# 'any' as the catch-all fallback) to the next state.
# State sketch (inferred from the transitions — verify against author intent):
#   0      start of a token
#   1-4    run of leading spaces (indentation tracking)
#   5      inside an ordinary identifier/number token
#   6      space(s) seen after a token (token may end here)
#   7      saw 'f' (possible f-string prefix)
#   8      inside a double-quoted string
#   9-15   single-quoted / triple-quoted string handling
#   16     string literal just closed, waiting for a delimiter
#   17     accepting: a split character terminates the token
#   18     accepting: a new token starts at the current char (not consumed)
#   -1     dead/reject state
_DFA_TABLE = {
    -1: {'any': -1},
    0: {' ': 1, 'any': 5, 'split': 17, 'f': 7, '"': 8, "'": 9},
    1: {' ': 2, 'f': 7, 'any': 5},
    2: {' ': 3, 'f': 7, 'any': 5},
    3: {' ': 4, 'f': 7, 'any': 5},
    4: {'any': 18},
    5: {' ': 6, 'any': 5, 'split': 17},
    6: {' ': 6, 'any': 18, 'split': 17},
    7: {'any': 5, '"': 8, "'": 9},
    8: {'"': 16, 'any': 8},
    9: {"'": 11, 'any': 10},
    10: {"'": 16, 'any': 10},
    11: {' ': 16, "'": 12, 'any': -1, 'split': 17},
    12: {"'": 13, 'any': 12},
    13: {"'": 14, 'any': -1},
    14: {"'": 15, 'any': -1},
    15: {' ': 16, 'split': 17, 'any': -1},
    16: {' ': 16, 'any': -1, 'split': 17, '"': 18, "'": 18},
    17: {'any': -1},
    18: {'any': -1},
}

# Accepting states: _SPLIT_STATE means "a split character ends the token
# and becomes its own token"; _RESTART_STATE means "the current character
# begins the next token — do not consume it".
_SPLIT_STATE = 17
_RESTART_STATE = 18
_FINAL_STATES = (_SPLIT_STATE, _RESTART_STATE)

# State 5 is the "plain token in progress" state; reaching end of input
# there is a legal way for the last token to end.
_PLAIN_TOKEN_STATE = 5


def tokenize(buffer: str):
    """Split *buffer* (Python-like source text) into a list of tokens.

    Tokens are identifiers/numbers, string literals (including f-string
    and triple-quoted forms) and the individual delimiter characters in
    ``_SPLIT_CHARS`` (each emitted as its own token, newline included).

    Raises:
        RuntimeError: when the DFA rejects the input, e.g. an
            unterminated construct at end of input.
    """
    tokens = []
    cursor = 0
    while cursor < len(buffer):
        state = 0
        temp = ''
        # Drive the DFA until it reaches an accepting state or input ends.
        while cursor < len(buffer):
            ch = buffer[cursor]
            if ch in _SPLIT_CHARS:
                ch = 'split'
            if ch not in _DFA_TABLE[state]:
                ch = 'any'
            state = _DFA_TABLE[state][ch]
            if state in _FINAL_STATES:
                # Accepting: leave cursor on the char that triggered it.
                break
            temp += buffer[cursor]
            cursor += 1

        # Outside the accepting states, only "plain token cut off by EOF"
        # is legal; anything else is a reject.
        if state not in _FINAL_STATES and state != _PLAIN_TOKEN_STATE:
            raise RuntimeError(f"Rejected at state {state}")
        if temp != '':
            # A lone space is kept verbatim; everything else is stripped.
            tokens.append(temp.strip() if temp != ' ' else temp)
        if state == _SPLIT_STATE:
            # The split character is itself a token; consume it.  In the
            # _RESTART_STATE case the cursor deliberately stays put so the
            # next outer iteration re-reads the same character.
            tokens.append(buffer[cursor])
            cursor += 1
    return tokens
|
|
|
|
|
|
|
|
|
|
|
# Gradio front-end: one text area in, tokens out.  Note tokenize()
# returns a list; Gradio renders its string form in the output box.
interface = gr.Interface(
    fn=tokenize,
    title="Tokenizer",
    description="Tokenize the python code",
    theme="compact",
    inputs=gr.TextArea(label="Python code", value="print('Hello World!!')"),
    outputs=gr.TextArea(label="Tokenize output"),
)

# Guard the launch so importing this module (e.g. from tests or another
# script) does not start the web server as a side effect.
if __name__ == "__main__":
    interface.launch()