import gradio as gr
import os
from tokenizer_bpe import BasicTokenizer
# Load the tokenizer once, at import time, so every UI request reuses it.
print("Loading the model...")

# The model file is looked up relative to the current working directory:
#   <cwd>/tokenizer_model/hindi_sentiments.model
model_path = os.path.join(
    os.getcwd(),
    "tokenizer_model",
    "hindi_sentiments.model",
)

basic_tokenizer = BasicTokenizer()
basic_tokenizer.load(model_path)
def test_tokenizer(text):
    """Round-trip *text* through the module-level ``basic_tokenizer``.

    Returns a 3-tuple of:
      * the value produced by ``basic_tokenizer.encode(text)``,
      * the string obtained by decoding that whole sequence in one call,
      * a list of ``(str(token), decoded_piece)`` pairs, one per token,
        where each token is decoded on its own.
    """
    token_ids = basic_tokenizer.encode(text)
    round_trip = basic_tokenizer.decode(token_ids)

    # Decode every token in isolation so the UI can show which piece of
    # text each individual token stands for.
    pairs = []
    for token_id in token_ids:
        piece = basic_tokenizer.decode([token_id])
        pairs.append((str(token_id), piece))

    return token_ids, round_trip, pairs
# Build the Gradio UI: a title, an input column (text area + button) and an
# output column (three text boxes), wired to test_tokenizer on button click.
with gr.Blocks() as demo:
    # The title text spans several lines, so it must be a triple-quoted
    # literal; a single-quoted string cannot contain raw newlines.
    # NOTE(review): any HTML markup that originally wrapped this title is not
    # visible here — confirm whether a heading tag is wanted.
    gr.HTML(
        """
Token Generation for Hindi Dataset
"""
    )
    with gr.Row():
        with gr.Column():
            # Kept as a one-element list because it is passed straight to
            # the `inputs=` argument of the click handler below.
            inputs = [
                gr.TextArea(
                    label="Enter text to generate tokens in Hindi", lines=10
                )
            ]
            generate_btn = gr.Button(value="Generate Text")
        with gr.Column():
            enc = gr.Textbox(label="Encoded Tokens")
            txt = gr.Textbox(label="Decoded Text from tokens")
            # Named `mapping_box` rather than `map` so the builtin map() is
            # not shadowed at module scope.
            mapping_box = gr.Textbox(
                label="Mapping of the tokens and respective texts"
            )
            # Order matches the 3-tuple returned by test_tokenizer.
            outputs = [enc, txt, mapping_box]
    # Registered inside the Blocks context so the event is attached to `demo`.
    generate_btn.click(fn=test_tokenizer, inputs=inputs, outputs=outputs)
# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    # share=True asks Gradio for a public share link in addition to the
    # local server.
    demo.launch(share=True)