import gradio as gr
from transformers import AutoTokenizer


def tokenize_text(text, tokenizer_name):
    """Tokenize `text` with the selected tokenizer and report each step of the round trip."""
    # Load the tokenizer for the chosen checkpoint (downloaded from the Hub on first use).
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Split the raw text into subword tokens.
    tokenized_text = tokenizer.tokenize(text)
    # Map each token to its integer ID in the tokenizer's vocabulary.
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Decode the IDs back to a string to verify the round trip.
    decoded_text = tokenizer.decode(input_ids)
    return f"Tokenized Text: {tokenized_text}\nInput IDs: {input_ids}\nDecoded Text: {decoded_text}"
# Hugging Face checkpoints whose tokenizers can be selected in the demo.
tokenizer_names = [
    "riotu-lab/ArabianGPT-01B",
    "riotu-lab/ArabianGPT-03B",
    "riotu-lab/ArabianGPT-08B",
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    "inception-mbzuai/jais-13b",
    "aubmindlab/aragpt2-base",
    "aubmindlab/aragpt2-medium",
    "aubmindlab/aragpt2-large",
    "aubmindlab/aragpt2-mega",
]
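# Example usage (requires network access to the Hub on first load); the sample
# Arabic text here is illustrative, not from the original script:
#
#     tokenize_text("مرحبا بالعالم", "aubmindlab/aragpt2-base")
#
# returns a single string listing the tokens, their input IDs, and the decoded text.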
# Wire the tokenizer function to a simple web UI: a text box and a dropdown of
# tokenizers in, a plain-text report out.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
    ],
    outputs="text",
    title="Kalemat: Explore Arabic Tokenizers",
    description=(
        "Experiment with different Arabic tokenizers and see how they break text "
        "into subword units. Select a tokenizer to inspect the resulting tokens, "
        "their input IDs, and the decoded round-trip text."
    ),
)
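# launch() starts a local web server for the demo; passing share=True would
# additionally create a temporary public Gradio link.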
iface.launch()