# app.py — AraNizer tokenizer comparison Space by HeshamHaroon
# (Hugging Face Space file; commit 61a3b2f, 1.59 kB)
import gradio as gr
import aranizer
def load_tokenizers():
    """Instantiate every known AraNizer tokenizer variant.

    Returns:
        dict: variant name (e.g. "aranizer_bpe32k") -> tokenizer object.
        Variants whose factory cannot be resolved are skipped, with a
        message printed to the console.
    """
    variant_names = [
        "aranizer_bpe32k", "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
        "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
    ]
    loaded = {}
    for method in variant_names:
        # "aranizer_bpe32k" -> presumably submodule "aranizer" and factory
        # suffix "bpe32k" (i.e. aranizer.aranizer.get_tokenizer_bpe32k) —
        # TODO confirm against the installed aranizer package layout.
        module, function = method.rsplit("_", 1)
        try:
            factory = getattr(getattr(aranizer, module), f'get_tokenizer_{function}')
            loaded[method] = factory()
        except AttributeError as e:
            print(f"Error loading tokenizer {method}: {e}")
    return loaded
# Eagerly build the tokenizer registry at import time so the Gradio
# callback below can iterate the loaded variants by name.
tokenizers = load_tokenizers()
def compare_tokenizers(text):
    """Run *text* through every loaded tokenizer and collect one row each.

    Args:
        text: the input string to tokenize (expected to be Arabic).

    Returns:
        list of (name, tokens, encoded ids, decoded text) tuples; a
        tokenizer that raises contributes an error row instead of
        aborting the whole comparison.
    """
    rows = []
    for name, tok in tokenizers.items():
        try:
            token_list = tok.tokenize(text)
            ids = tok.encode(text, add_special_tokens=True)
            roundtrip = tok.decode(ids)
        except Exception as e:
            # Surface the failure in the results table rather than crashing.
            rows.append((name, f"Error: {e}", "", ""))
        else:
            rows.append((name, token_list, ids, roundtrip))
    return rows
# Build the UI with the current Gradio component API: the gr.inputs /
# gr.outputs namespaces were deprecated in Gradio 3.x and removed in 4.x,
# so the original gr.inputs.Textbox / gr.outputs.Table calls raise
# AttributeError on modern Gradio. gr.Dataframe is the tabular output
# component that renders the list-of-rows returned by compare_tokenizers.
inputs_component = gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
outputs_component = gr.Dataframe(
    label="Results",
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
)
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Tokenizer Comparison Tool",
)

# Guard the launch so importing this module (e.g. from tests) does not
# start the web server; Spaces execute app.py as __main__, so behavior
# there is unchanged.
if __name__ == "__main__":
    iface.launch()