Spaces:
Runtime error
Runtime error
File size: 3,476 Bytes
5357bd8 61a3b2f bee3802 7760bbc ece3f89 7760bbc ece3f89 b9f9278 61a3b2f 7760bbc b9f9278 7760bbc 66b3df6 7760bbc ece3f89 b9f9278 66b3df6 ab010ed 7ed66d7 7760bbc b9f9278 7760bbc 4c91389 31687bc 7760bbc 3b0ce68 80ccea0 bd02afc ece3f89 7760bbc 80ccea0 7760bbc 80ccea0 3567a04 bd02afc 6f9d03f 7760bbc ece3f89 9f4f9aa 7760bbc ece3f89 fcbfd45 7760bbc b9f9278 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
from gradio import Interface
import gradio as gr
import aranizer
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
from transformers import AutoTokenizer
# Tokenizers pulled from Hugging Face checkpoints via transformers.
# They are loaded once, at import time, in the order the checkpoints are
# listed; the target names on the left line up one-to-one with that order.
(
    gpt_13b_tokenizer,
    gpt_7b_tokenizer,
    jais_13b_tokenizer,
    arabert_tokenizer,
) = [
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (
        "FreedomIntelligence/AceGPT-13B",
        "FreedomIntelligence/AceGPT-7B",
        "inception-mbzuai/jais-13b",
        "aubmindlab/bert-base-arabertv2",
    )
]
# Registry: dropdown label -> zero-argument callable returning a tokenizer.
# AraNizer entries point at each module's own ``get_tokenizer`` factory.
tokenizers = dict(
    aranizer_bpe50k=aranizer_bpe50k.get_tokenizer,
    aranizer_bpe64k=aranizer_bpe64k.get_tokenizer,
    aranizer_bpe86k=aranizer_bpe86k.get_tokenizer,
    aranizer_sp32k=aranizer_sp32k.get_tokenizer,
    aranizer_sp50k=aranizer_sp50k.get_tokenizer,
    aranizer_sp64k=aranizer_sp64k.get_tokenizer,
    aranizer_sp86k=aranizer_sp86k.get_tokenizer,
)
# The transformers tokenizers already exist as module-level objects, so they
# are wrapped in lambdas to give every registry entry the same call shape.
tokenizers.update(
    {
        "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
        "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
        "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
        "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer,
    }
)
# Names offered in the UI, in registry (insertion) order.
tokenizer_options = list(tokenizers)
# Registry keys whose tokenizers come from transformers; only these are
# decoded with ``skip_special_tokens=True``.
_TRANSFORMERS_TOKENIZER_NAMES = frozenset(
    {
        "FreedomIntelligence/AceGPT-13B",
        "FreedomIntelligence/AceGPT-7B",
        "inception-mbzuai/jais-13b",
        "aubmindlab/bert-base-arabertv2",
    }
)


def compare_tokenizers(tokenizer_name, text):
    """Tokenize *text* with the chosen tokenizer and render the result as HTML.

    Args:
        tokenizer_name: Key into the module-level ``tokenizers`` registry.
            ``None`` or an empty string (no tokenizer chosen yet) yields a
            short prompt instead of raising.
        text: Text to tokenize. ``None`` is treated as an empty string.

    Returns:
        An HTML fragment listing the tokenizer name, the tokens, the encoded
        ids and the decoded text. Every interpolated value is HTML-escaped,
        because the fragment is shown through an HTML output component and
        both the input text and the tokens derive from user input.

    Raises:
        KeyError: If ``tokenizer_name`` is non-empty but not in ``tokenizers``.
    """
    import html  # function-scope import keeps this block self-contained

    # Guard before touching the registry: an unselected dropdown would
    # otherwise surface as a KeyError on ``tokenizers[None]``.
    if not tokenizer_name:
        return "<div><p>Please select a tokenizer.</p></div>"

    text = text or ""
    tokenizer = tokenizers[tokenizer_name]()

    # The former per-token ``encode('utf-8').decode('utf-8')`` round trip
    # returned each string unchanged, so the tokens are used directly.
    tokens = list(tokenizer.tokenize(text))
    encoded_output = tokenizer.encode(text, add_special_tokens=True)

    # Only the transformers tokenizers are asked to drop special tokens; the
    # AraNizer ones are decoded with their default arguments, as before.
    if tokenizer_name in _TRANSFORMERS_TOKENIZER_NAMES:
        decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
    else:
        decoded_text = tokenizer.decode(encoded_output)

    safe_name = html.escape(str(tokenizer_name))
    safe_tokens = html.escape(str(tokens))
    safe_encoded = html.escape(str(encoded_output))
    safe_decoded = html.escape(str(decoded_text))

    return (
        "\n<div>\n"
        f"<h3>Tokenizer: {safe_name}</h3>\n"
        f"<p><strong>Tokens:</strong> {safe_tokens}</p>\n"
        f"<p><strong>Encoded:</strong> {safe_encoded}</p>\n"
        f"<p><strong>Decoded:</strong> {safe_decoded}</p>\n"
        "</div>\n"
    )
# Input widgets: a tokenizer picker plus a two-line text box.
tokenizer_dropdown = gr.Dropdown(label="Select Tokenizer", choices=tokenizer_options)
text_input = gr.Textbox(label="Input Text", placeholder="اكتب النص هنا...", lines=2)
inputs_component = [tokenizer_dropdown, text_input]

# Output widget: the HTML fragment returned by compare_tokenizers.
outputs_component = gr.HTML(label="Results")

# Assemble the interface; live=True is passed so no explicit submit is needed.
iface = Interface(
    title="Arabic Tokenizer Arena",
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    live=True,
)

# Start the Gradio app.
iface.launch()
|