Spaces:
Runtime error
Runtime error
File size: 3,950 Bytes
5357bd8 61a3b2f bee3802 7760bbc 0a9a631 8d000e9 176f915 71c7fc4 0a9a631 ece3f89 7760bbc ece3f89 b9f9278 176f915 c296fab 61a3b2f 7760bbc b9f9278 c296fab 7760bbc 176f915 66b3df6 7760bbc ece3f89 b9f9278 c296fab 66b3df6 ab010ed 176f915 7ed66d7 71c7fc4 bd02afc 71c7fc4 bd02afc ece3f89 7760bbc 80ccea0 7760bbc 80ccea0 3567a04 bd02afc 6f9d03f 7760bbc ece3f89 9f4f9aa 7760bbc ece3f89 fcbfd45 7760bbc b9f9278 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
from gradio import Interface
import gradio as gr
import aranizer
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
from transformers import AutoTokenizer, logging
from huggingface_hub import login
import os
# Read the Hugging Face access token from the environment and authenticate
# with the Hub when one is provided.
HF_TOKEN = os.getenv('HF_TOKEN')
if HF_TOKEN:
    # Remove any leading or trailing whitespace/newlines.
    HF_TOKEN = HF_TOKEN.strip()
if HF_TOKEN:
    # Re-check after stripping: a whitespace-only value would otherwise be
    # passed to login() as an empty token.
    login(token=HF_TOKEN)
# Load additional tokenizers from transformers.
# These loads are not guarded: any failure here propagates and stops startup.
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# The gated tokenizer is optional: on failure it is set to None so the rest
# of the module can skip it.
try:
    meta_llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
except Exception as e:
    meta_llama_tokenizer = None
    # `logging` is the module imported from transformers, which provides
    # get_logger() rather than a module-level warning() function; calling
    # logging.warning() here would raise AttributeError inside the handler.
    logging.get_logger(__name__).warning(
        f"Could not load meta-llama/Meta-Llama-3-8B tokenizer: {e}"
    )

cohere_command_r_v01_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
cohere_command_r_plus_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")
# Registry of selectable tokenizers: each key is the name shown in the UI and
# each value is a zero-argument callable that returns the tokenizer object.
tokenizers = {
    "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
    "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
    "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
    "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
    "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
    "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
    "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
}

# Tokenizers that were already loaded at import time are wrapped in lambdas so
# every registry entry is called the same way.
tokenizers.update({
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
    "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer,
    "CohereForAI/c4ai-command-r-v01": lambda: cohere_command_r_v01_tokenizer,
    "CohereForAI/c4ai-command-r-plus": lambda: cohere_command_r_plus_tokenizer,
})

# The optional tokenizer is registered last, and only when it was loaded.
if meta_llama_tokenizer:
    tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer

# Names offered for selection, in registry insertion order.
tokenizer_options = list(tokenizers)
def compare_tokenizers(tokenizer_name, text):
    """Tokenize *text* with the chosen tokenizer and render the result as HTML.

    Args:
        tokenizer_name: Key into the module-level ``tokenizers`` registry.
            ``None`` or an empty string means nothing has been selected yet.
        text: The input text. ``None`` is treated as an empty string.

    Returns:
        An HTML fragment showing the tokenizer name, the token list, the
        encoded ids and the decoded text, or a short prompt when no
        tokenizer has been selected.

    Raises:
        KeyError: If ``tokenizer_name`` is non-empty but not in ``tokenizers``.
    """
    # Imported locally so the function does not depend on a new file-level import.
    from html import escape

    # The interface runs in live mode, so this callback can presumably fire
    # before a tokenizer is picked; answer with a hint instead of a KeyError.
    if not tokenizer_name:
        return "<div><p>Select a tokenizer to see the results.</p></div>"

    text = text or ""
    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True)
    decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)

    # Everything derived from user input is escaped so that typed markup is
    # displayed as text rather than interpreted by the HTML output component.
    safe_name = escape(str(tokenizer_name))
    safe_tokens = escape(str(tokens))
    safe_decoded = escape(str(decoded_text))

    # Prepare the results to be displayed in HTML format
    results_html = f"""
    <div>
        <h3>Tokenizer: {safe_name}</h3>
        <p><strong>Tokens:</strong> {safe_tokens}</p>
        <p><strong>Encoded:</strong> {encoded_output}</p>
        <p><strong>Decoded:</strong> {safe_decoded}</p>
    </div>
    """
    return results_html
# Input widgets: a dropdown to pick the tokenizer and a text box for the text
# to tokenize.
inputs_component = [
    gr.Dropdown(label="Select Tokenizer", choices=tokenizer_options),
    gr.Textbox(label="Input Text", placeholder="اكتب النص هنا...", lines=2),
]

# Output widget: the HTML fragment returned by compare_tokenizers.
outputs_component = gr.HTML(label="Results")

# Wire the callback to the widgets; live mode is enabled.
iface = Interface(
    title="Arabic Tokenizer Arena",
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    live=True,
)

# Start the Gradio app.
iface.launch()
|