Spaces:
Runtime error
Runtime error
File size: 4,691 Bytes
5357bd8 61a3b2f bee3802 7760bbc 0a9a631 176f915 0a9a631 ece3f89 7760bbc ece3f89 b9f9278 176f915 c296fab 61a3b2f 7760bbc b9f9278 c296fab 7760bbc 176f915 66b3df6 7760bbc ece3f89 b9f9278 c296fab 66b3df6 ab010ed 176f915 7ed66d7 7760bbc c296fab 7760bbc 31687bc 7760bbc 80ccea0 bd02afc c296fab bd02afc ece3f89 7760bbc 80ccea0 7760bbc 80ccea0 3567a04 bd02afc 6f9d03f 7760bbc ece3f89 9f4f9aa 7760bbc ece3f89 fcbfd45 7760bbc b9f9278 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
from gradio import Interface
import gradio as gr
import aranizer
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
from transformers import AutoTokenizer, logging
from huggingface_hub import HfApi, login
import os
# Read the Hugging Face access token from the environment; authentication is
# skipped entirely when the variable is unset or empty.
HF_TOKEN = os.getenv('HF_TOKEN')
if HF_TOKEN:
    # HfApi.set_access_token() is not available in current huggingface_hub
    # releases; the token is handed to the client constructor instead.
    api = HfApi(token=HF_TOKEN)
    # Register the token for this process so later Hub downloads can use it.
    login(token=HF_TOKEN)
# Load the additional tokenizers through transformers. These calls run at
# import time; a failure in any of the non-gated loads aborts start-up.
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
# The Meta-Llama-3 tokenizer is gated, so loading is best-effort: on any
# failure the name is bound to None and the app carries on without it.
try:
    meta_llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
except Exception as e:
    meta_llama_tokenizer = None
    # `logging` is transformers' logging module, which exposes no module-level
    # warning(); emit the message through a logger obtained from it.
    logging.get_logger(__name__).warning(
        f"Could not load meta-llama/Meta-Llama-3-8B tokenizer: {e}"
    )
cohere_command_r_v01_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
cohere_command_r_plus_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")
# Registry of available tokenizers: maps the name shown in the UI to a
# zero-argument callable that returns the tokenizer object.
tokenizers = {
    "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
    "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
    "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
    "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
    "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
    "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
    "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
    "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer,
    "CohereForAI/c4ai-command-r-v01": lambda: cohere_command_r_v01_tokenizer,
    "CohereForAI/c4ai-command-r-plus": lambda: cohere_command_r_plus_tokenizer,
}
# The gated tokenizer is registered only when it was actually loaded. The
# explicit None comparison avoids a truthiness test on the tokenizer object.
if meta_llama_tokenizer is not None:
    tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer

# Names offered in the dropdown. Derived from the registry so the two cannot
# drift apart; dict insertion order keeps the same ordering as the former
# hand-written list (optional entry last).
tokenizer_options = list(tokenizers)
# Registry names that are Hugging Face `transformers` tokenizers; every other
# registry entry is treated as an AraNizer tokenizer.
_TRANSFORMERS_TOKENIZER_NAMES = frozenset({
    "FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B",
    "inception-mbzuai/jais-13b", "aubmindlab/bert-base-arabertv2",
    "meta-llama/Meta-Llama-3-8B", "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
})


def compare_tokenizers(tokenizer_name, text):
    """Tokenize *text* with the selected tokenizer and render the result as HTML.

    Args:
        tokenizer_name: Key into the module-level ``tokenizers`` registry.
            A falsy value (nothing selected yet) yields a short prompt
            instead of an error.
        text: The input string; ``None`` is treated as an empty string.

    Returns:
        An HTML fragment showing the tokenizer name, the token list, the
        encoded ids and the decoded text. All dynamic values are
        HTML-escaped.

    Raises:
        KeyError: If *tokenizer_name* is non-empty but not in ``tokenizers``.
    """
    import html  # local import keeps this block self-contained

    # The function can be invoked before a tokenizer has been picked.
    if not tokenizer_name:
        return "<div><p>Please select a tokenizer.</p></div>"
    if text is None:
        text = ""

    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True)
    # Only the transformers tokenizers are asked to drop special tokens when
    # decoding; the AraNizer tokenizers are called with the ids alone.
    if tokenizer_name in _TRANSFORMERS_TOKENIZER_NAMES:
        decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
    else:
        decoded_text = tokenizer.decode(encoded_output)

    # Normalize byte tokens to text (bytes must be decoded, not encoded).
    tokens_arabic = [
        token.decode('utf-8', errors='replace') if isinstance(token, bytes) else token
        for token in tokens
    ]

    def _esc(value):
        # Values end up in element content, so quotes need no escaping.
        return html.escape(str(value), quote=False)

    # Prepare the results to be displayed in HTML format
    results_html = f"""
    <div>
    <h3>Tokenizer: {_esc(tokenizer_name)}</h3>
    <p><strong>Tokens:</strong> {_esc(tokens_arabic)}</p>
    <p><strong>Encoded:</strong> {_esc(encoded_output)}</p>
    <p><strong>Decoded:</strong> {_esc(decoded_text)}</p>
    </div>
    """
    return results_html
# UI widgets: a dropdown listing the registered tokenizer names and a two-line
# text box for the input, plus an HTML pane for the rendered result.
inputs_component = [
    gr.Dropdown(label="Select Tokenizer", choices=tokenizer_options),
    gr.Textbox(label="Input Text", placeholder="اكتب النص هنا...", lines=2),
]
outputs_component = gr.HTML(label="Results")
# Wire compare_tokenizers to the widgets defined above (live mode enabled)
# and start the Gradio app.
iface = gr.Interface(
    title="Arabic Tokenizer Arena",
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    live=True,
)
iface.launch()
|