Spaces:
Runtime error
Runtime error
Commit
•
11bd107
1
Parent(s):
7ed66d7
Update app.py
Browse files
app.py
CHANGED
@@ -3,18 +3,12 @@ import gradio as gr
|
|
3 |
import aranizer
|
4 |
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
|
5 |
|
6 |
-
# List of available tokenizers
|
7 |
-
tokenizer_options =
|
8 |
-
"aranizer_bpe50k"
|
9 |
-
"
|
10 |
-
|
11 |
-
"aranizer_sp32k": "SP 32k",
|
12 |
-
"aranizer_sp50k": "SP 50k",
|
13 |
-
"aranizer_sp64k": "SP 64k",
|
14 |
-
"aranizer_sp86k": "SP 86k",
|
15 |
-
}
|
16 |
|
17 |
-
# Mapping from names to tokenizer getters
|
18 |
tokenizers = {
|
19 |
"aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
|
20 |
"aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
|
@@ -26,21 +20,22 @@ tokenizers = {
|
|
26 |
}
|
27 |
|
28 |
def compare_tokenizers(tokenizer_name, text):
|
29 |
-
|
|
|
30 |
tokens = tokenizer.tokenize(text)
|
31 |
encoded_output = tokenizer.encode(text, add_special_tokens=True)
|
|
|
32 |
|
33 |
-
# Prepare the results to be displayed
|
34 |
-
results = [(tokenizer_name, tokens, encoded_output)]
|
35 |
return results
|
36 |
|
37 |
-
# Define Gradio interface components with a
|
38 |
inputs_component = [
|
39 |
-
gr.
|
40 |
gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
|
41 |
]
|
42 |
-
|
43 |
-
outputs_component = gr.Dataframe(headers=["Tokenizer", "Tokens", "Encoded Output"], label="Results")
|
44 |
|
45 |
# Setting up the interface
|
46 |
iface = Interface(fn=compare_tokenizers, inputs=inputs_component, outputs=outputs_component, title="AraNizer Tokenizer Comparison")
|
|
|
# --- Imports and the list of selectable tokenizers ---
import aranizer
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k

# Names offered in the UI dropdown; every entry must also be a key of the
# `tokenizers` mapping defined below so it can be instantiated on demand.
tokenizer_options = [
    "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
    "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
]
|
|
|
|
|
|
|
|
|
|
11 |
|
|
|
12 |
tokenizers = {
|
13 |
"aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
|
14 |
"aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
|
|
|
20 |
}
|
21 |
|
22 |
def compare_tokenizers(tokenizer_name, text):
    """Run one named AraNizer tokenizer over `text` and return a single
    result row for the output table.

    Args:
        tokenizer_name: key into the module-level `tokenizers` mapping.
        text: the (Arabic) input string to tokenize.

    Returns:
        A one-element list holding a (name, tokens, ids, decoded) tuple —
        one column per header of the Dataframe output component.
    """
    # Resolve the getter from the module-level mapping and instantiate
    # the selected tokenizer for this call.
    selected = tokenizers[tokenizer_name]()

    tokens = selected.tokenize(text)
    encoded_output = selected.encode(text, add_special_tokens=True)
    decoded_text = selected.decode(encoded_output)

    # Single-row result matching the four Dataframe columns.
    return [(tokenizer_name, tokens, encoded_output, decoded_text)]
|
32 |
|
33 |
# Gradio interface components: a dropdown for model selection plus a
# free-text input box.
inputs_component = [
    gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
    gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text"),
]

# NOTE(review): the previous revision passed `elem_height=500` and
# `elem_width='100%'` to gr.Dataframe — neither is a Dataframe constructor
# parameter, so construction raised TypeError at import time. The invalid
# kwargs are dropped rather than guessed at.
outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
    label="Results",
)

# Setting up the interface. Bug fix: `Interface` was referenced bare,
# but the module is imported as `import gradio as gr`, so the bare name
# raised NameError and crashed the Space at startup — it must be
# `gr.Interface`.
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="AraNizer Tokenizer Comparison",
)

# Launch only when executed as a script, so importing this module for
# tests or embedding does not start a server.
if __name__ == "__main__":
    iface.launch()