MohamedRashad
committed on
Commit
·
6a52aab
1
Parent(s):
62d4a12
Refactor code and optimize dataframe sorting
Browse files
app.py
CHANGED
@@ -6,10 +6,6 @@ from datasets import load_dataset
|
|
6 |
import random
|
7 |
from pathlib import Path
|
8 |
|
9 |
-
# tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4o")
|
10 |
-
# token_ids = tokenizer.encode("السلام عليكم ورحمة الله")
|
11 |
-
# exit()
|
12 |
-
|
13 |
initial_list_of_models = [
|
14 |
"Xenova/gpt-4o",
|
15 |
"NousResearch/Meta-Llama-3-8B",
|
@@ -48,16 +44,12 @@ for model_name in tqdm(initial_list_of_models):
|
|
48 |
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
|
49 |
|
50 |
# Save the dataframe to a JSON Lines file
|
51 |
-
df.to_json(dataframe_path, lines=True, orient="records")
|
52 |
-
|
53 |
-
# Gradio Functions
|
54 |
-
def refresh():
|
55 |
-
global df
|
56 |
-
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
|
57 |
-
return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
|
58 |
|
59 |
def submit(model_name):
|
60 |
global df
|
|
|
|
|
61 |
tokenizer = AutoTokenizer.from_pretrained(
|
62 |
model_name, use_fast=True, trust_remote_code=True
|
63 |
)
|
@@ -72,6 +64,9 @@ def submit(model_name):
|
|
72 |
},
|
73 |
ignore_index=True,
|
74 |
)
|
|
|
|
|
|
|
75 |
|
76 |
def generate_distinct_colors(n):
|
77 |
"""Generate n visually distinct colors in hexadecimal format."""
|
@@ -196,9 +191,7 @@ with gr.Blocks() as demo:
|
|
196 |
model_name = gr.Textbox(
|
197 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
198 |
)
|
199 |
-
|
200 |
-
refresh_btn = gr.Button(value="Refresh")
|
201 |
-
submit_new_model_btn = gr.Button(value="Submit", variant="primary")
|
202 |
with gr.Tab(label="Try tokenizers"):
|
203 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
204 |
dropdown = gr.Dropdown(
|
@@ -209,8 +202,7 @@ with gr.Blocks() as demo:
|
|
209 |
submit_text_btn = gr.Button(value="Submit", variant="primary")
|
210 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
211 |
|
212 |
-
submit_new_model_btn.click(submit, model_name)
|
213 |
-
refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
|
214 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])
|
215 |
|
216 |
|
|
|
6 |
import random
|
7 |
from pathlib import Path
|
8 |
|
|
|
|
|
|
|
|
|
9 |
initial_list_of_models = [
|
10 |
"Xenova/gpt-4o",
|
11 |
"NousResearch/Meta-Llama-3-8B",
|
|
|
44 |
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
|
45 |
|
46 |
# Save the dataframe to a JSON Lines file
|
47 |
+
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def submit(model_name):
|
50 |
global df
|
51 |
+
if model_name in df["📛 Models"].values:
|
52 |
+
return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
|
53 |
tokenizer = AutoTokenizer.from_pretrained(
|
54 |
model_name, use_fast=True, trust_remote_code=True
|
55 |
)
|
|
|
64 |
},
|
65 |
ignore_index=True,
|
66 |
)
|
67 |
+
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
|
68 |
+
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
|
69 |
+
return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
|
70 |
|
71 |
def generate_distinct_colors(n):
|
72 |
"""Generate n visually distinct colors in hexadecimal format."""
|
|
|
191 |
model_name = gr.Textbox(
|
192 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
193 |
)
|
194 |
+
submit_new_model_btn = gr.Button(value="Submit", variant="primary")
|
|
|
|
|
195 |
with gr.Tab(label="Try tokenizers"):
|
196 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
197 |
dropdown = gr.Dropdown(
|
|
|
202 |
submit_text_btn = gr.Button(value="Submit", variant="primary")
|
203 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
204 |
|
205 |
+
submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
|
|
|
206 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])
|
207 |
|
208 |
|