Spaces:
Runtime error
Runtime error
Fix bug
Browse files
app.py
CHANGED
@@ -1,67 +1,58 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
-
import plotly.express as px
|
4 |
-
from plotly.subplots import make_subplots
|
5 |
-
import plotly.graph_objects as go
|
6 |
import random
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
-
@st.
|
10 |
def load_data():
|
11 |
return pd.read_csv("dataset.csv")
|
12 |
|
13 |
|
14 |
-
def reload_example_text_data(
|
15 |
random_id = random.choice(val_data["id"])
|
16 |
tempdf = val_data[val_data["id"] == random_id]
|
17 |
-
tempdf = tempdf[tempdf["
|
18 |
-
tempdf.
|
19 |
-
tempdf.
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
st.session_state.examplesdf = tempdf
|
21 |
|
22 |
|
23 |
-
|
24 |
-
"openai/gpt4",
|
25 |
-
"Xenova/gpt-4o",
|
26 |
-
"Xenova/claude-tokenizer",
|
27 |
-
"CohereForAI/aya-101",
|
28 |
-
"meta-llama/Meta-Llama-3-70B",
|
29 |
-
"mistralai/Mixtral-8x22B-v0.1",
|
30 |
-
"google/gemma-7b",
|
31 |
-
"facebook/nllb-200-distilled-600M",
|
32 |
-
"xlm-roberta-base",
|
33 |
-
"bert-base-uncased",
|
34 |
-
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
35 |
-
"bigscience/bloom",
|
36 |
-
"StabilityAI/stablelm-base-alpha-7b",
|
37 |
-
"google/flan-t5-base",
|
38 |
-
"facebook/mbart-large-50",
|
39 |
-
"EleutherAI/gpt-neox-20b",
|
40 |
-
]
|
41 |
|
42 |
with st.sidebar:
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
selected_tokenizers = st.multiselect(
|
47 |
"Select tokenizers",
|
48 |
options=tokenizer_names_to_test,
|
49 |
-
default=["openai/gpt4", "Xenova/gpt-4o"
|
50 |
-
max_selections=6,
|
51 |
label_visibility="collapsed",
|
52 |
)
|
53 |
-
|
54 |
-
for tokenizer_name in selected_tokenizers:
|
55 |
-
if tokenizer_name == "openai/gpt4":
|
56 |
-
link = "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
|
57 |
-
else:
|
58 |
-
url = f"https://huggingface.co/{tokenizer_name}"
|
59 |
-
link = f"Tokenizer is available [on the HuggingFace hub]({url})"
|
60 |
-
st.markdown(link, unsafe_allow_html=True)
|
61 |
-
|
62 |
-
val_data = load_data()
|
63 |
-
st.success(f"Data loaded: {len(val_data)}")
|
64 |
-
|
65 |
language_options = sorted(val_data["lang"].unique())
|
66 |
selected_language = st.selectbox(
|
67 |
"Select language",
|
@@ -69,18 +60,20 @@ with st.sidebar:
|
|
69 |
index=language_options.index("English") if "English" in language_options else 0,
|
70 |
label_visibility="collapsed",
|
71 |
)
|
72 |
-
|
73 |
-
|
74 |
-
"Select figure type",
|
75 |
options=["Boxplot", "Histogram", "Scatterplot"],
|
76 |
-
index=0,
|
77 |
label_visibility="collapsed",
|
78 |
)
|
79 |
|
80 |
st.header("Example Text")
|
81 |
-
reload_example_text_data(selected_language)
|
82 |
st.table(st.session_state.examplesdf)
|
83 |
-
st.button(
|
|
|
|
|
|
|
|
|
84 |
|
85 |
tokenizer_to_num_tokens = {
|
86 |
name: val_data[name].tolist() for name in selected_tokenizers
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
|
|
|
|
|
|
3 |
import random
|
4 |
+
import plotly.graph_objects as go
|
5 |
+
from plotly.subplots import make_subplots
|
6 |
+
import plotly.express as px
|
7 |
|
8 |
|
9 |
+
@st.cache_data
def load_data():
    """Read ``dataset.csv`` from the working directory into a DataFrame.

    The function is wrapped in ``st.cache_data`` so Streamlit can reuse the
    parsed result between script reruns instead of re-reading the file.

    Returns:
        The contents of ``dataset.csv`` as parsed by ``pandas.read_csv``.
    """
    dataset = pd.read_csv("dataset.csv")
    return dataset
|
12 |
|
13 |
|
14 |
+
def reload_example_text_data(selected_language, selected_tokenizers):
    """Pick a random example and store its display table in session state.

    A random ``id`` is drawn from the module-level ``val_data`` frame. The
    rows with that id and ``lang == selected_language`` are reshaped into a
    table indexed by ``Language`` with the columns ``ISO``, ``Text`` and one
    ``Num Tokens (<tokenizer>)`` column per selected tokenizer, sorted by
    ``ISO``. The result is written to ``st.session_state.examplesdf``.

    Args:
        selected_language: Value matched against the ``lang`` column.
        selected_tokenizers: Names of the ``val_data`` columns holding the
            per-tokenizer token counts to display.

    Raises:
        KeyError: If a selected tokenizer is not a column of ``val_data``.
    """
    tokenizers = list(selected_tokenizers)

    # Draw by position so the pick does not depend on the frame's index
    # labels happening to be 0..n-1 (random.choice on a Series looks up by
    # label).
    random_id = val_data["id"].iloc[random.randrange(len(val_data))]

    row_mask = (val_data["id"] == random_id) & (
        val_data["lang"] == selected_language
    )

    display_names = {"lang": "Language", "iso": "ISO", "text": "Text"}
    display_names.update(
        {tokenizer: f"Num Tokens ({tokenizer})" for tokenizer in tokenizers}
    )

    # Build the table as one non-mutating pipeline: in-place edits on a
    # filtered frame trigger pandas' chained-assignment warnings.
    st.session_state.examplesdf = (
        val_data.loc[row_mask, ["lang", "iso", "text", *tokenizers]]
        .rename(columns=display_names)
        .set_index("Language")
        .sort_values(by="ISO")
    )
|
27 |
|
28 |
|
29 |
+
val_data = load_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
with st.sidebar:
|
32 |
+
tokenizer_names_to_test = [
|
33 |
+
"openai/gpt4",
|
34 |
+
"Xenova/gpt-4o",
|
35 |
+
"Xenova/claude-tokenizer",
|
36 |
+
"CohereForAI/aya-101",
|
37 |
+
"meta-llama/Meta-Llama-3-70B",
|
38 |
+
"mistralai/Mixtral-8x22B-v0.1",
|
39 |
+
"google/gemma-7b",
|
40 |
+
"facebook/nllb-200-distilled-600M",
|
41 |
+
"xlm-roberta-base",
|
42 |
+
"bert-base-uncased",
|
43 |
+
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
44 |
+
"bigscience/bloom",
|
45 |
+
"StabilityAI/stablelm-base-alpha-7b",
|
46 |
+
"google/flan-t5-base",
|
47 |
+
"facebook/mbart-large-50",
|
48 |
+
"EleutherAI/gpt-neox-20b",
|
49 |
+
]
|
50 |
selected_tokenizers = st.multiselect(
|
51 |
"Select tokenizers",
|
52 |
options=tokenizer_names_to_test,
|
53 |
+
default=["openai/gpt4", "Xenova/gpt-4o"],
|
|
|
54 |
label_visibility="collapsed",
|
55 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
language_options = sorted(val_data["lang"].unique())
|
57 |
selected_language = st.selectbox(
|
58 |
"Select language",
|
|
|
60 |
index=language_options.index("English") if "English" in language_options else 0,
|
61 |
label_visibility="collapsed",
|
62 |
)
|
63 |
+
selected_figure = st.selectbox(
|
64 |
+
"Select Plot Type",
|
|
|
65 |
options=["Boxplot", "Histogram", "Scatterplot"],
|
|
|
66 |
label_visibility="collapsed",
|
67 |
)
|
68 |
|
69 |
st.header("Example Text")
|
70 |
+
reload_example_text_data(selected_language, selected_tokenizers)
|
71 |
st.table(st.session_state.examplesdf)
|
72 |
+
st.button(
|
73 |
+
"Reload",
|
74 |
+
on_click=reload_example_text_data,
|
75 |
+
args=(selected_language, selected_tokenizers),
|
76 |
+
)
|
77 |
|
78 |
tokenizer_to_num_tokens = {
|
79 |
name: val_data[name].tolist() for name in selected_tokenizers
|