Spaces:
Runtime error
Runtime error
Make the code use precalculated tokens from the dataset
Browse files
app.py
CHANGED
@@ -1,20 +1,12 @@
|
|
1 |
import streamlit as st
|
2 |
-
from collections import defaultdict
|
3 |
-
import tqdm
|
4 |
-
import transformers
|
5 |
-
from transformers import AutoTokenizer
|
6 |
import pandas as pd
|
7 |
-
import matplotlib.pyplot as plt
|
8 |
-
import seaborn as sns
|
9 |
-
import numpy as np
|
10 |
-
import plotly.figure_factory as ff
|
11 |
import plotly.express as px
|
12 |
from plotly.subplots import make_subplots
|
13 |
import plotly.graph_objects as go
|
14 |
-
import random
|
15 |
|
16 |
|
17 |
-
@st.
|
18 |
def load_data():
|
19 |
return pd.read_csv("dataset.csv")
|
20 |
|
@@ -22,7 +14,6 @@ def load_data():
|
|
22 |
def reload_example_text_data(language):
|
23 |
random_id = random.choice(val_data["id"])
|
24 |
tempdf = val_data[val_data["id"] == random_id]
|
25 |
-
tempdf = tempdf[["iso", "text", *selected_tokenizers]]
|
26 |
tempdf = tempdf[tempdf["iso"] == language]
|
27 |
tempdf.set_index("iso", inplace=True)
|
28 |
tempdf.columns = ["Text"] + [f"Num Tokens ({t})" for t in selected_tokenizers]
|
@@ -50,11 +41,8 @@ tokenizer_names_to_test = [
|
|
50 |
|
51 |
with st.sidebar:
|
52 |
st.header("Comparing Tokenizers")
|
53 |
-
link = "This project compares the tokenization length for different tokenizers.
|
54 |
st.markdown(link)
|
55 |
-
|
56 |
-
st.header("Data Visualization")
|
57 |
-
st.subheader("Tokenizers")
|
58 |
selected_tokenizers = st.multiselect(
|
59 |
"Select tokenizers",
|
60 |
options=tokenizer_names_to_test,
|
@@ -63,29 +51,25 @@ with st.sidebar:
|
|
63 |
label_visibility="collapsed",
|
64 |
)
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
"The data in this figure is the validation set of the [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)"
|
74 |
-
)
|
75 |
|
76 |
-
|
77 |
-
language_options = sorted(val_data.lang.unique())
|
78 |
-
default_language_index = (
|
79 |
-
language_options.index("English") if "English" in language_options else 0
|
80 |
-
)
|
81 |
selected_language = st.selectbox(
|
82 |
"Select language",
|
83 |
options=language_options,
|
84 |
-
index=
|
85 |
label_visibility="collapsed",
|
86 |
)
|
87 |
|
88 |
-
st.subheader("Figure")
|
89 |
selected_figure = st.radio(
|
90 |
"Select figure type",
|
91 |
options=["Boxplot", "Histogram", "Scatterplot"],
|
@@ -94,18 +78,13 @@ with st.sidebar:
|
|
94 |
)
|
95 |
|
96 |
st.header("Example Text")
|
97 |
-
|
98 |
-
reload_example_text_data(selected_language)
|
99 |
st.table(st.session_state.examplesdf)
|
100 |
st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
|
101 |
|
102 |
-
tokenizer_to_num_tokens =
|
103 |
-
|
104 |
-
|
105 |
-
for tokenizer_name in selected_tokenizers:
|
106 |
-
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
|
107 |
-
num_tokens = len(tokenizer(text)["input_ids"])
|
108 |
-
tokenizer_to_num_tokens[tokenizer_name].append(num_tokens)
|
109 |
|
110 |
if selected_figure == "Boxplot":
|
111 |
fig = go.Figure()
|
@@ -113,11 +92,7 @@ if selected_figure == "Boxplot":
|
|
113 |
fig.add_trace(
|
114 |
go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
|
115 |
)
|
116 |
-
fig.update_layout(
|
117 |
-
title=f"Distribution of Number of Tokens for Selected Tokenizers",
|
118 |
-
xaxis_title="Tokenizer",
|
119 |
-
yaxis_title="Number of Tokens",
|
120 |
-
)
|
121 |
st.plotly_chart(fig)
|
122 |
elif selected_figure == "Histogram":
|
123 |
fig = make_subplots(
|
@@ -138,14 +113,8 @@ elif selected_figure == "Histogram":
|
|
138 |
st.plotly_chart(fig)
|
139 |
elif selected_figure == "Scatterplot":
|
140 |
df = pd.DataFrame(tokenizer_to_num_tokens)
|
141 |
-
fig = px.scatter_matrix(
|
142 |
-
df,
|
143 |
-
dimensions=selected_tokenizers,
|
144 |
-
color_discrete_sequence=px.colors.qualitative.Plotly,
|
145 |
-
)
|
146 |
fig.update_layout(
|
147 |
-
title=
|
148 |
-
width=800,
|
149 |
-
height=800,
|
150 |
)
|
151 |
st.plotly_chart(fig)
|
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
3 |
import plotly.express as px
|
4 |
from plotly.subplots import make_subplots
|
5 |
import plotly.graph_objects as go
|
6 |
+
import random
|
7 |
|
8 |
|
9 |
+
@st.cache
def load_data():
    """Read the precomputed per-tokenizer token counts from dataset.csv.

    The result is memoized by Streamlit so the CSV is parsed only once
    per session, not on every widget-triggered rerun.
    """
    dataset = pd.read_csv("dataset.csv")
    return dataset
|
12 |
|
|
|
14 |
def reload_example_text_data(language):
|
15 |
random_id = random.choice(val_data["id"])
|
16 |
tempdf = val_data[val_data["id"] == random_id]
|
|
|
17 |
tempdf = tempdf[tempdf["iso"] == language]
|
18 |
tempdf.set_index("iso", inplace=True)
|
19 |
tempdf.columns = ["Text"] + [f"Num Tokens ({t})" for t in selected_tokenizers]
|
|
|
41 |
|
42 |
with st.sidebar:
|
43 |
st.header("Comparing Tokenizers")
|
44 |
+
link = "This project compares the tokenization length for different tokenizers."
|
45 |
st.markdown(link)
|
|
|
|
|
|
|
46 |
selected_tokenizers = st.multiselect(
|
47 |
"Select tokenizers",
|
48 |
options=tokenizer_names_to_test,
|
|
|
51 |
label_visibility="collapsed",
|
52 |
)
|
53 |
|
54 |
+
for tokenizer_name in selected_tokenizers:
    # Show where each selected tokenizer comes from: GPT-4 is tokenized
    # via OpenAI's tiktoken; every other entry is a HuggingFace hub model.
    if tokenizer_name != "openai/gpt4":
        url = f"https://huggingface.co/{tokenizer_name}"
        link = f"Tokenizer is available [on the HuggingFace hub]({url})"
    else:
        link = "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
    st.markdown(link, unsafe_allow_html=True)
|
61 |
|
62 |
+
val_data = load_data()
|
63 |
+
st.success(f"Data loaded: {len(val_data)}")
|
|
|
|
|
64 |
|
65 |
+
language_options = sorted(val_data["lang"].unique())
|
|
|
|
|
|
|
|
|
66 |
selected_language = st.selectbox(
|
67 |
"Select language",
|
68 |
options=language_options,
|
69 |
+
index=language_options.index("English") if "English" in language_options else 0,
|
70 |
label_visibility="collapsed",
|
71 |
)
|
72 |
|
|
|
73 |
selected_figure = st.radio(
|
74 |
"Select figure type",
|
75 |
options=["Boxplot", "Histogram", "Scatterplot"],
|
|
|
78 |
)
|
79 |
|
80 |
st.header("Example Text")
|
81 |
+
reload_example_text_data(selected_language)
|
|
|
82 |
st.table(st.session_state.examplesdf)
|
83 |
st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
|
84 |
|
85 |
+
tokenizer_to_num_tokens = {
|
86 |
+
name: val_data[name].tolist() for name in selected_tokenizers
|
87 |
+
}
|
|
|
|
|
|
|
|
|
88 |
|
89 |
if selected_figure == "Boxplot":
|
90 |
fig = go.Figure()
|
|
|
92 |
fig.add_trace(
|
93 |
go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
|
94 |
)
|
95 |
+
fig.update_layout(title="Distribution of Number of Tokens for Selected Tokenizers")
|
|
|
|
|
|
|
|
|
96 |
st.plotly_chart(fig)
|
97 |
elif selected_figure == "Histogram":
|
98 |
fig = make_subplots(
|
|
|
113 |
st.plotly_chart(fig)
|
114 |
elif selected_figure == "Scatterplot":
|
115 |
df = pd.DataFrame(tokenizer_to_num_tokens)
|
116 |
+
fig = px.scatter_matrix(df, dimensions=selected_tokenizers)
|
|
|
|
|
|
|
|
|
117 |
fig.update_layout(
|
118 |
+
title="Scatterplot Matrix of Number of Tokens for Selected Tokenizers"
|
|
|
|
|
119 |
)
|
120 |
st.plotly_chart(fig)
|