5w4n committed on
Commit
34eb18c
·
1 Parent(s): 116574f

Make the code use precalculated tokens from the dataset

Browse files
Files changed (1) hide show
  1. app.py +21 -52
app.py CHANGED
@@ -1,20 +1,12 @@
1
  import streamlit as st
2
- from collections import defaultdict
3
- import tqdm
4
- import transformers
5
- from transformers import AutoTokenizer
6
  import pandas as pd
7
- import matplotlib.pyplot as plt
8
- import seaborn as sns
9
- import numpy as np
10
- import plotly.figure_factory as ff
11
  import plotly.express as px
12
  from plotly.subplots import make_subplots
13
  import plotly.graph_objects as go
14
- import random, glob
15
 
16
 
17
- @st.cache_data
18
  def load_data():
19
  return pd.read_csv("dataset.csv")
20
 
@@ -22,7 +14,6 @@ def load_data():
22
  def reload_example_text_data(language):
23
  random_id = random.choice(val_data["id"])
24
  tempdf = val_data[val_data["id"] == random_id]
25
- tempdf = tempdf[["iso", "text", *selected_tokenizers]]
26
  tempdf = tempdf[tempdf["iso"] == language]
27
  tempdf.set_index("iso", inplace=True)
28
  tempdf.columns = ["Text"] + [f"Num Tokens ({t})" for t in selected_tokenizers]
@@ -50,11 +41,8 @@ tokenizer_names_to_test = [
50
 
51
  with st.sidebar:
52
  st.header("Comparing Tokenizers")
53
- link = "This project compares the tokenization length for different tokenizers. Some tokenizers may result in significantly more tokens than others for the same text."
54
  st.markdown(link)
55
-
56
- st.header("Data Visualization")
57
- st.subheader("Tokenizers")
58
  selected_tokenizers = st.multiselect(
59
  "Select tokenizers",
60
  options=tokenizer_names_to_test,
@@ -63,29 +51,25 @@ with st.sidebar:
63
  label_visibility="collapsed",
64
  )
65
 
66
- st.subheader("Data")
67
- with st.spinner("Loading dataset..."):
68
- val_data = load_data()
69
- st.success(f"Data loaded: {len(val_data)}")
 
 
 
70
 
71
- with st.expander("Data Source"):
72
- st.write(
73
- "The data in this figure is the validation set of the [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)"
74
- )
75
 
76
- st.subheader("Language")
77
- language_options = sorted(val_data.lang.unique())
78
- default_language_index = (
79
- language_options.index("English") if "English" in language_options else 0
80
- )
81
  selected_language = st.selectbox(
82
  "Select language",
83
  options=language_options,
84
- index=default_language_index,
85
  label_visibility="collapsed",
86
  )
87
 
88
- st.subheader("Figure")
89
  selected_figure = st.radio(
90
  "Select figure type",
91
  options=["Boxplot", "Histogram", "Scatterplot"],
@@ -94,18 +78,13 @@ with st.sidebar:
94
  )
95
 
96
  st.header("Example Text")
97
- with st.spinner("Loading example text..."):
98
- reload_example_text_data(selected_language)
99
  st.table(st.session_state.examplesdf)
100
  st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
101
 
102
- tokenizer_to_num_tokens = defaultdict(list)
103
- for _, row in tqdm.tqdm(val_data.iterrows(), total=len(val_data)):
104
- text = row["text"]
105
- for tokenizer_name in selected_tokenizers:
106
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
107
- num_tokens = len(tokenizer(text)["input_ids"])
108
- tokenizer_to_num_tokens[tokenizer_name].append(num_tokens)
109
 
110
  if selected_figure == "Boxplot":
111
  fig = go.Figure()
@@ -113,11 +92,7 @@ if selected_figure == "Boxplot":
113
  fig.add_trace(
114
  go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
115
  )
116
- fig.update_layout(
117
- title=f"Distribution of Number of Tokens for Selected Tokenizers",
118
- xaxis_title="Tokenizer",
119
- yaxis_title="Number of Tokens",
120
- )
121
  st.plotly_chart(fig)
122
  elif selected_figure == "Histogram":
123
  fig = make_subplots(
@@ -138,14 +113,8 @@ elif selected_figure == "Histogram":
138
  st.plotly_chart(fig)
139
  elif selected_figure == "Scatterplot":
140
  df = pd.DataFrame(tokenizer_to_num_tokens)
141
- fig = px.scatter_matrix(
142
- df,
143
- dimensions=selected_tokenizers,
144
- color_discrete_sequence=px.colors.qualitative.Plotly,
145
- )
146
  fig.update_layout(
147
- title=f"Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
148
- width=800,
149
- height=800,
150
  )
151
  st.plotly_chart(fig)
 
1
  import streamlit as st
 
 
 
 
2
  import pandas as pd
 
 
 
 
3
  import plotly.express as px
4
  from plotly.subplots import make_subplots
5
  import plotly.graph_objects as go
6
+ import random
7
 
8
 
9
+ @st.cache
10
  def load_data():
11
  return pd.read_csv("dataset.csv")
12
 
 
14
  def reload_example_text_data(language):
15
  random_id = random.choice(val_data["id"])
16
  tempdf = val_data[val_data["id"] == random_id]
 
17
  tempdf = tempdf[tempdf["iso"] == language]
18
  tempdf.set_index("iso", inplace=True)
19
  tempdf.columns = ["Text"] + [f"Num Tokens ({t})" for t in selected_tokenizers]
 
41
 
42
  with st.sidebar:
43
  st.header("Comparing Tokenizers")
44
+ link = "This project compares the tokenization length for different tokenizers."
45
  st.markdown(link)
 
 
 
46
  selected_tokenizers = st.multiselect(
47
  "Select tokenizers",
48
  options=tokenizer_names_to_test,
 
51
  label_visibility="collapsed",
52
  )
53
 
54
+ for tokenizer_name in selected_tokenizers:
55
+ if tokenizer_name == "openai/gpt4":
56
+ link = "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
57
+ else:
58
+ url = f"https://huggingface.co/{tokenizer_name}"
59
+ link = f"Tokenizer is available [on the HuggingFace hub]({url})"
60
+ st.markdown(link, unsafe_allow_html=True)
61
 
62
+ val_data = load_data()
63
+ st.success(f"Data loaded: {len(val_data)}")
 
 
64
 
65
+ language_options = sorted(val_data["lang"].unique())
 
 
 
 
66
  selected_language = st.selectbox(
67
  "Select language",
68
  options=language_options,
69
+ index=language_options.index("English") if "English" in language_options else 0,
70
  label_visibility="collapsed",
71
  )
72
 
 
73
  selected_figure = st.radio(
74
  "Select figure type",
75
  options=["Boxplot", "Histogram", "Scatterplot"],
 
78
  )
79
 
80
  st.header("Example Text")
81
+ reload_example_text_data(selected_language)
 
82
  st.table(st.session_state.examplesdf)
83
  st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
84
 
85
+ tokenizer_to_num_tokens = {
86
+ name: val_data[name].tolist() for name in selected_tokenizers
87
+ }
 
 
 
 
88
 
89
  if selected_figure == "Boxplot":
90
  fig = go.Figure()
 
92
  fig.add_trace(
93
  go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
94
  )
95
+ fig.update_layout(title="Distribution of Number of Tokens for Selected Tokenizers")
 
 
 
 
96
  st.plotly_chart(fig)
97
  elif selected_figure == "Histogram":
98
  fig = make_subplots(
 
113
  st.plotly_chart(fig)
114
  elif selected_figure == "Scatterplot":
115
  df = pd.DataFrame(tokenizer_to_num_tokens)
116
+ fig = px.scatter_matrix(df, dimensions=selected_tokenizers)
 
 
 
 
117
  fig.update_layout(
118
+ title="Scatterplot Matrix of Number of Tokens for Selected Tokenizers"
 
 
119
  )
120
  st.plotly_chart(fig)