5w4n committed on
Commit
0158b9f
·
1 Parent(s): 3e67b95

Format the output

Browse files
Files changed (1) hide show
  1. app.py +55 -62
app.py CHANGED
@@ -1,9 +1,7 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import random
4
  import plotly.graph_objects as go
5
- from plotly.subplots import make_subplots
6
- import plotly.express as px
7
 
8
 
9
  @st.cache_data
@@ -12,18 +10,13 @@ def load_data():
12
 
13
 
14
  def reload_example_text_data(selected_language, selected_tokenizers):
15
- random_id = random.choice(val_data["id"])
16
- tempdf = val_data[val_data["id"] == random_id]
17
- tempdf = tempdf[tempdf["lang"] == selected_language]
18
- tempdf.rename(columns={"lang": "Language"}, inplace=True)
19
- tempdf.set_index("Language", inplace=True)
20
- columns = ["iso", "text"] + selected_tokenizers
21
- tempdf = tempdf[columns]
22
- tempdf.columns = ["ISO", "Text"] + [
23
- f"Num Tokens ({tokenizer})" for tokenizer in selected_tokenizers
24
- ]
25
- tempdf.sort_values(by="ISO", inplace=True)
26
- st.session_state.examplesdf = tempdf
27
 
28
 
29
  val_data = load_data()
@@ -53,6 +46,17 @@ with st.sidebar:
53
  default=["openai/gpt4", "Xenova/gpt-4o"],
54
  label_visibility="collapsed",
55
  )
 
 
 
 
 
 
 
 
 
 
 
56
  language_options = sorted(val_data["lang"].unique())
57
  selected_language = st.selectbox(
58
  "Select language",
@@ -60,54 +64,43 @@ with st.sidebar:
60
  index=language_options.index("English") if "English" in language_options else 0,
61
  label_visibility="collapsed",
62
  )
63
- selected_figure = st.selectbox(
64
- "Select Plot Type",
65
- options=["Boxplot", "Histogram", "Scatterplot"],
66
- label_visibility="collapsed",
67
- )
68
 
69
- st.header("Example Text")
70
- reload_example_text_data(selected_language, selected_tokenizers)
 
71
  st.table(st.session_state.examplesdf)
72
- st.button(
73
- "Reload",
74
- on_click=reload_example_text_data,
75
- args=(selected_language, selected_tokenizers),
76
- )
77
 
78
- tokenizer_to_num_tokens = {
79
- name: val_data[name].tolist() for name in selected_tokenizers
80
- }
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- if selected_figure == "Boxplot":
83
- fig = go.Figure()
84
- for tokenizer_name in selected_tokenizers:
85
- fig.add_trace(
86
- go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
87
- )
88
- fig.update_layout(title="Distribution of Number of Tokens for Selected Tokenizers")
89
- st.plotly_chart(fig)
90
- elif selected_figure == "Histogram":
91
- fig = make_subplots(
92
- rows=len(selected_tokenizers), cols=1, subplot_titles=selected_tokenizers
93
- )
94
- for i, tokenizer_name in enumerate(selected_tokenizers):
95
- fig.add_trace(
96
- go.Histogram(
97
- x=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name
98
- ),
99
- row=i + 1,
100
- col=1,
101
- )
102
- fig.update_layout(
103
- height=200 * len(selected_tokenizers),
104
- title_text="Histogram of Number of Tokens",
105
- )
106
- st.plotly_chart(fig)
107
- elif selected_figure == "Scatterplot":
108
- df = pd.DataFrame(tokenizer_to_num_tokens)
109
- fig = px.scatter_matrix(df, dimensions=selected_tokenizers)
110
- fig.update_layout(
111
- title="Scatterplot Matrix of Number of Tokens for Selected Tokenizers"
112
- )
113
- st.plotly_chart(fig)
 
1
  import streamlit as st
2
  import pandas as pd
 
3
  import plotly.graph_objects as go
4
+ import numpy as np
 
5
 
6
 
7
  @st.cache_data
 
10
 
11
 
12
def reload_example_text_data(selected_language, selected_tokenizers):
    """Sample one row for the chosen language and publish its token counts.

    Filters the module-level ``val_data`` frame to the rows whose ``lang``
    column equals ``selected_language``, draws a single random row from that
    subset, and stores the row's ``selected_tokenizers`` columns in
    ``st.session_state.examplesdf``.

    Returns the value of the sampled row's ``text`` column.
    """
    language_rows = val_data[val_data["lang"] == selected_language]
    picked = language_rows.sample(n=1)

    # Keep only the per-tokenizer columns for the table shown to the user.
    counts = picked[selected_tokenizers]
    counts.columns = [f"{name}" for name in selected_tokenizers]
    st.session_state.examplesdf = counts

    return picked["text"].iloc[0]
 
 
 
 
 
20
 
21
 
22
  val_data = load_data()
 
46
  default=["openai/gpt4", "Xenova/gpt-4o"],
47
  label_visibility="collapsed",
48
  )
49
+ links = [
50
+ (
51
+ f"[{tokenizer_name}](https://huggingface.co/{tokenizer_name})"
52
+ if tokenizer_name != "openai/gpt4"
53
+ else f"[{tokenizer_name}](https://github.com/openai/tiktoken)"
54
+ )
55
+ for tokenizer_name in selected_tokenizers
56
+ ]
57
+ link = "Tokenized using " + ", ".join(links)
58
+ st.markdown(link, unsafe_allow_html=True)
59
+
60
  language_options = sorted(val_data["lang"].unique())
61
  selected_language = st.selectbox(
62
  "Select language",
 
64
  index=language_options.index("English") if "English" in language_options else 0,
65
  label_visibility="collapsed",
66
  )
 
 
 
 
 
67
 
68
+ selected_text = reload_example_text_data(selected_language, selected_tokenizers)
69
+ st.subheader(f"**Sampled Text:** `{selected_text}`")
70
+ st.subheader("Number of Tokens")
71
  st.table(st.session_state.examplesdf)
 
 
 
 
 
72
 
73
# Calculate summary statistics of the token counts for each selected tokenizer.
# Results are converted to built-in Python numbers because NumPy integer
# scalars (e.g. the np.int64 returned by np.min/np.max) are not serialisable
# as numbers by the standard json encoder.
tokenizer_metrics = {}
for tokenizer in selected_tokenizers:
    tokens = val_data[tokenizer].dropna()
    if tokens.empty:
        # No values left for this tokenizer; np.min/np.max would raise.
        continue
    median, min_tokens, max_tokens, std_dev = (
        value.item() if hasattr(value, "item") else value
        for value in (
            np.median(tokens),
            np.min(tokens),
            np.max(tokens),
            np.std(tokens),
        )
    )
    tokenizer_metrics[tokenizer] = {
        "Median": median,
        "Min": min_tokens,
        "Max": max_tokens,
        "Range": max_tokens - min_tokens,
        "Standard Deviation": std_dev,
    }

# Display metrics
st.subheader("Tokenizer Metrics")
st.json(tokenizer_metrics)

# Plot the tokenizers with the shortest and longest median token counts.
top_n = 5
sorted_tokenizers = sorted(tokenizer_metrics.items(), key=lambda x: x[1]["Median"])
shortest_median = sorted_tokenizers[:top_n]
# Start the "longest" slice after the "shortest" one so that, with fewer than
# 2 * top_n tokenizers, no tokenizer is drawn twice.
longest_start = max(top_n, len(sorted_tokenizers) - top_n)
longest_median = sorted_tokenizers[longest_start:]

fig = go.Figure()
for name, metrics in shortest_median + longest_median:
    fig.add_trace(go.Bar(x=[name], y=[metrics["Median"]], name=name))
fig.update_layout(
    title="Top Tokenizers by Shortest and Longest Median Token Length",
    xaxis_title="Tokenizer",
    yaxis_title="Median Token Length",
)
st.plotly_chart(fig)