5w4n committed on
Commit
3e67b95
·
1 Parent(s): 34eb18c
Files changed (1) hide show
  1. app.py +42 -49
app.py CHANGED
@@ -1,67 +1,58 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import plotly.express as px
4
- from plotly.subplots import make_subplots
5
- import plotly.graph_objects as go
6
  import random
 
 
 
7
 
8
 
9
- @st.cache
10
  def load_data():
11
  return pd.read_csv("dataset.csv")
12
 
13
 
14
- def reload_example_text_data(language):
15
  random_id = random.choice(val_data["id"])
16
  tempdf = val_data[val_data["id"] == random_id]
17
- tempdf = tempdf[tempdf["iso"] == language]
18
- tempdf.set_index("iso", inplace=True)
19
- tempdf.columns = ["Text"] + [f"Num Tokens ({t})" for t in selected_tokenizers]
 
 
 
 
 
 
20
  st.session_state.examplesdf = tempdf
21
 
22
 
23
- tokenizer_names_to_test = [
24
- "openai/gpt4",
25
- "Xenova/gpt-4o",
26
- "Xenova/claude-tokenizer",
27
- "CohereForAI/aya-101",
28
- "meta-llama/Meta-Llama-3-70B",
29
- "mistralai/Mixtral-8x22B-v0.1",
30
- "google/gemma-7b",
31
- "facebook/nllb-200-distilled-600M",
32
- "xlm-roberta-base",
33
- "bert-base-uncased",
34
- "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
35
- "bigscience/bloom",
36
- "StabilityAI/stablelm-base-alpha-7b",
37
- "google/flan-t5-base",
38
- "facebook/mbart-large-50",
39
- "EleutherAI/gpt-neox-20b",
40
- ]
41
 
42
  with st.sidebar:
43
- st.header("Comparing Tokenizers")
44
- link = "This project compares the tokenization length for different tokenizers."
45
- st.markdown(link)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  selected_tokenizers = st.multiselect(
47
  "Select tokenizers",
48
  options=tokenizer_names_to_test,
49
- default=["openai/gpt4", "Xenova/gpt-4o", "Xenova/claude-tokenizer"],
50
- max_selections=6,
51
  label_visibility="collapsed",
52
  )
53
-
54
- for tokenizer_name in selected_tokenizers:
55
- if tokenizer_name == "openai/gpt4":
56
- link = "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
57
- else:
58
- url = f"https://huggingface.co/{tokenizer_name}"
59
- link = f"Tokenizer is available [on the HuggingFace hub]({url})"
60
- st.markdown(link, unsafe_allow_html=True)
61
-
62
- val_data = load_data()
63
- st.success(f"Data loaded: {len(val_data)}")
64
-
65
  language_options = sorted(val_data["lang"].unique())
66
  selected_language = st.selectbox(
67
  "Select language",
@@ -69,18 +60,20 @@ with st.sidebar:
69
  index=language_options.index("English") if "English" in language_options else 0,
70
  label_visibility="collapsed",
71
  )
72
-
73
- selected_figure = st.radio(
74
- "Select figure type",
75
  options=["Boxplot", "Histogram", "Scatterplot"],
76
- index=0,
77
  label_visibility="collapsed",
78
  )
79
 
80
  st.header("Example Text")
81
- reload_example_text_data(selected_language)
82
  st.table(st.session_state.examplesdf)
83
- st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
 
 
 
 
84
 
85
  tokenizer_to_num_tokens = {
86
  name: val_data[name].tolist() for name in selected_tokenizers
 
1
  import streamlit as st
2
  import pandas as pd
 
 
 
3
  import random
4
+ import plotly.graph_objects as go
5
+ from plotly.subplots import make_subplots
6
+ import plotly.express as px
7
 
8
 
9
@st.cache_data
def load_data():
    """Read the benchmark dataset from disk.

    Streamlit's ``cache_data`` wrapper memoizes the parsed frame, so
    script reruns triggered by widget interaction do not re-read the CSV.
    """
    frame = pd.read_csv("dataset.csv")
    return frame
12
 
13
 
14
def reload_example_text_data(selected_language, selected_tokenizers):
    """Sample one random example id and publish its rows for display.

    Picks a random ``id`` from the module-level ``val_data`` frame, keeps
    only the rows for ``selected_language``, renames/reorders the columns
    into display form ("ISO", "Text", one "Num Tokens (…)" column per
    selected tokenizer), and stores the result in
    ``st.session_state.examplesdf`` for the table widget to render.
    """
    chosen_id = random.choice(val_data["id"])
    example = val_data[val_data["id"] == chosen_id]
    example = example[example["lang"] == selected_language]

    # Present the language name as the row index.
    example.rename(columns={"lang": "Language"}, inplace=True)
    example.set_index("Language", inplace=True)

    # Keep only the display columns, in display order.
    keep = ["iso", "text"] + list(selected_tokenizers)
    example = example[keep]
    token_headers = [f"Num Tokens ({name})" for name in selected_tokenizers]
    example.columns = ["ISO", "Text"] + token_headers

    example.sort_values(by="ISO", inplace=True)
    st.session_state.examplesdf = example
27
 
28
 
29
# Module-level dataset: loaded once (cached via @st.cache_data on load_data)
# and read as a global by reload_example_text_data and the sidebar widgets.
val_data = load_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  with st.sidebar:
32
+ tokenizer_names_to_test = [
33
+ "openai/gpt4",
34
+ "Xenova/gpt-4o",
35
+ "Xenova/claude-tokenizer",
36
+ "CohereForAI/aya-101",
37
+ "meta-llama/Meta-Llama-3-70B",
38
+ "mistralai/Mixtral-8x22B-v0.1",
39
+ "google/gemma-7b",
40
+ "facebook/nllb-200-distilled-600M",
41
+ "xlm-roberta-base",
42
+ "bert-base-uncased",
43
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
44
+ "bigscience/bloom",
45
+ "StabilityAI/stablelm-base-alpha-7b",
46
+ "google/flan-t5-base",
47
+ "facebook/mbart-large-50",
48
+ "EleutherAI/gpt-neox-20b",
49
+ ]
50
  selected_tokenizers = st.multiselect(
51
  "Select tokenizers",
52
  options=tokenizer_names_to_test,
53
+ default=["openai/gpt4", "Xenova/gpt-4o"],
 
54
  label_visibility="collapsed",
55
  )
 
 
 
 
 
 
 
 
 
 
 
 
56
  language_options = sorted(val_data["lang"].unique())
57
  selected_language = st.selectbox(
58
  "Select language",
 
60
  index=language_options.index("English") if "English" in language_options else 0,
61
  label_visibility="collapsed",
62
  )
63
+ selected_figure = st.selectbox(
64
+ "Select Plot Type",
 
65
  options=["Boxplot", "Histogram", "Scatterplot"],
 
66
  label_visibility="collapsed",
67
  )
68
 
69
  st.header("Example Text")
70
+ reload_example_text_data(selected_language, selected_tokenizers)
71
  st.table(st.session_state.examplesdf)
72
+ st.button(
73
+ "Reload",
74
+ on_click=reload_example_text_data,
75
+ args=(selected_language, selected_tokenizers),
76
+ )
77
 
78
  tokenizer_to_num_tokens = {
79
  name: val_data[name].tolist() for name in selected_tokenizers