yenniejun committed
Commit 62551b2
1 Parent(s): 8641796

Adding example texts to show

Files changed (1)
  1. app.py +29 -3
app.py CHANGED
@@ -9,11 +9,21 @@ import seaborn as sns
 import numpy as np
 import plotly.figure_factory as ff
 import plotly.express as px
+import random
 
 @st.cache_data
 def load_data():
     return pd.read_csv('MassiveDatasetValidationData.csv')
 
+def reload_example_text_data():
+    random_id = random.choice(val_data['id'])
+    tempdf = subset_df[subset_df['id']==random_id]
+    tempdf.set_index('lang', inplace=True)
+    tempdf = tempdf[['iso', 'text', tokenizer_name]]
+    tempdf.columns=['ISO', 'Text', 'Num Tokens']
+    tempdf.sort_values(by='ISO', inplace=True)
+    st.session_state.examplesdf = tempdf
+
 # TODO allow new tokenizers from HF
 tokenizer_names_to_test = [
     "openai/gpt4",
@@ -55,7 +65,7 @@ with st.sidebar:
     languages = st.multiselect(
         'Select languages',
         options=sorted(val_data.lang.unique()),
-        default=['English', 'Spanish' ,'Chinese'],
+        default=['English', 'Spanish' ,'Chinese', 'Burmese'],
         max_selections=6,
         label_visibility='collapsed'
     )
@@ -82,7 +92,7 @@ with st.container():
     subset_df = val_data[val_data.lang.isin(languages)]
     subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
 
-    st.header('Tokenization in different languages')
+    st.header('Compare tokenization in different languages')
     fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=show_hist)
 
     fig.update_layout(
@@ -100,6 +110,22 @@ with st.container():
         metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))
 
 
+    st.subheader('Example Texts')
+
+    reload_example_text_data()
+    if st.button("🔄 Refresh"):
+        reload_example_text_data()
+
+    st.dataframe(st.session_state.examplesdf) # Same as st.write(df)
+
+
+
+
+
+
+
+
+
 with st.expander("About the project"):
-    st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 15-20x more tokens than a comparable message in another language.")
+    st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP.")
 
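The 10-20x figure in the updated project description can be spot-checked with any Hugging Face tokenizer. A rough sketch follows; the model name and the parallel sentences are arbitrary stand-ins, not taken from this app or its dataset:

from transformers import AutoTokenizer

# Arbitrary multilingual tokenizer for a spot check; not necessarily
# the tokenizer this app uses.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Hypothetical parallel sentences (placeholders, not dataset rows).
# Adding a translation in a less-represented script would widen the gap.
samples = {
    "English": "wake me up at nine am on friday",
    "Spanish": "despiértame a las nueve de la mañana el viernes",
}

for lang, text in samples.items():
    num_tokens = len(tokenizer(text)["input_ids"])
    print(f"{lang}: {num_tokens} tokens")

Disparities tend to be largest for languages underrepresented in a tokenizer's training corpus, hence the "English vs. Burmese" example in the description and the addition of Burmese to the default language selection.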