Yennie Jun committed on
Commit
96bd252
1 Parent(s): 21a3b4f

Updating the About the Project section

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -43,6 +43,8 @@ tokenizer_names_to_test = [
43
  ]
44
 
45
  with st.sidebar:
 
 
46
  st.subheader('Tokenizer')
47
  # TODO multi-select tokenizers
48
  tokenizer_name = st.sidebar.selectbox('Select tokenizer', options=tokenizer_names_to_test, label_visibility='collapsed')
@@ -80,6 +82,14 @@ with st.sidebar:
80
  show_hist = st.checkbox('Show histogram', value=False)
81
 
82
 
 
 
 
 
 
 
 
 
83
 
84
  # dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)
85
 
@@ -164,6 +174,4 @@ with st.container():
164
 
165
 
166
 
167
- with st.expander("About the project"):
168
- st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP.")
169
-
 
43
  ]
44
 
45
  with st.sidebar:
46
+
47
+
48
  st.subheader('Tokenizer')
49
  # TODO multi-select tokenizers
50
  tokenizer_name = st.sidebar.selectbox('Select tokenizer', options=tokenizer_names_to_test, label_visibility='collapsed')
 
82
  show_hist = st.checkbox('Show histogram', value=False)
83
 
84
 
85
+ st.subheader('About the project')
86
+ with st.expander("All languages are NOT created (tokenized) equal!"):
87
+
88
+ link="The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://blog.yenniejun.com/p/all-languages-are-not-created-tokenized)"
89
+ st.markdown(link)
90
+
91
+
92
+
93
 
94
  # dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)
95
 
 
174
 
175
 
176
 
177
+