Spaces:
Runtime error
Runtime error
Yennie Jun
committed on
Commit
·
96bd252
1
Parent(s):
21a3b4f
Updating the About the Project section
Browse files
app.py
CHANGED
@@ -43,6 +43,8 @@ tokenizer_names_to_test = [
|
|
43 |
]
|
44 |
|
45 |
with st.sidebar:
|
|
|
|
|
46 |
st.subheader('Tokenizer')
|
47 |
# TODO multi-select tokenizers
|
48 |
tokenizer_name = st.sidebar.selectbox('Select tokenizer', options=tokenizer_names_to_test, label_visibility='collapsed')
|
@@ -80,6 +82,14 @@ with st.sidebar:
|
|
80 |
show_hist = st.checkbox('Show histogram', value=False)
|
81 |
|
82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
# dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)
|
85 |
|
@@ -164,6 +174,4 @@ with st.container():
|
|
164 |
|
165 |
|
166 |
|
167 |
-
|
168 |
-
st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP.")
|
169 |
-
|
|
|
43 |
]
|
44 |
|
45 |
with st.sidebar:
|
46 |
+
|
47 |
+
|
48 |
st.subheader('Tokenizer')
|
49 |
# TODO multi-select tokenizers
|
50 |
tokenizer_name = st.sidebar.selectbox('Select tokenizer', options=tokenizer_names_to_test, label_visibility='collapsed')
|
|
|
82 |
show_hist = st.checkbox('Show histogram', value=False)
|
83 |
|
84 |
|
85 |
+
st.subheader('About the project')
|
86 |
+
with st.expander("All languages are NOT created (tokenized) equal!"):
|
87 |
+
|
88 |
+
link="The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://blog.yenniejun.com/p/all-languages-are-not-created-tokenized)"
|
89 |
+
st.markdown(link)
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
|
94 |
# dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)
|
95 |
|
|
|
174 |
|
175 |
|
176 |
|
177 |
+
|
|
|
|