taka-yamakoshi committed
Commit ed9112c · 1 Parent(s): 9240bf4
add instructions
app.py
CHANGED
@@ -98,15 +98,19 @@ if __name__=='__main__':
     st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
 
     # Select and load the tokenizer
-    tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
+    tokenizer_name = st.sidebar.selectbox('1. Choose the tokenizer from below',
                                           ('bert-base-uncased','bert-large-cased',
                                            'gpt2','gpt2-large',
                                            'roberta-base','roberta-large',
                                            'albert-base-v2','albert-xxlarge-v2'),index=7)
     tokenizer = load_model(tokenizer_name)
 
+    st.sidebar.write('2. Optional settings')
+    st.sidebar.write(f'"Compare two texts" compares # tokens for two pieces of text '\
+                     +f'and "de-tokenize" converts a list of tokenized indices back to strings.')
+    st.sidebar.write(f'For "de-tokenize", make sure to type in integers, separated by single spaces')
     comparison_mode = st.sidebar.checkbox('Compare two texts')
-    detokenize = st.sidebar.checkbox('de-tokenize
+    detokenize = st.sidebar.checkbox('de-tokenize')
     if comparison_mode:
         sent_cols = st.columns(2)
         num_tokens = {}
@@ -122,7 +126,7 @@ if __name__=='__main__':
             sents[f'sent_{sent_id+1}'] = sentence
 
         if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
-            st.markdown(generate_markdown('
+            st.markdown(generate_markdown('# Tokens: ',size=16), unsafe_allow_html=True)
             if num_tokens[f'sent_1']==num_tokens[f'sent_2']:
                 st.markdown(generate_markdown('Matched! ',color='MediumAquamarine'), unsafe_allow_html=True)
             else: