taka-yamakoshi committed
Commit ed9112c · 1 Parent(s): 9240bf4
add instructions
app.py
CHANGED
@@ -98,15 +98,19 @@ if __name__=='__main__':
     st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
 
     # Select and load the tokenizer
-    tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
+    tokenizer_name = st.sidebar.selectbox('1. Choose the tokenizer from below',
                                           ('bert-base-uncased','bert-large-cased',
                                            'gpt2','gpt2-large',
                                            'roberta-base','roberta-large',
                                            'albert-base-v2','albert-xxlarge-v2'),index=7)
     tokenizer = load_model(tokenizer_name)
 
+    st.sidebar.write('2. Optional settings')
+    st.sidebar.write(f'"Compare two texts" compares # tokens for two pieces of text '\
+                     +f'and "de-tokenize" converts a list of tokenized indices back to strings.')
+    st.sidebar.write(f'For "de-tokenize", make sure to type in integers, separated by single spaces')
     comparison_mode = st.sidebar.checkbox('Compare two texts')
-    detokenize = st.sidebar.checkbox('de-tokenize
+    detokenize = st.sidebar.checkbox('de-tokenize')
     if comparison_mode:
         sent_cols = st.columns(2)
         num_tokens = {}
@@ -122,7 +126,7 @@ if __name__=='__main__':
             sents[f'sent_{sent_id+1}'] = sentence
 
         if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
-            st.markdown(generate_markdown('
+            st.markdown(generate_markdown('# Tokens: ',size=16), unsafe_allow_html=True)
             if num_tokens[f'sent_1']==num_tokens[f'sent_2']:
                 st.markdown(generate_markdown('Matched! ',color='MediumAquamarine'), unsafe_allow_html=True)
             else: