taka-yamakoshi committed on
Commit a999c8e
1 Parent(s): 8a204f8

fix a bug & minor update

Files changed (1)
  1. app.py +13 -10
app.py CHANGED
@@ -36,8 +36,10 @@ def TokenizeText(sentence):
     #for word_col,word in zip(word_cols,decoded_sent):
     #  with word_col:
     #    st.write(word)
-    st.write(' '.join(encoded_sent))
-    st.write(' '.join(decoded_sent))
+    #st.write(' '.join(encoded_sent))
+    #st.write(' '.join(decoded_sent))
+    st.markdown(generate_markdown(' '.join(encoded_sent),size=16), unsafe_allow_html=True)
+    st.markdown(generate_markdown(' '.join(decoded_sent),size=16), unsafe_allow_html=True)
     st.markdown(generate_markdown(f'{num_tokens} tokens'), unsafe_allow_html=True)
 
     return num_tokens
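Note: this hunk swaps the plain `st.write` calls for `st.markdown(generate_markdown(...))`, so the encoded IDs and decoded tokens are rendered with the same styled sizing as the token count. `generate_markdown` and the tokenization itself are defined outside this hunk; a minimal sketch of the updated `TokenizeText`, assuming the helper wraps its argument in an inline-styled `<span>` and that `tokenizer` is the module-level tokenizer loaded in `__main__`:

```python
import streamlit as st

def generate_markdown(text, size=20):
    # Hypothetical stand-in: the real helper is defined elsewhere in app.py.
    return f"<span style='font-size:{size}px'>{text}</span>"

def TokenizeText(sentence):
    # Assumed encode/decode logic: map the text to IDs, then decode each ID
    # back to its surface token so the two rendered lines align one-to-one.
    encoded_sent = [str(token_id) for token_id in tokenizer(sentence)['input_ids']]
    decoded_sent = [tokenizer.decode(int(token_id)) for token_id in encoded_sent]
    num_tokens = len(decoded_sent)
    # The commit's change: styled markdown instead of bare st.write.
    st.markdown(generate_markdown(' '.join(encoded_sent), size=16), unsafe_allow_html=True)
    st.markdown(generate_markdown(' '.join(decoded_sent), size=16), unsafe_allow_html=True)
    st.markdown(generate_markdown(f'{num_tokens} tokens'), unsafe_allow_html=True)
    return num_tokens
```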
@@ -73,17 +75,18 @@ if __name__=='__main__':
     st.markdown(hide_table_row_index, unsafe_allow_html=True)
 
     # Title
-    st.markdown(generate_markdown('Tokenizer Demo',size=32), unsafe_allow_html=True)
+    st.markdown(generate_markdown('Tokenizer Demo:',size=32), unsafe_allow_html=True)
+    st.markdown(generate_markdown('Quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
 
     # Select and load the tokenizer
-    tokenizer_name = st.selectbox('Choose the tokenizer from below',
-                                  ('bert-base-uncased','bert-large-cased',
-                                   'gpt2','gpt2-large',
-                                   'roberta-base','roberta-large',
-                                   'albert-base-v2','albert-xxlarge-v2'),index=7)
+    tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
+                                          ('bert-base-uncased','bert-large-cased',
+                                           'gpt2','gpt2-large',
+                                           'roberta-base','roberta-large',
+                                           'albert-base-v2','albert-xxlarge-v2'),index=7)
     tokenizer = load_model(tokenizer_name)
 
-    comparison_mode = st.checkbox('Compare two texts')
+    comparison_mode = st.sidebar.checkbox('Compare two texts')
     if comparison_mode:
         sent_cols = st.columns(2)
         num_tokens = {}
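Note: this hunk moves both controls into the sidebar; `st.sidebar.selectbox` and `st.sidebar.checkbox` accept the same arguments as their main-page counterparts, so only the call site changes. `load_model` is also defined outside the diff; a plausible minimal version, assuming it wraps `AutoTokenizer.from_pretrained` behind Streamlit's cache so the tokenizer files are not re-fetched on every rerun:

```python
import streamlit as st
from transformers import AutoTokenizer

@st.cache(allow_output_mutation=True)
def load_model(tokenizer_name):
    # Assumed implementation: cache the tokenizer object across reruns.
    return AutoTokenizer.from_pretrained(tokenizer_name)

tokenizer = load_model('albert-xxlarge-v2')  # index=7 default in the selectbox
```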
@@ -92,7 +95,7 @@ if __name__=='__main__':
         with sent_col:
             sentence = st.text_input(f'Text {sent_id+1}')
             sents[f'sent_{sent_id+1}'] = sentence
-            num_tokens[f'{sent_id+1}'] = TokenizeText(sentence)
+            num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence)
 
     if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
         st.markdown(generate_markdown('Result: ',size=16), unsafe_allow_html=True)
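Note: this last hunk is the bug named in the commit message. `sents` is keyed `'sent_1'`/`'sent_2'` while `num_tokens` was keyed `'1'`/`'2'`, so any downstream lookup that mirrors the `sents` keys would raise a `KeyError`. A small illustration with made-up token counts (the downstream lookup itself is not shown in this diff):

```python
sents = {'sent_1': 'hello', 'sent_2': 'world'}

# Before the fix the counts were keyed '1'/'2' ...
num_tokens_before = {'1': 3, '2': 3}
assert 'sent_1' not in num_tokens_before  # indexing with sents' keys would fail

# ... after the fix both dicts share keys and can be joined per text:
num_tokens_after = {'sent_1': 3, 'sent_2': 3}
for key in sents:
    print(key, sents[key], num_tokens_after[key])
```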
 