taka-yamakoshi committed on
Commit
ef0b5c6
1 Parent(s): 145f48c

add default inputs

Files changed (1)
app.py +7 -3
app.py CHANGED
@@ -94,7 +94,7 @@ if __name__=='__main__':
     st.markdown(hide_table_row_index, unsafe_allow_html=True)

     # Title
-    st.markdown(generate_markdown('Tokenizer Demo:',size=32), unsafe_allow_html=True)
+    st.markdown(generate_markdown('WordPiece Explorer',size=32), unsafe_allow_html=True)
     st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)

     # Select and load the tokenizer
@@ -135,8 +135,12 @@ if __name__=='__main__':

     else:
         if detokenize:
-            sentence = st.text_input(f'Tokenized IDs')
+            if tokenizer_name.startswith('gpt2'):
+                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
+            else:
+                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            sentence = st.text_input(f'Tokenized IDs',value=' '.join(default_tokens))
             num_tokens = DeTokenizeText(sentence)
         else:
-            sentence = st.text_input(f'Text')
+            sentence = st.text_input(f'Text',value='Tokenizers decompose bigger words into smaller tokens')
             num_tokens = TokenizeText(sentence,tokenizer_name)
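
The `[1:-1]` slice in the detokenize branch drops special tokens: WordPiece tokenizers such as bert-base-uncased wrap input in `[CLS]` ... `[SEP]`, while GPT-2's byte-level BPE tokenizer adds no special tokens by default, so its IDs are used as-is. A minimal sketch of the difference (assuming standard `transformers` checkpoints; not code from this commit):

```python
# Compare special-token handling across the two tokenizer families.
from transformers import AutoTokenizer

text = 'Tokenizers decompose bigger words into smaller tokens'
for name in ['bert-base-uncased', 'gpt2']:
    tok = AutoTokenizer.from_pretrained(name)
    ids = tok(text)['input_ids']
    print(name, ids)
# bert-base-uncased output starts with 101 ([CLS]) and ends with 102 ([SEP]),
# so ids[1:-1] keeps only content tokens; gpt2 output is already content-only.
```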
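
One caveat worth flagging: if `tokenizer` here is a standard `transformers` tokenizer, `['input_ids']` is a list of ints, so `' '.join(default_tokens)` as committed would raise a TypeError before reaching `st.text_input`. A defensive variant (a sketch of an assumed fix, not part of this commit) casts each ID to a string first:

```python
# Hypothetical fix: join IDs as strings before using them as a default value.
import streamlit as st
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
default_value = ' '.join(str(t) for t in default_tokens)  # space-separated ID string
sentence = st.text_input('Tokenized IDs', value=default_value)
```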