Takateru Yamakoshi committed on
Commit b5a5fbe
1 Parent(s): 8004d5f

add tokenizers

Files changed (1): app.py (+23, -25)
app.py CHANGED
@@ -6,19 +6,17 @@ import io
 import time
 
 @st.cache(show_spinner=True,allow_output_mutation=True)
-def load_model(model_name):
-    if model_name.startswith('bert'):
-        from transformers import BertTokenizer
-        tokenizer = BertTokenizer.from_pretrained(model_name)
-    elif model_name.startswith('gpt2'):
-        from transformers import GPT2Tokenizer
-        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-    elif model_name.startswith('roberta'):
-        from transformers import RobertaTokenizer
-        tokenizer = RobertaTokenizer.from_pretrained(model_name)
-    elif model_name.startswith('albert'):
-        from transformers import AlbertTokenizer
-        tokenizer = AlbertTokenizer.from_pretrained(model_name)
+def load_model(tokenizer_name):
+    from transformers import AutoTokenizer
+    model_name_dict = {
+        "BERT":"bert-base-uncased",
+        "RoBERTa":"roberta-base",
+        "ALBERT":"albert-base-v2",
+        "GPT2":"gpt2",
+        "Llama":"meta-llama/Llama-2-7b-chat-hf",
+        "Gemma":"google/gemma-7b",
+    }
+    tokenizer = AutoTokenizer.from_pretrained(model_name_dict[tokenizer_name])
     return tokenizer
 
 def generate_markdown(text,color='black',font='Arial',size=20):
@@ -26,10 +24,11 @@ def generate_markdown(text,color='black',font='Arial',size=20):
 
 def TokenizeText(sentence,tokenizer_name):
     if len(sentence)>0:
-        if tokenizer_name.startswith('gpt2'):
-            input_sent = tokenizer(sentence)['input_ids']
-        else:
-            input_sent = tokenizer(sentence)['input_ids'][1:-1]
+        #if tokenizer_name.startswith('gpt2'):
+        #    input_sent = tokenizer(sentence)['input_ids']
+        #else:
+        #    input_sent = tokenizer(sentence)['input_ids'][1:-1]
+        input_sent = tokenizer(sentence)['input_ids']
         encoded_sent = [str(token) for token in input_sent]
         decoded_sent = [tokenizer.decode([token]) for token in input_sent]
         num_tokens = len(decoded_sent)
@@ -100,10 +99,8 @@ if __name__=='__main__':
     # Select and load the tokenizer
     st.sidebar.write('1. Choose the tokenizer from below')
     tokenizer_name = st.sidebar.selectbox('',
-                                          ('bert-base-uncased','bert-large-cased',
-                                           'gpt2','gpt2-large',
-                                           'roberta-base','roberta-large',
-                                           'albert-base-v2','albert-xxlarge-v2'),index=7)
+                                          ("BERT","RoBERTa","ALBERT",
+                                           "GPT2","Llama","Gemma"))
     tokenizer = load_model(tokenizer_name)
 
     st.sidebar.write('2. Optional settings')
@@ -135,10 +132,11 @@ if __name__=='__main__':
 
     else:
         if detokenize:
-            if tokenizer_name.startswith('gpt2'):
-                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
-            else:
-                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            #if tokenizer_name.startswith('gpt2'):
+            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
+            #else:
+            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
             sentence = st.text_input(f'Tokenized IDs',value=' '.join([str(token) for token in default_tokens]))
             num_tokens = DeTokenizeText(sentence)
         else:
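
For reference, a minimal sketch (not part of the commit) of what the new AutoTokenizer-based loader does and why the [1:-1] trimming is commented out: AutoTokenizer resolves the display name to a Hugging Face hub ID, and the returned input_ids now keep the model's special tokens (e.g. BERT's [CLS]/[SEP]). The sketch assumes the transformers library is installed and uses only the ungated entries from the mapping, since the Llama and Gemma checkpoints require authentication.

# Illustrative sketch, not app code: same display-name -> hub-ID mapping as
# load_model() above, restricted to ungated checkpoints.
from transformers import AutoTokenizer

model_name_dict = {
    "BERT": "bert-base-uncased",
    "RoBERTa": "roberta-base",
    "ALBERT": "albert-base-v2",
    "GPT2": "gpt2",
}

tokenizer = AutoTokenizer.from_pretrained(model_name_dict["BERT"])

# Without the old [1:-1] slicing, special tokens remain in the IDs:
# for bert-base-uncased that means [CLS] (101) at the start and [SEP] (102) at the end.
input_ids = tokenizer("Tokenizers decompose bigger words into smaller tokens")["input_ids"]
print(input_ids)
print([tokenizer.decode([i]) for i in input_ids])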