Takateru Yamakoshi committed · Commit b5a5fbe · Parent(s): 8004d5f
add tokenizers
app.py
CHANGED
@@ -6,19 +6,17 @@ import io
 import time
 
 @st.cache(show_spinner=True,allow_output_mutation=True)
-def load_model(model_name):
-
-
-
-
-
-
-
-
-
-    from transformers import AlbertTokenizer
-    tokenizer = AlbertTokenizer.from_pretrained(model_name)
+def load_model(tokenizer_name):
+    from transformers import AutoTokenizer
+    model_name_dict = {
+        "BERT":"bert-base-uncased",
+        "RoBERTa":"roberta-base",
+        "ALBERT":"albert-base-v2",
+        "GPT2":"gpt2",
+        "Llama":"meta-llama/Llama-2-7b-chat-hf",
+        "Gemma":"google/gemma-7b",
+        }
+    tokenizer = AutoTokenizer.from_pretrained(model_name_dict[tokenizer_name])
     return tokenizer
 
 def generate_markdown(text,color='black',font='Arial',size=20):
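For readers reproducing this outside Streamlit, here is a minimal sketch of the pattern the new load_model uses: one display-name-to-model-id dictionary plus AutoTokenizer, which selects the right tokenizer class from each checkpoint's config, so the per-model import branches of the old code are no longer needed. It assumes the transformers package is installed; the Llama 2 and Gemma checkpoints are gated on the Hugging Face Hub, so loading them also requires approved access and an auth token.

from transformers import AutoTokenizer

# Display name -> Hugging Face model id, as in the commit above.
MODEL_NAME_DICT = {
    "BERT": "bert-base-uncased",
    "RoBERTa": "roberta-base",
    "ALBERT": "albert-base-v2",
    "GPT2": "gpt2",
    "Llama": "meta-llama/Llama-2-7b-chat-hf",  # gated repo
    "Gemma": "google/gemma-7b",                # gated repo
}

def load_tokenizer(name: str):
    # AutoTokenizer reads the checkpoint config and returns the
    # matching tokenizer class.
    return AutoTokenizer.from_pretrained(MODEL_NAME_DICT[name])

tok = load_tokenizer("BERT")
print(tok("hello world")["input_ids"])  # e.g. [101, 7592, 2088, 102] for BERT

This also explains the matching selectbox change further down: the sidebar now passes display names ("BERT", "RoBERTa", ...) rather than raw model ids into load_model.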
@@ -26,10 +24,11 @@ def generate_markdown(text,color='black',font='Arial',size=20):
 
 def TokenizeText(sentence,tokenizer_name):
     if len(sentence)>0:
-        if tokenizer_name.startswith('gpt2'):
-            input_sent = tokenizer(sentence)['input_ids']
-        else:
-            input_sent = tokenizer(sentence)['input_ids'][1:-1]
+        #if tokenizer_name.startswith('gpt2'):
+        #    input_sent = tokenizer(sentence)['input_ids']
+        #else:
+        #    input_sent = tokenizer(sentence)['input_ids'][1:-1]
+        input_sent = tokenizer(sentence)['input_ids']
         encoded_sent = [str(token) for token in input_sent]
         decoded_sent = [tokenizer.decode([token]) for token in input_sent]
         num_tokens = len(decoded_sent)
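One behavioral consequence of this hunk: the old code stripped special tokens for non-GPT-2 models via the [1:-1] slice, while the new single code path keeps them, so markers such as [CLS]/[SEP] (BERT) or <s>/</s> (RoBERTa) now appear among the displayed tokens. A small sketch of the tokenize-then-decode round trip that TokenizeText performs, under the same transformers assumption as above:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sentence = "Tokenizers decompose bigger words into smaller tokens"
input_sent = tokenizer(sentence)["input_ids"]               # ids, incl. special tokens
decoded_sent = [tokenizer.decode([t]) for t in input_sent]  # one string per id

print(len(decoded_sent))  # the token count the app displays
print(decoded_sent)       # first/last entries are now '[CLS]' and '[SEP]'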
@@ -100,10 +99,8 @@ if __name__=='__main__':
     # Select and load the tokenizer
     st.sidebar.write('1. Choose the tokenizer from below')
     tokenizer_name = st.sidebar.selectbox('',
-                                          (
-
-                                          'roberta-base','roberta-large',
-                                          'albert-base-v2','albert-xxlarge-v2'),index=7)
+                                          ("BERT","RoBERTa","ALBERT",
+                                           "GPT2","Llama","Gemma"))
     tokenizer = load_model(tokenizer_name)
 
     st.sidebar.write('2. Optional settings')
@@ -135,10 +132,11 @@ if __name__=='__main__':
 
     else:
         if detokenize:
-            if tokenizer_name.startswith('gpt2'):
-                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
-            else:
-                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            #if tokenizer_name.startswith('gpt2'):
+            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
+            #else:
+            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
             sentence = st.text_input(f'Tokenized IDs',value=' '.join([str(token) for token in default_tokens]))
             num_tokens = DeTokenizeText(sentence)
         else:
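The detokenize branch is simplified the same way, so the default id string pre-filled in the text box now includes the special-token ids as well. The app's DeTokenizeText is outside this diff; the sketch below reconstructs the reverse step with the standard tokenizer.decode call, which is an assumption about its implementation:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# What the text box is pre-filled with: space-separated ids.
default_tokens = tokenizer("Tokenizers decompose bigger words into smaller tokens")["input_ids"]
id_string = " ".join(str(t) for t in default_tokens)

# Reverse direction: parse the ids back and decode them into text.
ids = [int(t) for t in id_string.split()]
print(tokenizer.decode(ids))
# -> roughly "[CLS] tokenizers decompose bigger words into smaller tokens [SEP]"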