Spaces:
Runtime error
Runtime error
anonymousauthors
committed on
Commit
•
30c7c90
1
Parent(s):
38b6dfd
updates
Browse files- pages/0_📙_Dictionary_(Search).py +21 -24
- pages/2_😈_Blackbox_Attack.py +134 -63
- requirements.txt +2 -1
pages/0_📙_Dictionary_(Search).py
CHANGED
@@ -62,7 +62,7 @@ if title in datas:
|
|
62 |
# st.markdown(f"## {title}'s meaning in English[¹](#jump)")
|
63 |
colored_header(
|
64 |
label=f"{title}'s meaning in English[¹](#jump)",
|
65 |
-
description="",
|
66 |
color_name="violet-70",
|
67 |
)
|
68 |
# write the meaning of input word
|
@@ -114,36 +114,33 @@ if title in datas:
|
|
114 |
if all_sl[i] != '':
|
115 |
new_all_sl.append(all_sl[i].replace("\n", "/n").strip())
|
116 |
all_sl = sorted(new_all_sl)
|
117 |
-
st.
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
|
|
|
|
128 |
|
129 |
|
130 |
present_sl_task(secret_language_by_task, 'NLI')
|
131 |
present_sl_task(secret_language_by_task, 'QA')
|
132 |
present_sl_task(secret_language_by_task, 'Paraphrase')
|
133 |
|
134 |
-
st.
|
135 |
f"For example, the hyperlinks in the paraphrase section will only work when the paraphrase tab is open. However, due to the container property of Hugging Face Space, the hyperlinks might be not able to function.*")
|
136 |
-
st.
|
137 |
-
# st.markdown(f"---")
|
138 |
-
# st.markdown(f"## Examples of replaced sentences")
|
139 |
-
# st.markdown(f"The number following the tasks represents the number of examples found for a particular task, which may be different from the number of secret languages.")
|
140 |
colored_header(
|
141 |
label=f"Examples of replaced sentences",
|
142 |
-
description=f'The number following the tasks represents the number of examples found for a particular task, which may be different from the number of secret languages
|
143 |
color_name="orange-70",
|
144 |
)
|
145 |
-
# st.text(','.join(title_secret_languages).replace('<s>', '\<s\>'))
|
146 |
-
# st.dataframe(dataframe)
|
147 |
_num = Counter(data_title['tasks'])
|
148 |
tab1, tab2, tab3 = st.tabs([f'NLI ({_num["NLI"]})', f'QA ({_num["QA"]})', f'Paraphrase ({_num["Paraphrase"]})'])
|
149 |
|
@@ -242,7 +239,7 @@ if title in datas:
|
|
242 |
_string += 'question**: :'
|
243 |
elif task == 'Paraphrase':
|
244 |
_string += 'sentence 1**: :'
|
245 |
-
_string += f'red[{_all[_sl]["Replaced hypothesis"][j]}]'.replace('/', '\\').replace(
|
246 |
if task == 'NLI':
|
247 |
_string += '<br> **Premise**: :'
|
248 |
elif task == 'QA':
|
@@ -273,8 +270,8 @@ if title in datas:
|
|
273 |
with tab3:
|
274 |
# present(dataframe, 'Paraphrase', title)
|
275 |
present_dict(data_title, 'Paraphrase')
|
276 |
-
st.markdown(
|
277 |
-
|
278 |
-
|
279 |
else:
|
280 |
st.error(f'{title} is not in the dictionary of Secret Language.', icon="⚠️")
|
|
|
62 |
# st.markdown(f"## {title}'s meaning in English[¹](#jump)")
|
63 |
colored_header(
|
64 |
label=f"{title}'s meaning in English[¹](#jump)",
|
65 |
+
description="**Enlish meaning is supported by [PyDictionary](https://pypi.org/project/PyDictionary/)*",
|
66 |
color_name="violet-70",
|
67 |
)
|
68 |
# write the meaning of input word
|
|
|
114 |
if all_sl[i] != '':
|
115 |
new_all_sl.append(all_sl[i].replace("\n", "/n").strip())
|
116 |
all_sl = sorted(new_all_sl)
|
117 |
+
with st.expander(f'***{len(all_sl)}*** secret languages of ***{title}*** on {task.replace("paraphrase", "Paraphrase")}'):
|
118 |
+
# st.markdown(
|
119 |
+
# f':red[{len(all_sl)}] secret languages of :blue[{title}] on {task.replace("paraphrase", "Paraphrase")}',
|
120 |
+
# unsafe_allow_html=True)
|
121 |
+
special = '"'
|
122 |
+
_title_secret_languages = [
|
123 |
+
# f'[{i}](#{i.strip().replace("(", ",,").replace(")", "..").replace("[", ",,,").replace("]", "...").replace(special, "././")}_{task})'
|
124 |
+
f'<a href="#{i.strip().replace("(", ",,").replace(")", "..").replace("[", ",,,").replace("]", "...").replace(special, "././")}_{task}">{i}</a>'
|
125 |
+
for i in all_sl]
|
126 |
+
st.markdown(
|
127 |
+
# '>' + ', '.join(_title_secret_languages).replace('<s>', '\<s\>').replace('$', '\$').replace('~', '\~'),
|
128 |
+
', '.join(_title_secret_languages).replace('<s>', '\<s\>').replace('$', '\$').replace('~', '\~'),
|
129 |
+
unsafe_allow_html=True)
|
130 |
|
131 |
|
132 |
present_sl_task(secret_language_by_task, 'NLI')
|
133 |
present_sl_task(secret_language_by_task, 'QA')
|
134 |
present_sl_task(secret_language_by_task, 'Paraphrase')
|
135 |
|
136 |
+
st.caption(f"\**Hyperlinks only function when the corresponding tab is open. "
|
137 |
f"For example, the hyperlinks in the paraphrase section will only work when the paraphrase tab is open. However, due to the container property of Hugging Face Space, the hyperlinks might be not able to function.*")
|
138 |
+
st.caption('\**Due to the grammatical properties of HTML, the layout of this page may vary.*')
|
|
|
|
|
|
|
139 |
colored_header(
|
140 |
label=f"Examples of replaced sentences",
|
141 |
+
description=f'**The number following the tasks represents the number of examples found for a particular task, which may be different from the number of secret languages.*',
|
142 |
color_name="orange-70",
|
143 |
)
|
|
|
|
|
144 |
_num = Counter(data_title['tasks'])
|
145 |
tab1, tab2, tab3 = st.tabs([f'NLI ({_num["NLI"]})', f'QA ({_num["QA"]})', f'Paraphrase ({_num["Paraphrase"]})'])
|
146 |
|
|
|
239 |
_string += 'question**: :'
|
240 |
elif task == 'Paraphrase':
|
241 |
_string += 'sentence 1**: :'
|
242 |
+
_string += f'red[{_all[_sl]["Replaced hypothesis"][j]}]'.replace('/', '\\').replace(_sl, f"<i><b>{___sl}</b></i>").replace(":", "[colon]")
|
243 |
if task == 'NLI':
|
244 |
_string += '<br> **Premise**: :'
|
245 |
elif task == 'QA':
|
|
|
270 |
with tab3:
|
271 |
# present(dataframe, 'Paraphrase', title)
|
272 |
present_dict(data_title, 'Paraphrase')
|
273 |
+
# st.markdown(
|
274 |
+
# f'<span id="jump">¹</span>*Enlish meaning is supported by [PyDictionary](https://pypi.org/project/PyDictionary/).*',
|
275 |
+
# unsafe_allow_html=True)
|
276 |
else:
|
277 |
st.error(f'{title} is not in the dictionary of Secret Language.', icon="⚠️")
|
pages/2_😈_Blackbox_Attack.py
CHANGED
@@ -1,41 +1,60 @@
|
|
1 |
import streamlit as st
|
2 |
-
import os
|
3 |
from streamlit_extras.stateful_button import button
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
model = GPT2Model.from_pretrained('gpt2')
|
8 |
import pickle
|
9 |
-
all_keys = pickle.load(open('keys.pkl', 'rb'))
|
10 |
-
all_keys = [i.strip() for i in all_keys]
|
11 |
import torch
|
12 |
from copy import deepcopy
|
13 |
from time import time
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
|
|
|
|
17 |
|
|
|
|
|
|
|
18 |
st.sidebar.markdown('#### Require ')
|
19 |
st.sidebar.markdown('`Input text`: a sentence or paragraph.')
|
20 |
st.sidebar.markdown('`Number of replacements`: the number of secret language samples.')
|
21 |
st.sidebar.markdown('`Steps for searching Secret Langauge`: the steps in the SecretFinding process.')
|
22 |
-
|
23 |
st.sidebar.markdown('#### Two methods')
|
24 |
-
st.sidebar.markdown('1.
|
25 |
st.sidebar.markdown('2. Use the secret language we found on ALBERT, DistillBERT, and Roberta: this method replaces words directly with the secret language dictionary derived from ALBERT, DistillBERT, and Roberta.')
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
31 |
|
32 |
_input = tokenizer([text] * restarts, return_tensors="pt")
|
33 |
for k in _input.keys():
|
34 |
_input[k] = _input[k].to(device)
|
35 |
|
36 |
-
ori_output = model(**_input)
|
|
|
|
|
|
|
|
|
37 |
|
38 |
-
ori_embedding =
|
39 |
ori_embedding.requires_grad = False
|
40 |
ori_word_one_hot = torch.nn.functional.one_hot(_input['input_ids'].detach(), num_classes=subword_num).to(device)
|
41 |
|
@@ -52,16 +71,20 @@ def run(model, _bar_text=None, bar=None, text='Which name is also used to descri
|
|
52 |
for i in range(len(noise_mask)):
|
53 |
_tmp_perturbed_input = ori_word_one_hot[:, noise_mask[i]] + noise[:, i]
|
54 |
_tmp_perturbed_input /= _tmp_perturbed_input.sum(-1, keepdim=True)
|
55 |
-
perturbed_embedding[:, noise_mask[i]] = torch.matmul(_tmp_perturbed_input,
|
56 |
|
57 |
_input_['inputs_embeds'] = perturbed_embedding
|
58 |
-
outputs_perturbed = model(**_input_)
|
|
|
|
|
|
|
|
|
59 |
|
60 |
loss = loss_funt(ori_output, outputs_perturbed)
|
61 |
loss.backward()
|
62 |
noise.data = (noise.data - lr * noise.grad.detach())
|
63 |
noise.grad.zero_()
|
64 |
-
_bar_text.text(f'{(time() - start_time) * (step - _i - 1) / (_i + 1):.2f} seconds left')
|
65 |
# validate
|
66 |
with torch.no_grad():
|
67 |
perturbed_inputs = deepcopy(_input)
|
@@ -73,42 +96,15 @@ def run(model, _bar_text=None, bar=None, text='Which name is also used to descri
|
|
73 |
perturbed_questions = []
|
74 |
for i in range(restarts):
|
75 |
perturbed_questions.append(tokenizer.decode(perturbed_inputs["input_ids"][i]).split("</s></s>")[0])
|
76 |
-
|
77 |
-
for
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
81 |
return perturbed_questions
|
82 |
|
83 |
-
|
84 |
-
from transformers import GPT2Tokenizer, GPT2Model
|
85 |
-
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
86 |
-
model = GPT2Model.from_pretrained('gpt2')
|
87 |
-
# encoded_input = tokenizer(text, return_tensors='pt')
|
88 |
-
# output = model(**encoded_input)
|
89 |
-
|
90 |
-
option = st.selectbox(
|
91 |
-
'Which method you would like to use?',
|
92 |
-
('GPT-2 (Searching secret languages based on GPT-2)', 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.')
|
93 |
-
)
|
94 |
-
|
95 |
-
def clf_keys():
|
96 |
-
for key in st.session_state.keys():
|
97 |
-
if key in ['tokenizer', 'start']:
|
98 |
-
st.session_state[key] = False
|
99 |
-
elif 'tokenizer_' in key:
|
100 |
-
del st.session_state[key]
|
101 |
-
|
102 |
-
|
103 |
-
title = st.text_area('Input text.', 'Which name is also used to describe the Amazon rainforest in English?', on_change=clf_keys)
|
104 |
-
|
105 |
-
if option == 'GPT-2 (Searching secret languages based on GPT-2)':
|
106 |
-
_cols = st.columns(2)
|
107 |
-
restarts = _cols[0].number_input('Number of replacements.', value=10, min_value=1, step=1, format='%d')
|
108 |
-
step = _cols[1].number_input('Steps for searching Secret Langauge', value=100, min_value=1, step=1, format='%d')
|
109 |
-
else:
|
110 |
-
restarts = st.number_input('Number of replacements.', value=10, min_value=1, step=1, format='%d')
|
111 |
-
|
112 |
def get_secret_language(title):
|
113 |
if ord(title[0]) in list(range(48, 57)):
|
114 |
file_name = 'num_dict.pkl'
|
@@ -129,7 +125,57 @@ def get_secret_language(title):
|
|
129 |
break
|
130 |
return _sls_id
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
if button('Tokenize', key='tokenizer'):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
for key in st.session_state.keys():
|
134 |
if key not in ['tokenizer', 'start'] and 'tokenizer_' not in key:
|
135 |
del st.session_state[key]
|
@@ -164,11 +210,31 @@ if button('Tokenize', key='tokenizer'):
|
|
164 |
if _index < len(input_ids):
|
165 |
chose_indices.append(_index)
|
166 |
if len(chose_indices):
|
167 |
-
|
168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
bar = st.progress(0)
|
170 |
-
|
171 |
-
|
|
|
172 |
else:
|
173 |
_new_ids = []
|
174 |
_sl = {}
|
@@ -184,15 +250,20 @@ if button('Tokenize', key='tokenizer'):
|
|
184 |
else:
|
185 |
_tmp.append(input_ids[j])
|
186 |
_new_ids.append(_tmp)
|
187 |
-
# st.markdown(_new_ids)
|
188 |
outputs = [tokenizer.decode(_new_ids[i]).split('</s></s>')[0] for i in range(restarts)]
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
194 |
st.success(f'We found {restarts} replacements!', icon="✅")
|
195 |
-
st.markdown('<br>'.join(outputs), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
196 |
else:
|
197 |
st.error('At least choose one subword.')
|
198 |
|
|
|
1 |
import streamlit as st
|
|
|
2 |
from streamlit_extras.stateful_button import button
|
3 |
+
import os
|
4 |
+
import openai
|
5 |
+
from transformers import GPT2Tokenizer, GPT2Model, AutoTokenizer, AutoModelForCausalLM
|
|
|
6 |
import pickle
|
|
|
|
|
7 |
import torch
|
8 |
from copy import deepcopy
|
9 |
from time import time
|
10 |
+
from transformers import pipeline, set_seed
|
11 |
+
import platform
|
12 |
|
13 |
+
# init
|
14 |
+
openai.api_key = os.environ.get('openai_api_key')
|
15 |
+
all_keys = pickle.load(open('keys.pkl', 'rb'))
|
16 |
+
all_keys = [i.strip() for i in all_keys]
|
17 |
|
18 |
+
set_seed(0)
|
19 |
+
# sidebar instructions
|
20 |
+
st.sidebar.markdown('On this page, we offer a tool for generating replacement words using secret languages.')
|
21 |
st.sidebar.markdown('#### Require ')
|
22 |
st.sidebar.markdown('`Input text`: a sentence or paragraph.')
|
23 |
st.sidebar.markdown('`Number of replacements`: the number of secret language samples.')
|
24 |
st.sidebar.markdown('`Steps for searching Secret Langauge`: the steps in the SecretFinding process.')
|
|
|
25 |
st.sidebar.markdown('#### Two methods')
|
26 |
+
st.sidebar.markdown('1. Searching secret languages based on models: this method calculates secret languages using [GPT-2](https://huggingface.co/gpt2), [EleutherAI/gpt-neo-1.3B](https://huggingface.co/EleutherAI/gpt-neo-1.3B), [EleutherAI/gpt-neo-2.7B](https://huggingface.co/EleutherAI/gpt-neo-2.7B), [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b), or [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B).')
|
27 |
st.sidebar.markdown('2. Use the secret language we found on ALBERT, DistillBERT, and Roberta: this method replaces words directly with the secret language dictionary derived from ALBERT, DistillBERT, and Roberta.')
|
28 |
|
29 |
+
st.sidebar.markdown('#### Return')
|
30 |
+
st.sidebar.markdown(
|
31 |
+
'To see whether the white attack works on LLMs, we set seed to 0.'
|
32 |
+
)
|
33 |
+
st.sidebar.markdown(
|
34 |
+
'To see whether the blackbox attack works on LLMs, we also add the response using [Codex](https://openai.com/blog/openai-codex/). '
|
35 |
+
'Specifically, we use the `code-davinci-002` model with 16 max_tokens responses.'
|
36 |
+
)
|
37 |
|
38 |
+
# title
|
39 |
+
st.title('Blackbox Attack')
|
40 |
|
41 |
+
# online search
|
42 |
+
def run(model, tokenizer, embedidng_layer=None, _bar_text=None, bar=None, text='Which name is also used to describe the Amazon rainforest in English?',
|
43 |
+
loss_funt=torch.nn.MSELoss(), lr=1, noise_mask=[1,2], restarts=10, step=100, device = torch.device('cpu'),
|
44 |
+
sl_paint_red=False, model_choice='GPT-2'):
|
45 |
+
subword_num = embedidng_layer.weight.shape[0]
|
46 |
|
47 |
_input = tokenizer([text] * restarts, return_tensors="pt")
|
48 |
for k in _input.keys():
|
49 |
_input[k] = _input[k].to(device)
|
50 |
|
51 |
+
ori_output = model(**_input)
|
52 |
+
if 'last_hidden_state' in ori_output:
|
53 |
+
ori_output = ori_output['last_hidden_state']
|
54 |
+
else:
|
55 |
+
ori_output = ori_output['logits']
|
56 |
|
57 |
+
ori_embedding = embedidng_layer(_input['input_ids']).detach()
|
58 |
ori_embedding.requires_grad = False
|
59 |
ori_word_one_hot = torch.nn.functional.one_hot(_input['input_ids'].detach(), num_classes=subword_num).to(device)
|
60 |
|
|
|
71 |
for i in range(len(noise_mask)):
|
72 |
_tmp_perturbed_input = ori_word_one_hot[:, noise_mask[i]] + noise[:, i]
|
73 |
_tmp_perturbed_input /= _tmp_perturbed_input.sum(-1, keepdim=True)
|
74 |
+
perturbed_embedding[:, noise_mask[i]] = torch.matmul(_tmp_perturbed_input, embedidng_layer.weight)
|
75 |
|
76 |
_input_['inputs_embeds'] = perturbed_embedding
|
77 |
+
outputs_perturbed = model(**_input_)
|
78 |
+
if 'last_hidden_state' in outputs_perturbed:
|
79 |
+
outputs_perturbed = outputs_perturbed['last_hidden_state']
|
80 |
+
else:
|
81 |
+
outputs_perturbed = outputs_perturbed['logits']
|
82 |
|
83 |
loss = loss_funt(ori_output, outputs_perturbed)
|
84 |
loss.backward()
|
85 |
noise.data = (noise.data - lr * noise.grad.detach())
|
86 |
noise.grad.zero_()
|
87 |
+
_bar_text.text(f'Using {model_choice}, {(time() - start_time) * (step - _i - 1) / (_i + 1):.2f} seconds left')
|
88 |
# validate
|
89 |
with torch.no_grad():
|
90 |
perturbed_inputs = deepcopy(_input)
|
|
|
96 |
perturbed_questions = []
|
97 |
for i in range(restarts):
|
98 |
perturbed_questions.append(tokenizer.decode(perturbed_inputs["input_ids"][i]).split("</s></s>")[0])
|
99 |
+
if sl_paint_red:
|
100 |
+
for i in range(len(perturbed_questions)):
|
101 |
+
for j in noise_mask:
|
102 |
+
_j = tokenizer.decode(perturbed_inputs["input_ids"][i][j])
|
103 |
+
# print(f'_j {_j}')
|
104 |
+
perturbed_questions[i] = perturbed_questions[i].replace(_j, f':red[{_j}]')
|
105 |
return perturbed_questions
|
106 |
|
107 |
+
# get secret language using the found dictionary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
def get_secret_language(title):
|
109 |
if ord(title[0]) in list(range(48, 57)):
|
110 |
file_name = 'num_dict.pkl'
|
|
|
125 |
break
|
126 |
return _sls_id
|
127 |
|
128 |
+
# openai api
|
129 |
+
def get_codex_response(prompt):
    """Return a short Codex completion for ``prompt`` as a single string.

    The prompt is sent to ``openai.Completion.create`` with the
    ``code-davinci-002`` engine, ``max_tokens=16``, ``temperature=0`` and
    ``logprobs=1``. The completion text is rebuilt by concatenating the
    token strings reported under ``logprobs`` for the first choice.

    If anything raises an ``Exception`` along the way, it is not
    propagated: the exception message is returned instead, with a
    hard-coded organisation identifier removed from it.
    """
    request = dict(
        engine='code-davinci-002',
        prompt=prompt,
        max_tokens=16,
        temperature=0,
        logprobs=1,
    )
    try:
        completion = openai.Completion.create(**request)
        first_choice = completion['choices'][0]
        return ''.join(first_choice['logprobs']['tokens'])
    except Exception as err:
        # Best-effort: hand back the error text rather than failing, but
        # strip the organisation id so it is not echoed to the caller.
        return str(err).replace('org-oOthbOAqOPamO9jhWBjUwDRa', '')
|
142 |
+
|
143 |
+
# help function
|
144 |
+
def clf_keys():
    """Reset the tokenizer-related entries of ``st.session_state``.

    Used as the ``on_change`` callback of the input text area. For every
    key currently in the session state:

    * ``'tokenizer'`` and ``'start'`` are set back to ``False``;
    * any key containing ``'tokenizer_'`` is deleted;
    * all other keys are left untouched.
    """
    # Iterate over a snapshot of the keys: entries are deleted inside the
    # loop, and mutating a mapping while walking its live key view is only
    # safe if the mapping happens to return a copy.
    for key in list(st.session_state.keys()):
        if key in ['tokenizer', 'start']:
            st.session_state[key] = False
        elif 'tokenizer_' in key:
            del st.session_state[key]
|
150 |
+
|
151 |
+
# main page
|
152 |
+
option = st.selectbox(
|
153 |
+
'Which method you would like to use?',
|
154 |
+
('Searching secret languages based on models', 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.')
|
155 |
+
)
|
156 |
+
|
157 |
+
title = st.text_area('Input text.', 'Which name is also used to describe the Amazon rainforest in English?', on_change=clf_keys)
|
158 |
+
|
159 |
+
if option == 'Searching secret languages based on models':
|
160 |
+
model_choice = st.selectbox(
|
161 |
+
'Which model you would like to use?',
|
162 |
+
# ('GPT-2', "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-neox-20b", "EleutherAI/gpt-j-6B")
|
163 |
+
('GPT-2', "EleutherAI/gpt-neo-1.3B")
|
164 |
+
)
|
165 |
+
_cols = st.columns(2)
|
166 |
+
restarts = _cols[0].number_input('Number of replacements.', value=10, min_value=1, step=1, format='%d')
|
167 |
+
step = _cols[1].number_input('Steps for searching Secret Langauge', value=100, min_value=1, step=1, format='%d')
|
168 |
+
else:
|
169 |
+
restarts = st.number_input('Number of replacements.', value=10, min_value=1, step=1, format='%d')
|
170 |
+
|
171 |
if button('Tokenize', key='tokenizer'):
|
172 |
+
if option == 'Searching secret languages based on models':
|
173 |
+
if model_choice == 'GPT-2':
|
174 |
+
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
175 |
+
else:
|
176 |
+
tokenizer = AutoTokenizer.from_pretrained(model_choice)
|
177 |
+
else:
|
178 |
+
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
179 |
for key in st.session_state.keys():
|
180 |
if key not in ['tokenizer', 'start'] and 'tokenizer_' not in key:
|
181 |
del st.session_state[key]
|
|
|
210 |
if _index < len(input_ids):
|
211 |
chose_indices.append(_index)
|
212 |
if len(chose_indices):
|
213 |
+
if option == 'Searching secret languages based on models':
|
214 |
+
if model_choice == 'GPT-2':
|
215 |
+
model = GPT2Model.from_pretrained('gpt2')
|
216 |
+
else:
|
217 |
+
model = AutoModelForCausalLM.from_pretrained(model_choice)
|
218 |
+
generator = pipeline('text-generation', model='gpt2')
|
219 |
+
if not platform.system().lower() == 'darwin':
|
220 |
+
generator1 = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')
|
221 |
+
with st.expander('**Original input text**: '+ title):
|
222 |
+
output_openai = get_codex_response(title)
|
223 |
+
st.markdown(f'The response of GPT-2 with the prompt :blue[{title}]')
|
224 |
+
st.markdown('<blockquote>' + generator(title, max_length=30, num_return_sequences=1)[0]['generated_text'].replace(title, '', 1) + '</blockquote>', unsafe_allow_html=True)
|
225 |
+
if not platform.system().lower() == 'darwin':
|
226 |
+
st.markdown(f'The response of {model_choice} with the prompt :blue[{title}]')
|
227 |
+
st.markdown('<blockquote>' + generator1("EleutherAI has", do_sample=True, min_length=50)[0]['generated_text'].replace(title, '', 1) + '</blockquote>', unsafe_allow_html=True)
|
228 |
+
|
229 |
+
st.markdown(f'The response of [Codex](https://openai.com/blog/openai-codex/) with the prompt :blue[{title}]')
|
230 |
+
st.markdown('<blockquote>' + output_openai + '</blockquote>', unsafe_allow_html=True)
|
231 |
+
|
232 |
+
if option == 'Searching secret languages based on models':
|
233 |
+
_bar_text = st.empty()
|
234 |
bar = st.progress(0)
|
235 |
+
outputs = run(model, tokenizer, model.wte if model_choice == 'GPT-2' else model.transformer.wte,
|
236 |
+
_bar_text=_bar_text, bar=bar, text=title, noise_mask=chose_indices, restarts=restarts, step=step,
|
237 |
+
model_choice=model_choice)
|
238 |
else:
|
239 |
_new_ids = []
|
240 |
_sl = {}
|
|
|
250 |
else:
|
251 |
_tmp.append(input_ids[j])
|
252 |
_new_ids.append(_tmp)
|
|
|
253 |
outputs = [tokenizer.decode(_new_ids[i]).split('</s></s>')[0] for i in range(restarts)]
|
254 |
+
if False:
|
255 |
+
original_outputs = outputs
|
256 |
+
for i in range(len(outputs)):
|
257 |
+
for j in _used_sl:
|
258 |
+
_j = tokenizer.decode(j)
|
259 |
+
outputs[i] = outputs[i].replace(_j, f':red[{_j}]')
|
260 |
st.success(f'We found {restarts} replacements!', icon="✅")
|
261 |
+
# st.markdown('<br>'.join(outputs), unsafe_allow_html=True)
|
262 |
+
for i in range(restarts):
|
263 |
+
with st.expander(outputs[i]):
|
264 |
+
output_openai = get_codex_response(outputs[i])
|
265 |
+
st.markdown(f'The response of [Codex](https://openai.com/blog/openai-codex/) with the prompt :blue[{outputs[i]}]')
|
266 |
+
st.markdown('<blockquote>' + output_openai + '</blockquote>', unsafe_allow_html=True)
|
267 |
else:
|
268 |
st.error('At least choose one subword.')
|
269 |
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ pandas
|
|
2 |
PyDictionary
|
3 |
streamlit_extras
|
4 |
transformers
|
5 |
-
torch
|
|
|
|
2 |
PyDictionary
|
3 |
streamlit_extras
|
4 |
transformers
|
5 |
+
torch
|
6 |
+
openai
|