Spaces:
Runtime error
Runtime error
anonymousauthors
committed on
Commit
•
5fb4442
1
Parent(s):
2247711
Upload 4 files
Browse files- SecretLanguage.py +11 -4
- pages/0_π_Dictionary_(Search).py +13 -1
- pages/2_π_Blackbox_Attack.py +37 -6
SecretLanguage.py
CHANGED
@@ -15,23 +15,30 @@ st.set_page_config(layout="wide", page_title="ACl23 Secret Language")
|
|
15 |
|
16 |
st.title("ACl23 Submission: Finding Secret Language")
|
17 |
|
|
|
|
|
18 |
st.markdown('This webpage serves as an illustration of an anonymous submission to ACL 23.')
|
19 |
|
20 |
-
st.markdown('###
|
21 |
-
st.markdown('We present two methods for searching secret
|
|
|
22 |
|
23 |
-
st.markdown(
|
|
|
24 |
"The hyperlinks sometimes might not work due to the contained property of Hugging Face space.")
|
25 |
st.image(search_image, caption='A search example.')
|
26 |
|
|
|
27 |
st.markdown('By clicking on the initial letters (A to Z, numbers, and other characters), you can view all the words whose secret languages have been discovered and that begin with the selected initial. By clicking on a word, you will be redirected to the search page, where you can view information about the selected word.')
|
28 |
st.image(browse_image, caption='A browse example.')
|
29 |
|
|
|
|
|
|
|
30 |
st.markdown('### Models and datasets.')
|
31 |
|
32 |
st.markdown('On this page, we present the secret languages we discovered using ALBERT, DistillBERT, and Roberta models and data from the GLUE (MRPC), SNLI, and SQuAD datasets.')
|
33 |
|
34 |
-
|
35 |
st.markdown('### Ethics statements for this webpage')
|
36 |
|
37 |
st.markdown('We present secret languages discovered using our proposed algorithms. '
|
|
|
15 |
|
16 |
st.title("ACl23 Submission: Finding Secret Language")
|
17 |
|
18 |
+
# st.sidebar.markdown("### This webpage serves as an illustration of an anonymous submission to ACL 23.")
|
19 |
+
|
20 |
st.markdown('This webpage serves as an illustration of an anonymous submission to ACL 23.')
|
21 |
|
22 |
+
st.markdown('### What do we offer?')
|
23 |
+
st.markdown('We present two methods for searching secret languages. The first method is a direct search using the "π Dictionary (Search)" option, while the second method, "π Dictionary (Browse)", involves browsing words that have already been found to have secret languages. '
|
24 |
+
'Additionally, we also provide a tool for finding secret languages in a black-box manner.')
|
25 |
|
26 |
+
st.markdown('#### How to use "π Dictionary (Search)"?')
|
27 |
+
st.markdown("By entering a word you want to find its secret languages, you can view the word's meaning in English, all the secret languages we have discovered for it, and examples. "
|
28 |
"The hyperlinks sometimes might not work due to the contained property of Hugging Face space.")
|
29 |
st.image(search_image, caption='A search example.')
|
30 |
|
31 |
+
st.markdown('#### How to use "π Dictionary (Browse)"?')
|
32 |
st.markdown('By clicking on the initial letters (A to Z, numbers, and other characters), you can view all the words whose secret languages have been discovered and that begin with the selected initial. By clicking on a word, you will be redirected to the search page, where you can view information about the selected word.')
|
33 |
st.image(browse_image, caption='A browse example.')
|
34 |
|
35 |
+
st.markdown('#### How to use "π Blackbox Attack"?')
|
36 |
+
st.markdown('We offer two methods for generating replacement words using secret languages. Detailed introduction can be found on the page.')
|
37 |
+
|
38 |
st.markdown('### Models and datasets.')
|
39 |
|
40 |
st.markdown('On this page, we present the secret languages we discovered using ALBERT, DistillBERT, and Roberta models and data from the GLUE (MRPC), SNLI, and SQuAD datasets.')
|
41 |
|
|
|
42 |
st.markdown('### Ethics statements for this webpage')
|
43 |
|
44 |
st.markdown('We present secret languages discovered using our proposed algorithms. '
|
pages/0_π_Dictionary_(Search).py
CHANGED
@@ -36,6 +36,18 @@ for key in st.session_state.keys():
|
|
36 |
|
37 |
title = st.sidebar.text_input(":red[Search secret languages given the following word (case-sensitive)]", default_title)
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
if ord(title[0]) in list(range(48, 57)):
|
40 |
file_name = 'num_dict.pkl'
|
41 |
elif ord(title[0]) in list(range(97, 122)) + list(range(65, 90)):
|
@@ -230,7 +242,7 @@ if title in datas:
|
|
230 |
_string += 'question**: :'
|
231 |
elif task == 'Paraphrase':
|
232 |
_string += 'sentence 1**: :'
|
233 |
-
_string += f'red[{_all[_sl]["Replaced hypothesis"][j]}]'.replace(":", "[colon]")
|
234 |
if task == 'NLI':
|
235 |
_string += '<br> **Premise**: :'
|
236 |
elif task == 'QA':
|
|
|
36 |
|
37 |
title = st.sidebar.text_input(":red[Search secret languages given the following word (case-sensitive)]", default_title)
|
38 |
|
39 |
+
st.sidebar.markdown("### Frequent FAQs")
|
40 |
+
st.sidebar.markdown("1. *Why are words in sentences represented as subwords instead of complete words?*<br>"
|
41 |
+
"The tokenizer we use is from DistillBERT, ALBERT, or Roberta, which tokenizes sentences into subwords. As a result, the word being replaced in a sentence might be a subword (such as `rain` in `rainforest`).",
|
42 |
+
unsafe_allow_html=True)
|
43 |
+
st.sidebar.markdown("2. *This page is extremely slow. I cannot stand it.*<br>"
|
44 |
+
"We apologize for the slow performance of this page. We are actively working on improving it."
|
45 |
+
"As loading the data can take time and some words have many secret languages, this page needs time to process.",
|
46 |
+
unsafe_allow_html=True)
|
47 |
+
st.sidebar.markdown("3. *Why are some examples significantly different from the original sentences? *<br>"
|
48 |
+
"As per our submission, we replace 1 to 10 subwords in a sentence. However, for some examples with short lengths, the entire sentence may be altered. We are conducting experiments and will present examples where only a single subword has been changed.",
|
49 |
+
unsafe_allow_html=True)
|
50 |
+
|
51 |
if ord(title[0]) in list(range(48, 57)):
|
52 |
file_name = 'num_dict.pkl'
|
53 |
elif ord(title[0]) in list(range(97, 122)) + list(range(65, 90)):
|
|
|
242 |
_string += 'question**: :'
|
243 |
elif task == 'Paraphrase':
|
244 |
_string += 'sentence 1**: :'
|
245 |
+
_string += f'red[{_all[_sl]["Replaced hypothesis"][j]}]'.replace('/', '\\').replace(___sl, f"<i><b>{___sl}</b></i>").replace(":", "[colon]")
|
246 |
if task == 'NLI':
|
247 |
_string += '<br> **Premise**: :'
|
248 |
elif task == 'QA':
|
pages/2_π_Blackbox_Attack.py
CHANGED
@@ -15,10 +15,17 @@ from time import time
|
|
15 |
st.title('Blackbox Attack')
|
16 |
st.sidebar.markdown('On this page, we offer a tool for generating replacement words using secret languages.')
|
17 |
|
18 |
-
st.sidebar.markdown('
|
19 |
-
st.sidebar.markdown('
|
|
|
|
|
|
|
|
|
|
|
20 |
st.sidebar.markdown('2. Use the secret language we found on ALBERT, DistillBERT, and Roberta: this method replaces words directly with the secret language dictionary derived from ALBERT, DistillBERT, and Roberta.')
|
21 |
|
|
|
|
|
22 |
def run(model, _bar_text=None, bar=None, text='Which name is also used to describe the Amazon rainforest in English?', loss_funt=torch.nn.MSELoss(), lr=1, noise_mask=[1,2], restarts=10, step=100, device = torch.device('cpu')):
|
23 |
subword_num = model.wte.weight.shape[0]
|
24 |
|
@@ -66,6 +73,11 @@ def run(model, _bar_text=None, bar=None, text='Which name is also used to descri
|
|
66 |
perturbed_questions = []
|
67 |
for i in range(restarts):
|
68 |
perturbed_questions.append(tokenizer.decode(perturbed_inputs["input_ids"][i]).split("</s></s>")[0])
|
|
|
|
|
|
|
|
|
|
|
69 |
return perturbed_questions
|
70 |
|
71 |
|
@@ -80,7 +92,15 @@ option = st.selectbox(
|
|
80 |
('GPT-2 (Searching secret languages based on GPT-2)', 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.')
|
81 |
)
|
82 |
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
if option == 'GPT-2 (Searching secret languages based on GPT-2)':
|
86 |
_cols = st.columns(2)
|
@@ -124,8 +144,11 @@ if button('Tokenize', key='tokenizer'):
|
|
124 |
_index = i * 6 + j
|
125 |
if _index < _len:
|
126 |
disable = False
|
127 |
-
if
|
128 |
-
|
|
|
|
|
|
|
129 |
button(subwords[_index], key=f'tokenizer_{_index}', disabled=disable)
|
130 |
|
131 |
|
@@ -136,8 +159,10 @@ if button('Tokenize', key='tokenizer'):
|
|
136 |
for key in st.session_state:
|
137 |
if st.session_state[key]:
|
138 |
if 'tokenizer_' in key:
|
|
|
139 |
# st.markdown(key)
|
140 |
-
|
|
|
141 |
if len(chose_indices):
|
142 |
_bar_text = st.empty()
|
143 |
if option == 'GPT-2 (Searching secret languages based on GPT-2)':
|
@@ -147,6 +172,7 @@ if button('Tokenize', key='tokenizer'):
|
|
147 |
else:
|
148 |
_new_ids = []
|
149 |
_sl = {}
|
|
|
150 |
for j in chose_indices:
|
151 |
_sl[j] = get_secret_language(tokenizer.decode(input_ids[j]).strip())
|
152 |
for i in range(restarts):
|
@@ -154,11 +180,16 @@ if button('Tokenize', key='tokenizer'):
|
|
154 |
for j in range(len(input_ids)):
|
155 |
if j in chose_indices:
|
156 |
_tmp.append(_sl[j][i % len(_sl[j])])
|
|
|
157 |
else:
|
158 |
_tmp.append(input_ids[j])
|
159 |
_new_ids.append(_tmp)
|
160 |
# st.markdown(_new_ids)
|
161 |
outputs = [tokenizer.decode(_new_ids[i]).split('</s></s>')[0] for i in range(restarts)]
|
|
|
|
|
|
|
|
|
162 |
|
163 |
st.success(f'We found {restarts} replacements!', icon="✅")
|
164 |
st.markdown('<br>'.join(outputs), unsafe_allow_html=True)
|
|
|
15 |
st.title('Blackbox Attack')
|
16 |
st.sidebar.markdown('On this page, we offer a tool for generating replacement words using secret languages.')
|
17 |
|
18 |
+
st.sidebar.markdown('#### Require ')
|
19 |
+
st.sidebar.markdown('`Input text`: a sentence or paragraph.')
|
20 |
+
st.sidebar.markdown('`Number of replacements`: the number of secret language samples.')
|
21 |
+
st.sidebar.markdown('`Steps for searching Secret Langauge`: the steps in the SecretFinding process.')
|
22 |
+
|
23 |
+
st.sidebar.markdown('#### Two methods')
|
24 |
+
st.sidebar.markdown('1. GPT-2 (Searching secret languages based on GPT-2): this method calculates secret languages using [GPT-2](https://huggingface.co/gpt2).')
|
25 |
st.sidebar.markdown('2. Use the secret language we found on ALBERT, DistillBERT, and Roberta: this method replaces words directly with the secret language dictionary derived from ALBERT, DistillBERT, and Roberta.')
|
26 |
|
27 |
+
|
28 |
+
|
29 |
def run(model, _bar_text=None, bar=None, text='Which name is also used to describe the Amazon rainforest in English?', loss_funt=torch.nn.MSELoss(), lr=1, noise_mask=[1,2], restarts=10, step=100, device = torch.device('cpu')):
|
30 |
subword_num = model.wte.weight.shape[0]
|
31 |
|
|
|
73 |
perturbed_questions = []
|
74 |
for i in range(restarts):
|
75 |
perturbed_questions.append(tokenizer.decode(perturbed_inputs["input_ids"][i]).split("</s></s>")[0])
|
76 |
+
for i in range(len(perturbed_questions)):
|
77 |
+
for j in noise_mask:
|
78 |
+
_j = tokenizer.decode(perturbed_inputs["input_ids"][i][j])
|
79 |
+
# print(f'_j {_j}')
|
80 |
+
perturbed_questions[i] = perturbed_questions[i].replace(_j, f':red[{_j}]')
|
81 |
return perturbed_questions
|
82 |
|
83 |
|
|
|
92 |
('GPT-2 (Searching secret languages based on GPT-2)', 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.')
|
93 |
)
|
94 |
|
95 |
+
def clf_keys():
|
96 |
+
for key in st.session_state.keys():
|
97 |
+
if key in ['tokenizer', 'start']:
|
98 |
+
st.session_state[key] = False
|
99 |
+
elif 'tokenizer_' in key:
|
100 |
+
del st.session_state[key]
|
101 |
+
|
102 |
+
|
103 |
+
title = st.text_area('Input text.', 'Which name is also used to describe the Amazon rainforest in English?', on_change=clf_keys)
|
104 |
|
105 |
if option == 'GPT-2 (Searching secret languages based on GPT-2)':
|
106 |
_cols = st.columns(2)
|
|
|
144 |
_index = i * 6 + j
|
145 |
if _index < _len:
|
146 |
disable = False
|
147 |
+
if option == 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.':
|
148 |
+
if subwords[_index].strip() not in all_keys:
|
149 |
+
disable = True
|
150 |
+
# if f'tokenizer_{_index}' in st.session_state:
|
151 |
+
# del st.session_state[f'tokenizer_{_index}']
|
152 |
button(subwords[_index], key=f'tokenizer_{_index}', disabled=disable)
|
153 |
|
154 |
|
|
|
159 |
for key in st.session_state:
|
160 |
if st.session_state[key]:
|
161 |
if 'tokenizer_' in key:
|
162 |
+
_index = int(key.replace('tokenizer_', ''))
|
163 |
# st.markdown(key)
|
164 |
+
if _index < len(input_ids):
|
165 |
+
chose_indices.append(_index)
|
166 |
if len(chose_indices):
|
167 |
_bar_text = st.empty()
|
168 |
if option == 'GPT-2 (Searching secret languages based on GPT-2)':
|
|
|
172 |
else:
|
173 |
_new_ids = []
|
174 |
_sl = {}
|
175 |
+
_used_sl = []
|
176 |
for j in chose_indices:
|
177 |
_sl[j] = get_secret_language(tokenizer.decode(input_ids[j]).strip())
|
178 |
for i in range(restarts):
|
|
|
180 |
for j in range(len(input_ids)):
|
181 |
if j in chose_indices:
|
182 |
_tmp.append(_sl[j][i % len(_sl[j])])
|
183 |
+
_used_sl.append(_sl[j][i % len(_sl[j])])
|
184 |
else:
|
185 |
_tmp.append(input_ids[j])
|
186 |
_new_ids.append(_tmp)
|
187 |
# st.markdown(_new_ids)
|
188 |
outputs = [tokenizer.decode(_new_ids[i]).split('</s></s>')[0] for i in range(restarts)]
|
189 |
+
for i in range(len(outputs)):
|
190 |
+
for j in _used_sl:
|
191 |
+
_j = tokenizer.decode(j)
|
192 |
+
outputs[i] = outputs[i].replace(_j, f':red[{_j}]')
|
193 |
|
194 |
st.success(f'We found {restarts} replacements!', icon="✅")
|
195 |
st.markdown('<br>'.join(outputs), unsafe_allow_html=True)
|