anonymousauthors committed on
Commit
5fb4442
•
1 Parent(s): 2247711

Upload 4 files

Browse files
SecretLanguage.py CHANGED
@@ -15,23 +15,30 @@ st.set_page_config(layout="wide", page_title="ACl23 Secret Language")
15
 
16
  st.title("ACl23 Submission: Finding Secret Language")
17
 
 
 
18
  st.markdown('This webpage serves as an illustration of an anonymous submission to ACL 23.')
19
 
20
- st.markdown('### How to play with this page?')
21
- st.markdown('We present two methods for searching secret language: a direct search using the Dictionary (Search) option, and browsing words that have already been found for secret languages.')
 
22
 
23
- st.markdown("By entering a word you want to find its secret languages, you can view the word's meaning in English, all the secret languages we have discovered for it, and examples."
 
24
  "The hyperlinks sometimes might not work due to the contained property of Hugging Face space.")
25
  st.image(search_image, caption='A search example.')
26
 
 
27
  st.markdown('By clicking on the initial letters (A to Z, numbers, and other characters), you can view all the words whose secret languages have been discovered and that begin with the selected initial. By clicking on a word, you will be redirected to the search page, where you can view information about the selected word.')
28
  st.image(browse_image, caption='A browse example.')
29
 
 
 
 
30
  st.markdown('### Models and datasets.')
31
 
32
  st.markdown('On this page, we present the secret languages we discovered using ALBERT, DistillBERT, and Roberta models and data from the GLUE (MRPC), SNLI, and SQuAD datasets.')
33
 
34
-
35
  st.markdown('### Ethics statements for this webpage')
36
 
37
  st.markdown('We present secret languages discovered using our proposed algorithms. '
 
15
 
16
  st.title("ACl23 Submission: Finding Secret Language")
17
 
18
+ # st.sidebar.markdown("### This webpage serves as an illustration of an anonymous submission to ACL 23.")
19
+
20
  st.markdown('This webpage serves as an illustration of an anonymous submission to ACL 23.')
21
 
22
+ st.markdown('### What do we offer?')
23
+ st.markdown('We present two methods for searching secret languages. The first method is a direct search using the "📙 Dictionary (Search)" option, while the second method, "📖 Dictionary (Browse)", involves browsing words that have already been found to have secret languages. '
24
+ 'Additionally, we also provide a tool for finding secret languages in a black-box manner.')
25
 
26
+ st.markdown('#### How to use "📙 Dictionary (Search)"?')
27
+ st.markdown("By entering a word you want to find its secret languages, you can view the word's meaning in English, all the secret languages we have discovered for it, and examples. "
28
  "The hyperlinks sometimes might not work due to the contained property of Hugging Face space.")
29
  st.image(search_image, caption='A search example.')
30
 
31
+ st.markdown('#### How to use "📖 Dictionary (Browse)"?')
32
  st.markdown('By clicking on the initial letters (A to Z, numbers, and other characters), you can view all the words whose secret languages have been discovered and that begin with the selected initial. By clicking on a word, you will be redirected to the search page, where you can view information about the selected word.')
33
  st.image(browse_image, caption='A browse example.')
34
 
35
+ st.markdown('#### How to use "😈 Blackbox Attack"?')
36
+ st.markdown('We offer two methods for generating replacement words using secret languages. Detailed introduction can be found on the page.')
37
+
38
  st.markdown('### Models and datasets.')
39
 
40
  st.markdown('On this page, we present the secret languages we discovered using ALBERT, DistillBERT, and Roberta models and data from the GLUE (MRPC), SNLI, and SQuAD datasets.')
41
 
 
42
  st.markdown('### Ethics statements for this webpage')
43
 
44
  st.markdown('We present secret languages discovered using our proposed algorithms. '
pages/0_📙_Dictionary_(Search).py CHANGED
@@ -36,6 +36,18 @@ for key in st.session_state.keys():
36
 
37
  title = st.sidebar.text_input(":red[Search secret languages given the following word (case-sensitive)]", default_title)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  if ord(title[0]) in list(range(48, 57)):
40
  file_name = 'num_dict.pkl'
41
  elif ord(title[0]) in list(range(97, 122)) + list(range(65, 90)):
@@ -230,7 +242,7 @@ if title in datas:
230
  _string += 'question**: :'
231
  elif task == 'Paraphrase':
232
  _string += 'sentence 1**: :'
233
- _string += f'red[{_all[_sl]["Replaced hypothesis"][j]}]'.replace(":", "[colon]")
234
  if task == 'NLI':
235
  _string += '<br> **Premise**: :'
236
  elif task == 'QA':
 
36
 
37
  title = st.sidebar.text_input(":red[Search secret languages given the following word (case-sensitive)]", default_title)
38
 
39
+ st.sidebar.markdown("### Frequent FAQs")
40
+ st.sidebar.markdown("1. *Why are words in sentences represented as subwords instead of complete words?*<br>"
41
+ "The tokenizer we use is from DistillBERT, ALBERT, or Roberta, which tokenizes sentences into subwords. As a result, the word being replaced in a sentence might be a subword (such as `rain` in `rainforest`).",
42
+ unsafe_allow_html=True)
43
+ st.sidebar.markdown("2. *This page is extremely slow. I cannot stand it.*<br>"
44
+ "We apologize for the slow performance of this page. We are actively working on improving it."
45
+ "As loading the data can take time and some words have many secret languages, this page needs time to process.",
46
+ unsafe_allow_html=True)
47
+ st.sidebar.markdown("3. *Why are some examples significantly different from the original sentences? *<br>"
48
+ "As per our submission, we replace 1 to 10 subwords in a sentence. However, for some examples with short lengths, the entire sentence may be altered. We are conducting experiments and will present examples where only a single subword has been changed.",
49
+ unsafe_allow_html=True)
50
+
51
  if ord(title[0]) in list(range(48, 57)):
52
  file_name = 'num_dict.pkl'
53
  elif ord(title[0]) in list(range(97, 122)) + list(range(65, 90)):
 
242
  _string += 'question**: :'
243
  elif task == 'Paraphrase':
244
  _string += 'sentence 1**: :'
245
+ _string += f'red[{_all[_sl]["Replaced hypothesis"][j]}]'.replace('/', '\\').replace(___sl, f"<i><b>{___sl}</b></i>").replace(":", "[colon]")
246
  if task == 'NLI':
247
  _string += '<br> **Premise**: :'
248
  elif task == 'QA':
pages/2_😈_Blackbox_Attack.py CHANGED
@@ -15,10 +15,17 @@ from time import time
15
  st.title('Blackbox Attack')
16
  st.sidebar.markdown('On this page, we offer a tool for generating replacement words using secret languages.')
17
 
18
- st.sidebar.markdown('There are two methods for generating replacements.')
19
- st.sidebar.markdown('1. GPT-2 (Searching secret languages based on GPT-2): this method calculates secret languages using [GPT-2](https://huggingface.co/gpt2) and requires input text, the number of replacements desired, and the steps. The number of replacements represents the number of sentences you want to generate, while steps refer to the steps in the SecretFinding process.')
 
 
 
 
 
20
  st.sidebar.markdown('2. Use the secret language we found on ALBERT, DistillBERT, and Roberta: this method replaces words directly with the secret language dictionary derived from ALBERT, DistillBERT, and Roberta.')
21
 
 
 
22
  def run(model, _bar_text=None, bar=None, text='Which name is also used to describe the Amazon rainforest in English?', loss_funt=torch.nn.MSELoss(), lr=1, noise_mask=[1,2], restarts=10, step=100, device = torch.device('cpu')):
23
  subword_num = model.wte.weight.shape[0]
24
 
@@ -66,6 +73,11 @@ def run(model, _bar_text=None, bar=None, text='Which name is also used to descri
66
  perturbed_questions = []
67
  for i in range(restarts):
68
  perturbed_questions.append(tokenizer.decode(perturbed_inputs["input_ids"][i]).split("</s></s>")[0])
 
 
 
 
 
69
  return perturbed_questions
70
 
71
 
@@ -80,7 +92,15 @@ option = st.selectbox(
80
  ('GPT-2 (Searching secret languages based on GPT-2)', 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.')
81
  )
82
 
83
- title = st.text_area('Input text.', 'Which name is also used to describe the Amazon rainforest in English?')
 
 
 
 
 
 
 
 
84
 
85
  if option == 'GPT-2 (Searching secret languages based on GPT-2)':
86
  _cols = st.columns(2)
@@ -124,8 +144,11 @@ if button('Tokenize', key='tokenizer'):
124
  _index = i * 6 + j
125
  if _index < _len:
126
  disable = False
127
- if subwords[_index].strip() not in all_keys and option == 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.':
128
- disable = True
 
 
 
129
  button(subwords[_index], key=f'tokenizer_{_index}', disabled=disable)
130
 
131
 
@@ -136,8 +159,10 @@ if button('Tokenize', key='tokenizer'):
136
  for key in st.session_state:
137
  if st.session_state[key]:
138
  if 'tokenizer_' in key:
 
139
  # st.markdown(key)
140
- chose_indices.append(int(key.replace('tokenizer_', '')))
 
141
  if len(chose_indices):
142
  _bar_text = st.empty()
143
  if option == 'GPT-2 (Searching secret languages based on GPT-2)':
@@ -147,6 +172,7 @@ if button('Tokenize', key='tokenizer'):
147
  else:
148
  _new_ids = []
149
  _sl = {}
 
150
  for j in chose_indices:
151
  _sl[j] = get_secret_language(tokenizer.decode(input_ids[j]).strip())
152
  for i in range(restarts):
@@ -154,11 +180,16 @@ if button('Tokenize', key='tokenizer'):
154
  for j in range(len(input_ids)):
155
  if j in chose_indices:
156
  _tmp.append(_sl[j][i % len(_sl[j])])
 
157
  else:
158
  _tmp.append(input_ids[j])
159
  _new_ids.append(_tmp)
160
  # st.markdown(_new_ids)
161
  outputs = [tokenizer.decode(_new_ids[i]).split('</s></s>')[0] for i in range(restarts)]
 
 
 
 
162
 
163
  st.success(f'We found {restarts} replacements!', icon="✅")
164
  st.markdown('<br>'.join(outputs), unsafe_allow_html=True)
 
15
  st.title('Blackbox Attack')
16
  st.sidebar.markdown('On this page, we offer a tool for generating replacement words using secret languages.')
17
 
18
+ st.sidebar.markdown('#### Require ')
19
+ st.sidebar.markdown('`Input text`: a sentence or paragraph.')
20
+ st.sidebar.markdown('`Number of replacements`: the number of secret language samples.')
21
+ st.sidebar.markdown('`Steps for searching Secret Langauge`: the steps in the SecretFinding process.')
22
+
23
+ st.sidebar.markdown('#### Two methods')
24
+ st.sidebar.markdown('1. GPT-2 (Searching secret languages based on GPT-2): this method calculates secret languages using [GPT-2](https://huggingface.co/gpt2).')
25
  st.sidebar.markdown('2. Use the secret language we found on ALBERT, DistillBERT, and Roberta: this method replaces words directly with the secret language dictionary derived from ALBERT, DistillBERT, and Roberta.')
26
 
27
+
28
+
29
  def run(model, _bar_text=None, bar=None, text='Which name is also used to describe the Amazon rainforest in English?', loss_funt=torch.nn.MSELoss(), lr=1, noise_mask=[1,2], restarts=10, step=100, device = torch.device('cpu')):
30
  subword_num = model.wte.weight.shape[0]
31
 
 
73
  perturbed_questions = []
74
  for i in range(restarts):
75
  perturbed_questions.append(tokenizer.decode(perturbed_inputs["input_ids"][i]).split("</s></s>")[0])
76
+ for i in range(len(perturbed_questions)):
77
+ for j in noise_mask:
78
+ _j = tokenizer.decode(perturbed_inputs["input_ids"][i][j])
79
+ # print(f'_j {_j}')
80
+ perturbed_questions[i] = perturbed_questions[i].replace(_j, f':red[{_j}]')
81
  return perturbed_questions
82
 
83
 
 
92
  ('GPT-2 (Searching secret languages based on GPT-2)', 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.')
93
  )
94
 
95
+ def clf_keys():
96
+ for key in st.session_state.keys():
97
+ if key in ['tokenizer', 'start']:
98
+ st.session_state[key] = False
99
+ elif 'tokenizer_' in key:
100
+ del st.session_state[key]
101
+
102
+
103
+ title = st.text_area('Input text.', 'Which name is also used to describe the Amazon rainforest in English?', on_change=clf_keys)
104
 
105
  if option == 'GPT-2 (Searching secret languages based on GPT-2)':
106
  _cols = st.columns(2)
 
144
  _index = i * 6 + j
145
  if _index < _len:
146
  disable = False
147
+ if option == 'Use the secret language we found on ALBERT, DistillBERT, and Roberta.':
148
+ if subwords[_index].strip() not in all_keys:
149
+ disable = True
150
+ # if f'tokenizer_{_index}' in st.session_state:
151
+ # del st.session_state[f'tokenizer_{_index}']
152
  button(subwords[_index], key=f'tokenizer_{_index}', disabled=disable)
153
 
154
 
 
159
  for key in st.session_state:
160
  if st.session_state[key]:
161
  if 'tokenizer_' in key:
162
+ _index = int(key.replace('tokenizer_', ''))
163
  # st.markdown(key)
164
+ if _index < len(input_ids):
165
+ chose_indices.append(_index)
166
  if len(chose_indices):
167
  _bar_text = st.empty()
168
  if option == 'GPT-2 (Searching secret languages based on GPT-2)':
 
172
  else:
173
  _new_ids = []
174
  _sl = {}
175
+ _used_sl = []
176
  for j in chose_indices:
177
  _sl[j] = get_secret_language(tokenizer.decode(input_ids[j]).strip())
178
  for i in range(restarts):
 
180
  for j in range(len(input_ids)):
181
  if j in chose_indices:
182
  _tmp.append(_sl[j][i % len(_sl[j])])
183
+ _used_sl.append(_sl[j][i % len(_sl[j])])
184
  else:
185
  _tmp.append(input_ids[j])
186
  _new_ids.append(_tmp)
187
  # st.markdown(_new_ids)
188
  outputs = [tokenizer.decode(_new_ids[i]).split('</s></s>')[0] for i in range(restarts)]
189
+ for i in range(len(outputs)):
190
+ for j in _used_sl:
191
+ _j = tokenizer.decode(j)
192
+ outputs[i] = outputs[i].replace(_j, f':red[{_j}]')
193
 
194
  st.success(f'We found {restarts} replacements!', icon="✅")
195
  st.markdown('<br>'.join(outputs), unsafe_allow_html=True)