anonymousauthors committed
Commit 91b1515 • 1 Parent(s): 5a5f4eb

Upload 4 files

keys.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:701fae656585df909b98fdcfa4ec681754a99dd9a16eb07559a948725ba25b73
+ size 187154
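Note: keys.pkl is tracked with Git LFS, so the committed blob is only this three-line pointer (spec version, SHA-256 oid of the real content, and its size in bytes, 187,154); the pickle itself is materialized by an LFS-aware clone or `git lfs pull`. A minimal sanity check, sketched under the assumption that keys.pkl sits in the working directory and unpickles to a list of strings (as 2_😈_Blackbox_Attack.py expects):

import pickle

# LFS pointer stubs are small text files starting with 'version https://git-lfs...';
# seeing that marker means the real payload was never fetched.
with open('keys.pkl', 'rb') as f:
    head = f.read(7)
if head == b'version':
    raise RuntimeError('keys.pkl is still an LFS pointer stub; run `git lfs pull` first.')

with open('keys.pkl', 'rb') as f:
    all_keys = pickle.load(f)  # expected: a list of strings, one per dictionary entry
print(len(all_keys))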
pages/1_📖_Dictionary_(Browse).py ADDED
@@ -0,0 +1,175 @@
+ import streamlit as st
+ import pandas as pd
+ # import gdown
+ import os
+ import pickle
+ from streamlit import session_state as _state
+
+ from PyDictionary import PyDictionary
+ from streamlit_extras.colored_header import colored_header
+
+ dictionary = PyDictionary()
+
+ st.set_page_config(layout="wide", page_title="ACl23 Secret Language")
+ from streamlit_extras.switch_page_button import switch_page
+ from streamlit_extras.stateful_button import button
+ # for key in st.session_state.keys():
+ #     del st.session_state[key]
+ # st.markdown(st.session_state)
+ # want_to_contribute = st.button("I want to contribute!")
+ # if want_to_contribute:
+ #     # switch_page("dictionary (search)")
+ #     switch_page("home")
+
+ # st.title("ACl23 Secret Language")
+
+ # sidebar
+ st.sidebar.header("📙 Dictionary (Browse)")
+ # title = st.sidebar.text_input(":red[Search secret languages given the following word]", 'Asian')
+ buttons = {}
+
+ # def call_back():
+ #     for k in buttons:
+ #         if buttons[k]:
+ #             st.text(k)
+
+ with st.sidebar:
+     cols0 = st.columns(8)
+     for i in range(len(cols0)):
+         with cols0[i]:
+             buttons[chr(65 + i)] = st.button(chr(65 + i))
+             # buttons[chr(65 + i)] = button(chr(65 + i), key=chr(65 + i))
+     cols1 = st.columns(8)
+     for i in range(len(cols1)):
+         with cols1[i]:
+             buttons[chr(65 + 8 + i)] = st.button(chr(65 + 8 + i))
+             # buttons[chr(65 + 8 + i)] = button(chr(65 + 8 + i), key=chr(65 + 8 + i))
+     cols2 = st.columns(8)
+     for i in range(len(cols2)):
+         with cols2[i]:
+             buttons[chr(65 + 16 + i)] = st.button(chr(65 + 16 + i))
+             # buttons[chr(65 + 16 + i)] = button(chr(65 + 16 + i), key=chr(65 + 16 + i))
+     cols3 = st.columns(8)
+     for i in range(2):
+         with cols3[i]:
+             buttons[chr(65 + 24 + i)] = st.button(chr(65 + 24 + i))
+             # buttons[chr(65 + 24 + i)] = button(chr(65 + 24 + i), key=chr(65 + 24 + i))
+     cols4 = st.columns(2)
+     buttons['0-9'] = cols4[0].button('0-9')
+     buttons['Others'] = cols4[1].button('Others')
+     # select = st.radio(
+     #     "Select initial to browse.",
+     #     [chr(i) for i in range(97, 123)] + ['0-9', 'Others'],
+     #     key="nokeyjustselect",
+     # )
+
+ # if select == '0-9':
+ #     st.title(select)
+ #     file_names = ['num_dict.pkl']
+ # elif select == 'Others':
+ #     st.title(select)
+ #     file_names = ['other_dict.pkl']
+ # elif ord(select[0]) in list(range(97, 123)) + list(range(65, 91)):
+ #     st.title(chr(ord(select)))
+ #     file_names = [f'{ord(select[0]) - 32}_dict.pkl', f'{ord(select[0])}_dict.pkl']
+ # all_data = {}
+ # all_key = []
+ # for file_name in file_names:
+ #     _data = pickle.load(open(f'all_secret_langauge_by_fist/{file_name}', 'rb'))
+ #     all_data.update(_data)
+ #     all_key.extend(sorted(list(_data.keys())))
+ #     # st.markdown(file_name, unsafe_allow_html=True)
+ #     # st.markdown(_data.keys(), unsafe_allow_html=True)
+
+ # all_key = sorted(list(set(all_key)))
+ # # st.markdown(','.join(all_key))
+ # for key in all_key:
+ #     # if len(key) and key[0] != '"':
+ #     # st.markdown(key, unsafe_allow_html=True)
+ #     # st.change_page("home")
+ #     # st.markdown(f'<a href="Dictionary_(Search)?word={key}" target="_self">{key}</a>', unsafe_allow_html=True)
+ #     # print(key)
+ #     # word_buttons.append(st.button(f'{key}', key=key))
+ #     _button = st.button(f'{key}', key=key)
+ #     if _button:
+ #         st.session_state.click_word = key
+ #         switch_page("dictionary (search)")
+
+
+ # st.markdown(st.session_state)
+ all_word_button = {}
+ # for key in all_word_button:
+ #     if all_word_button[key]:
+ #         st.session_state.click_word = key
+ #         switch_page("dictionary (search)")
+ for k in st.session_state.keys():
+     if st.session_state[k]:
+         if 'button_' in k:
+             st.session_state.click_word = k.split('button_')[-1]
+             # st.markdown(k)
+             switch_page("dictionary (search)")
+
+ # all_condition = False
+ word_buttons = None
+
+ for k in buttons:
+     if buttons[k]:
+         # for _k in buttons:
+         #     if _k != k:
+         #         st.session_state[_k] = False
+
+         word_buttons = []
+         if k == '0-9':
+             # st.title(k)
+             colored_header(
+                 label=k,
+                 description="",
+                 color_name="violet-70",
+             )
+             file_names = ['num_dict.pkl']
+         elif k == 'Others':
+             # st.title(k)
+             colored_header(
+                 label=k,
+                 description="",
+                 color_name="violet-70",
+             )
+             file_names = ['other_dict.pkl']
+         elif ord(k[0]) in list(range(97, 123)) + list(range(65, 91)):
+             # st.title(chr(ord(k)))
+             colored_header(
+                 label=chr(ord(k)),
+                 description="",
+                 color_name="violet-70",
+             )
+             file_names = [f'{ord(k[0]) + 32}_dict.pkl', f'{ord(k[0])}_dict.pkl']
+         all_data = {}
+         all_key = []
+         for file_name in file_names:
+             _data = pickle.load(open(f'all_secret_langauge_by_fist/{file_name}', 'rb'))
+             all_data.update(_data)
+             all_key.extend(sorted(list(_data.keys())))
+             # st.markdown(file_name, unsafe_allow_html=True)
+             # st.markdown(_data.keys(), unsafe_allow_html=True)
+
+         all_key = sorted(list(set(all_key)))
+         # st.markdown(','.join(all_key))
+         for key in all_key:
+             # if len(key) and key[0] != '"':
+             # st.markdown(key, unsafe_allow_html=True)
+             # st.change_page("home")
+             # st.markdown(f'<a href="Dictionary_(Search)?word={key}" target="_self">{key}</a>', unsafe_allow_html=True)
+             # print(key)
+             # word_buttons.append(st.button(f'{key}', key=key))
+             all_word_button[key] = st.button(f'{key}', key=f'button_{key}')
+             # all_word_button[key] = button(f'{key}', key=key)
+
+
+
+ # for _button in word_buttons:
+ #     if _button:
+ #         # st.session_state.click_word = key
+ #         # st.markdown('asdfd')
+ #         switch_page("home")
+ #     with st.expander(key):
+ #         st.markdown(':red[Secret Languages:' + ','.join(all_data[key]['secret languages']), unsafe_allow_html=True)
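Note: the browse page above resolves a letter button to pickle shards by ASCII code: an uppercase K loads both f'{ord(K) + 32}_dict.pkl' (lowercase) and f'{ord(K)}_dict.pkl' (uppercase), digits live in num_dict.pkl, and everything else in other_dict.pkl. A standalone sketch of that shard rule, using a hypothetical helper shard_names (not part of this commit) and the directory name spelled exactly as committed:

import pickle

def shard_names(word):
    # Hypothetical helper mirroring the app's bucketing rule: shards are
    # named after the ASCII code of the entry's first character.
    first = word[0]
    if first.isdigit():
        return ['num_dict.pkl']
    if first.isascii() and first.isalpha():
        # try both case variants, e.g. 'A' -> 97_dict.pkl and 65_dict.pkl
        return [f'{ord(first.lower())}_dict.pkl', f'{ord(first.upper())}_dict.pkl']
    return ['other_dict.pkl']

entry = None
for name in shard_names('Asian'):
    shard = pickle.load(open(f'all_secret_langauge_by_fist/{name}', 'rb'))
    if 'Asian' in shard:
        entry = shard['Asian']  # expected keys: 'secret languages', 'replaced sentences'
        break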
pages/2_😈_Blackbox_Attack.py ADDED
@@ -0,0 +1,162 @@
+ import streamlit as st
+ import os
+ from streamlit_extras.stateful_button import button
+
+ from transformers import GPT2Tokenizer, GPT2Model
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ model = GPT2Model.from_pretrained('gpt2')
+ import pickle
+ all_keys = pickle.load(open('keys.pkl', 'rb'))
+ all_keys = [i.strip() for i in all_keys]
+ import torch
+ from copy import deepcopy
+ from time import time
+ st.title('Blackbox Attack')
+ def run(model, _bar_text=None, bar=None, text='Which name is also used to describe the Amazon rainforest in English?', loss_funt=torch.nn.MSELoss(), lr=1, noise_mask=[1, 2], restarts=10, step=100, device=torch.device('cpu')):
+     subword_num = model.wte.weight.shape[0]
+
+     _input = tokenizer([text] * restarts, return_tensors="pt")
+     for k in _input.keys():
+         _input[k] = _input[k].to(device)
+
+     ori_output = model(**_input)['last_hidden_state']
+
+     ori_embedding = model.wte(_input['input_ids']).detach()
+     ori_embedding.requires_grad = False
+     ori_word_one_hot = torch.nn.functional.one_hot(_input['input_ids'].detach(), num_classes=subword_num).to(device)
+
+     noise = torch.randn(ori_embedding.shape[0], ori_embedding.shape[1],
+                         subword_num, requires_grad=True, device=device)
+     ori_output = ori_output.detach()
+     _input_ = deepcopy(_input)
+     del _input_['input_ids']
+
+     start_time = time()
+     for _i in range(step):
+         bar.progress((_i + 1) / step)
+         perturbed_embedding = ori_embedding.clone()
+         for i in range(len(noise_mask)):
+             _tmp_perturbed_input = ori_word_one_hot[:, noise_mask[i]] + noise[:, i]
+             _tmp_perturbed_input /= _tmp_perturbed_input.sum(-1, keepdim=True)
+             perturbed_embedding[:, noise_mask[i]] = torch.matmul(_tmp_perturbed_input, model.wte.weight)
+
+         _input_['inputs_embeds'] = perturbed_embedding
+         outputs_perturbed = model(**_input_)['last_hidden_state']
+
+         loss = loss_funt(ori_output, outputs_perturbed)
+         loss.backward()
+         noise.data = (noise.data - lr * noise.grad.detach())
+         noise.grad.zero_()
+         _bar_text.text(f'{(time() - start_time) * (step - _i - 1) / (_i + 1):.2f} seconds left')
+     # validate
+     with torch.no_grad():
+         perturbed_inputs = deepcopy(_input)
+         for i in range(len(noise_mask)):
+             _tmp_perturbed_input = ori_word_one_hot[:, noise_mask[i]] + noise[:, i]
+             _tmp_perturbed_input /= _tmp_perturbed_input.sum(-1, keepdim=True)
+             # print(f'torch.argmax(_tmp_perturbed_input, dim=-1).long(){torch.argmax(_tmp_perturbed_input, dim=-1).long()}')
+             perturbed_inputs['input_ids'][:, noise_mask[i]] = torch.argmax(_tmp_perturbed_input, dim=-1).long()
+         perturbed_questions = []
+         for i in range(restarts):
+             perturbed_questions.append(tokenizer.decode(perturbed_inputs["input_ids"][i]).split("</s></s>")[0])
+     return perturbed_questions
+
+
+ from transformers import GPT2Tokenizer, GPT2Model
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ model = GPT2Model.from_pretrained('gpt2')
+ # encoded_input = tokenizer(text, return_tensors='pt')
+ # output = model(**encoded_input)
+
+ option = st.selectbox(
+     'Which method would you like to use?',
+     ('GPT-2 (Searching secret languages based on GPT-2)', 'Use the secret language we found on ALBERT, DistilBERT, and RoBERTa.')
+ )
+
+ title = st.text_area('Input text.', 'Which name is also used to describe the Amazon rainforest in English?')
+
+ if option == 'GPT-2 (Searching secret languages based on GPT-2)':
+     _cols = st.columns(2)
+     restarts = _cols[0].number_input('Number of replacements.', value=10, min_value=1, step=1, format='%d')
+     step = _cols[1].number_input('Steps for searching secret language.', value=100, min_value=1, step=1, format='%d')
+ else:
+     restarts = st.number_input('Number of replacements.', value=10, min_value=1, step=1, format='%d')
+
+ def get_secret_language(title):
+     if ord(title[0]) in list(range(48, 58)):
+         file_name = 'num_dict.pkl'
+     elif ord(title[0]) in list(range(97, 123)) + list(range(65, 91)):
+         file_name = f'{ord(title[0])}_dict.pkl'
+     else:
+         file_name = 'other_dict.pkl'
+     datas = pickle.load(open(f'all_secret_langauge_by_fist/{file_name}', 'rb'))
+     data_ = datas[title.strip()]
+
+     _sls_id = []
+     for i in range(len(data_['secret languages'])):
+         new_ids = tokenizer(data_['replaced sentences'][i])['input_ids']
+         _sl = data_['secret languages'][i]
+         for _id in new_ids:
+             if _sl.strip() == tokenizer.decode(_id):
+                 _sls_id.append(_id)
+                 break
+     return _sls_id
+
+ if button('Tokenize', key='tokenizer'):
+     for key in st.session_state.keys():
+         if key not in ['tokenizer', 'start'] and 'tokenizer_' not in key:
+             del st.session_state[key]
+     input_ids = tokenizer(title)['input_ids']
+     st.markdown('## Choose the (sub)words you want to replace.')
+     subwords = [tokenizer.decode(i) for i in input_ids]
+     _len = len(subwords)
+     for i in range(int(_len / 6) + 1):
+         cols = st.columns(6)
+         for j in range(6):
+             with cols[j]:
+                 _index = i * 6 + j
+                 if _index < _len:
+                     disable = False
+                     if subwords[_index].strip() not in all_keys and option == 'Use the secret language we found on ALBERT, DistilBERT, and RoBERTa.':
+                         disable = True
+                     button(subwords[_index], key=f'tokenizer_{_index}', disabled=disable)
+
+
+     # st.markdown(dict(st.session_state))
+     st.markdown('## Ready to go? Hold on tight.')
+     if button('Give it a shot!', key='start'):
+         chose_indices = []
+         for key in st.session_state:
+             if st.session_state[key]:
+                 if 'tokenizer_' in key:
+                     # st.markdown(key)
+                     chose_indices.append(int(key.replace('tokenizer_', '')))
+         if len(chose_indices):
+             _bar_text = st.empty()
+             if option == 'GPT-2 (Searching secret languages based on GPT-2)':
+                 bar = st.progress(0)
+                 # st.markdown('start')
+                 outputs = run(model, _bar_text=_bar_text, bar=bar, text=title, noise_mask=chose_indices, restarts=restarts, step=step)
+             else:
+                 _new_ids = []
+                 _sl = {}
+                 for j in chose_indices:
+                     _sl[j] = get_secret_language(tokenizer.decode(input_ids[j]).strip())
+                 for i in range(restarts):
+                     _tmp = []
+                     for j in range(len(input_ids)):
+                         if j in chose_indices:
+                             _tmp.append(_sl[j][i % len(_sl[j])])
+                         else:
+                             _tmp.append(input_ids[j])
+                     _new_ids.append(_tmp)
+                 # st.markdown(_new_ids)
+                 outputs = [tokenizer.decode(_new_ids[i]).split('</s></s>')[0] for i in range(restarts)]
+
+             st.success(f'We found {restarts} replacements!', icon="✅")
+             st.markdown('<br>'.join(outputs), unsafe_allow_html=True)
+         else:
+             st.error('Please choose at least one subword.')
+
+
+
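Note: the run() function above searches for replacements by relaxing each selected token position into a distribution over GPT-2's vocabulary (its one-hot vector plus trainable noise, renormalized), building soft embeddings from that distribution, and gradient-descending on the MSE between the perturbed and original last hidden states; the final distribution is then discretized with argmax. A stripped-down, single-position sketch of the same idea without the Streamlit plumbing (all variable names here are local to the sketch, not part of the commit):

import torch
from transformers import GPT2Tokenizer, GPT2Model

tok = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2Model.from_pretrained('gpt2').eval()
for p in gpt2.parameters():
    p.requires_grad_(False)  # only the noise is optimized

enc = tok('Which name is also used to describe the Amazon rainforest?', return_tensors='pt')
pos = 1  # index of the (sub)word to replace
target = gpt2(**enc)['last_hidden_state'].detach()
vocab = gpt2.wte.weight.shape[0]
one_hot = torch.nn.functional.one_hot(enc['input_ids'], num_classes=vocab).float()

noise = torch.randn(1, vocab, requires_grad=True)
for _ in range(100):
    dist = one_hot[:, pos] + noise
    dist = dist / dist.sum(-1, keepdim=True)  # renormalized relaxed one-hot
    emb = gpt2.wte(enc['input_ids']).detach().clone()
    emb[:, pos] = dist @ gpt2.wte.weight  # soft embedding for the chosen position
    out = gpt2(inputs_embeds=emb, attention_mask=enc['attention_mask'])['last_hidden_state']
    loss = torch.nn.functional.mse_loss(out, target)
    loss.backward()
    noise.data -= 1.0 * noise.grad  # lr = 1, as in run()
    noise.grad.zero_()

new_id = int((one_hot[:, pos] + noise).argmax(-1))  # discretize back to a token id
ids = enc['input_ids'][0].tolist()
print(tok.decode(ids[:pos] + [new_id] + ids[pos + 1:]))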