File size: 14,525 Bytes
701f8ff
 
cba28b4
701f8ff
 
 
5a5f4eb
701f8ff
 
 
 
 
7ef98e6
701f8ff
 
 
 
 
 
 
6609588
701f8ff
 
 
 
 
 
8e1b917
d8663a9
e735e42
 
701f8ff
75d0a70
 
 
 
7ef98e6
701f8ff
5fb4442
38b6dfd
5fb4442
 
38b6dfd
621caef
7ef98e6
5fb4442
38b6dfd
5fb4442
 
 
701f8ff
 
 
 
 
 
 
 
 
 
e735e42
5a5f4eb
 
 
1ff8db6
5a5f4eb
 
701f8ff
 
 
 
 
265c345
701f8ff
 
e735e42
 
701f8ff
5a5f4eb
7ef98e6
701f8ff
63e2daf
701f8ff
 
6609588
5a5f4eb
 
 
6609588
 
5a5f4eb
 
701f8ff
 
 
 
 
 
 
e735e42
 
75d0a70
701f8ff
 
 
 
 
 
e735e42
701f8ff
 
b00878a
63e2daf
b00878a
 
 
6609588
30c7c90
 
 
 
 
 
 
 
 
 
 
 
e735e42
 
701f8ff
 
75d0a70
701f8ff
30c7c90
3e701f9
30c7c90
5a5f4eb
 
6609588
5a5f4eb
 
701f8ff
3e701f9
701f8ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63e2daf
701f8ff
63e2daf
701f8ff
63e2daf
701f8ff
 
 
 
 
 
6609588
701f8ff
 
 
 
 
 
 
 
 
 
 
 
63e2daf
1ff8db6
701f8ff
1ff8db6
 
 
 
 
 
 
 
701f8ff
 
 
 
e735e42
 
63e2daf
e735e42
 
63e2daf
 
701f8ff
 
 
 
 
 
 
 
 
 
 
 
265c345
701f8ff
 
 
 
 
 
 
265c345
701f8ff
30c7c90
701f8ff
63e2daf
701f8ff
63e2daf
265c345
63e2daf
701f8ff
63e2daf
701f8ff
 
e735e42
 
701f8ff
 
 
 
6609588
701f8ff
 
 
 
 
 
 
 
 
 
 
 
75d0a70
30c7c90
 
 
701f8ff
6609588
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
import streamlit as st
import pandas as pd
# import gdown
import os
import pickle
from collections import defaultdict, Counter
from streamlit_extras.colored_header import colored_header

from PyDictionary import PyDictionary

dictionary = PyDictionary()

# Page-wide settings: wide layout and the browser-tab title.
st.set_page_config(
    page_title="Obstinate Adversarial Examples",
    layout="wide",
)

# CSS snippet passed to st.markdown(..., unsafe_allow_html=True) by the
# rendering helpers; it sets `border-style: none` on the `.st-bd` class.
hide_expander_border = (
    "\n"
    "<style>\n"
    ".st-bd {border-style: none;}\n"
    "</style>\n"
)

# st.title("Obstinate Adversarial Examples of LMs")

# Sidebar: heading plus the search box that decides which word the page shows.
st.sidebar.header("📙 Dictionary")

# Pick the initial search term. Precedence, lowest to highest: the built-in
# default, the `word` URL query parameter, a `click_word` value found in
# session state.
_data = st.experimental_get_query_params()
default_title = 'Asian'
if _data and 'word' in _data:
    default_title = _data['word'][0]
if 'click_word' in st.session_state:
    default_title = st.session_state.click_word

# Drop every session-state entry except `click_word`. The keys are copied into
# a list first so that deleting entries cannot disturb the iteration,
# regardless of how the key view is implemented.
for key in list(st.session_state.keys()):
    if key != 'click_word':
        del st.session_state[key]

title = st.sidebar.text_input(":red[Search obstinate subsituitions given the following word (case-sensitive)]", default_title)

# Static FAQ shown in the sidebar. Each entry is a single markdown paragraph:
# the numbered question, an HTML line break, then the answer.
st.sidebar.markdown("### Frequent FAQs")
_faq_entries = (
    (
        "1. :blue[*Why are words in sentences represented as subwords instead of complete words?*]<br>"
        "The tokenizer we use is from DistillBERT, ALBERT, or Roberta, which tokenizes sentences into subwords. As a result, the word being replaced in a sentence might be a subword (such as `rain` in `rainforest`)."
    ),
    (
        "2. :blue[*This page is extremely slow. I cannot stand it.*]<br>"
        "We apologize for the slow performance of this page. We are actively working on improving it. "
        "As loading the data can take time and some words have many obstinate subsituitions, this page needs time to process."
    ),
    (
        "3. :blue[*Why are some examples significantly different from the original sentences?*] <br>"
        "As per our submission, we replace 1 to 10 subwords in a sentence. However, for some examples with short lengths, the entire sentence may be altered. We are conducting experiments and will present examples where only a single subword has been changed."
    ),
)
for _faq_entry in _faq_entries:
    # unsafe_allow_html is needed for the <br> tags embedded in the entries.
    st.sidebar.markdown(_faq_entry, unsafe_allow_html=True)

# Choose the pickle file that holds the searched word. Files are selected by
# the code point of the word's first character.
# An empty search box has no first character; it is routed to the catch-all
# file (code -1 matches no range) instead of raising IndexError on title[0].
_first_code = ord(title[0]) if title else -1

# NOTE(review): the upper bounds are exclusive, so '9' (57), 'z' (122) and
# 'Z' (90) fall through to 'other_dict.pkl'. The bounds are kept unchanged
# because the data files were presumably bucketed the same way -- confirm
# against the script that generated them before widening the ranges.
if 48 <= _first_code < 57:
    file_name = 'num_dict.pkl'
elif 97 <= _first_code < 122 or 65 <= _first_code < 90:
    file_name = f'{_first_code}_dict.pkl'
else:
    file_name = 'other_dict.pkl'

# NOTE(review): pickle can execute arbitrary code while loading; this is only
# acceptable as long as the files in this directory are trusted.
# The context manager closes the file handle once the data is loaded.
with open(f'all_secret_langauge_by_fist/{file_name}', 'rb') as _data_file:
    datas = pickle.load(_data_file)
if title in datas:
    st.title(title)
    colored_header(
        label=f"{title}'s meaning in English[¹](#jump)",
        description="**English meaning is supported by [PyDictionary](https://pypi.org/project/PyDictionary/)*",
        color_name="violet-70",
    )
    # Look the word up with PyDictionary and show one line per key of the
    # returned mapping, its entries joined with "; ".
    # Only the lookup and the formatting are guarded: any failure there
    # (including a result that cannot be iterated as a mapping of string
    # lists) ends in the error box below. `except Exception` is used instead
    # of a bare `except` so that KeyboardInterrupt, SystemExit and other
    # BaseException-based signals are not swallowed.
    try:
        title_mean = dictionary.meaning(title)
        _string = '>' + ''.join(
            f':violet[{key}]: {"; ".join(title_mean[key])}<br>'
            for key in title_mean
        )
    except Exception:
        st.error(f'We cannot find the meaning of {title} in English (PyDictionary), which might be due to the bug.',
                 icon="🚨")
    else:
        st.markdown(_string, unsafe_allow_html=True)

    # st.markdown(f"---")
    # st.markdown(f"## {title}'s obstinate subsituitions")
    # Everything stored for the searched word; indexed below by column name
    # ("secret languages", "tasks", ...) and then by row position.
    data_title = datas[title]
    # Distinct substitutions over all tasks, sorted.
    title_secret_languages = sorted(set(data_title["secret languages"]))
    colored_header(
        color_name="red-70",
        label=f"{title}'s obstinate substitutions",
        description=f'Overall, we found :red[{len(title_secret_languages)}] obstinate adversarial substitutions of :blue[{title}].',
    )
    # A literal double-quote character under a module-level name.
    special = '"'

    # Bucket the substitutions by the task of their row, then remove
    # duplicates inside every bucket.
    secret_language_by_task = {task_name: [] for task_name in ('QA', 'NLI', 'Paraphrase')}
    for row, substitution in enumerate(data_title['secret languages']):
        secret_language_by_task[data_title['tasks'][row]].append(substitution)
    secret_language_by_task = {
        task_name: list(set(found))
        for task_name, found in secret_language_by_task.items()
    }


    def present_sl_task(secret_language_by_task, task):
        """Show one task's substitutions as a collapsible list of anchor links.

        Args:
            secret_language_by_task: Mapping from task name to the
                substitution strings found for that task.
            task: Key into ``secret_language_by_task``; also appended to
                every link target as ``_<task>``.

        Empty-string substitutions are skipped. Each remaining one is shown
        with newlines written as ``/n`` and surrounding whitespace removed,
        and the links are ordered by that displayed text. The expander label
        additionally reads the module-level ``title``.
        """
        quote = '"'
        links = []
        for raw in secret_language_by_task[task]:
            if raw == '':
                continue
            shown = raw.replace("\n", "/n").strip()
            # Build the fragment from the raw value with the same steps, in
            # the same order, that present_dict uses for its <span id=...>
            # (strip first, newline replacement last). Stripping only after
            # the newline replacement produced a different target for
            # substitutions that start or end with a newline.
            anchor = (
                raw.strip()
                .replace("(", ",,")
                .replace(")", "..")
                .replace("[", ",,,")
                .replace("]", "...")
                .replace(quote, "././")
                .replace("\n", "/n")
            )
            links.append((shown, f'<a href="#{anchor}_{task}">{shown}</a>'))
        links.sort(key=lambda pair: pair[0])

        label = f'***{len(links)}*** obstinate substitutions of ***{title}*** on {task.replace("paraphrase", "Paraphrase")}'
        with st.expander(label):
            joined = ', '.join(html for _, html in links)
            # Backslash-prefix "<s>", "$" and "~" in the output. The
            # backslashes are doubled in the source because "\<", "\$" and
            # "\~" are not valid Python escape sequences.
            st.markdown(
                joined.replace('<s>', '\\<s\\>').replace('$', '\\$').replace('~', '\\~'),
                unsafe_allow_html=True)


    # One collapsible link list per task.
    present_sl_task(secret_language_by_task, 'NLI')
    present_sl_task(secret_language_by_task, 'QA')
    present_sl_task(secret_language_by_task, 'Paraphrase')

    # Both captions start with a literal backslash followed by "**". It is
    # spelled "\\" because "\*" is not a valid Python escape sequence; the
    # resulting text is unchanged.
    st.caption("\\**Hyperlinks only function when the corresponding tab is open. "
               "For example, the hyperlinks in the paraphrase section will only work when the paraphrase tab is open. However, due to the container property of Hugging Face Space, the hyperlinks might be not able to function.*")
    st.caption('\\**Due to the grammatical properties of HTML, the layout of this page may vary.*')
    colored_header(
        label="Examples of replaced sentences",
        description='**The number following the tasks represents the number of examples found for a particular task, which may be different from the number of obstinate adversarial substitutions.*',
        color_name="orange-70",
    )
    # Number of rows per task, shown in the tab captions.
    _num = Counter(data_title['tasks'])
    tab1, tab2, tab3 = st.tabs([f'NLI ({_num["NLI"]})', f'QA ({_num["QA"]})', f'Paraphrase ({_num["Paraphrase"]})'])


    def present_dataframe(dataframe, key, title):
        """Render the rows of ``dataframe`` for one task, one expander per row.

        Args:
            dataframe: pandas DataFrame with the columns ``tasks``,
                ``secret languages``, ``original sentences``,
                ``replaced sentences`` and ``premise / sentence 2 / text``.
            key: Task name to keep (rows where ``tasks == key``). ``'NLI'``,
                ``'QA'`` and ``'Paraphrase'`` select the field captions; any
                other value renders the fields without captions.
            title: Searched word; only used in the message shown when no row
                matches.
        """
        rows = dataframe.loc[dataframe['tasks'] == key].reset_index()
        rows['replaced sentences'] = rows['replaced sentences'].str.replace('<s>', '[POS]')
        if len(rows) == 0:
            st.error(f'We did not find any obstinate substituition of {title} on {key}.')
            return
        rows = rows.drop(columns=['tasks', 'index'])

        # Captions that differ per task: the one shared by the original and
        # replaced sentence, and the one for the accompanying text. Each ends
        # with the ':' that opens the colour directive that follows it.
        sentence_label, context_label = {
            'NLI': ('hypothesis: :', '<br>Premise: :'),
            'QA': ('question: :', '<br>Text: :'),
            'Paraphrase': ('sentence 1: :', '<br>Sentence 2: :'),
        }.get(key, ('', ''))

        def coloured(colour, text):
            # Wrap text as colour[text]; colons in it are written as "[colon]".
            return f'{colour}[{text}]'.replace(":", "[colon]")

        for i in range(len(rows)):
            # "\\." yields a literal backslash before the dot; "\." is not a
            # valid Python escape sequence.
            _title = f'{i + 1}\\. **[{rows.loc[i, "secret languages"]}]**'
            with st.expander(_title):
                _string = ''.join([
                    'Original ', sentence_label,
                    coloured('blue', rows.loc[i, "original sentences"]),
                    '<br>Replaced ', sentence_label,
                    coloured('red', rows.loc[i, "replaced sentences"]),
                    context_label,
                    coloured('blue', rows.loc[i, "premise / sentence 2 / text"]),
                ])
                st.markdown(_string, unsafe_allow_html=True)
        st.markdown(hide_expander_border, unsafe_allow_html=True)


    def present_dict(_dict, task):
        """Render every substitution recorded for ``task`` with its examples.

        Args:
            _dict: Column-oriented data for the searched word. The keys read
                here are ``secret languages``, ``tasks``,
                ``original sentences``, ``replaced sentences``,
                ``premise / sentence 2 / text``, ``output_ori`` and
                ``output_rep``; each is indexed by row position.
            task: Task name to show. ``'NLI'``, ``'QA'`` and ``'Paraphrase'``
                select the field captions; any other value renders the fields
                without captions.

        Every substitution that has at least one row for ``task`` gets a
        numbered heading wrapped in a ``<span id="..._<task>">`` element. A
        row is listed as an example only when its ``output_ori`` equals its
        ``output_rep``. If no row belongs to ``task``, an error box that
        mentions the module-level ``title`` is shown instead.
        """
        quote = '"'

        # Captions that differ per task: the one shared by the original and
        # replaced sentence, and the one for the accompanying text. Each ends
        # with the ':' that opens the colour directive that follows it.
        sentence_label, context_label = {
            'NLI': ('hypothesis**: :', '<br> **Premise**: :'),
            'QA': ('question**: :', '<br> **Text**: :'),
            'Paraphrase': ('sentence 1**: :', '<br> **Sentence 2**: :'),
        }.get(task, ('', ''))

        def coloured(colour, text):
            # Wrap text as colour[text]; colons in it are written as "[colon]".
            return f'{colour}[{text}]'.replace(":", "[colon]")

        # substitution -> list of (original, replaced, context, prediction).
        grouped = {}
        for i, _sl in enumerate(_dict['secret languages']):
            if _dict['tasks'][i] != task:
                continue
            # Register the substitution even when the row below is skipped,
            # so that it still gets a heading.
            examples = grouped.setdefault(_sl, [])
            if _dict['output_ori'][i] != _dict['output_rep'][i]:
                continue
            replaced = _dict['replaced sentences'][i]
            if task == 'QA':
                replaced = replaced.replace('<s>', '')
            else:
                replaced = replaced.replace('[CLS]', '', 1)
            examples.append((
                _dict['original sentences'][i],
                replaced,
                _dict['premise / sentence 2 / text'][i],
                _dict['output_ori'][i],
            ))

        if not grouped:
            st.error(f'We did not find any obstinate substituition of {title} on {task}.', icon="⚠️")
            return

        for position, _sl in enumerate(sorted(grouped), start=1):
            _sl_in_span = (
                _sl.strip()
                .replace("(", ",,")
                .replace(")", "..")
                .replace("[", ",,,")
                .replace("]", "...")
                .replace(quote, "././")
                .replace('\n', '/n')
            )
            shown = _sl.replace("\n", "/n")
            parts = [
                f'{position}. <span id="{_sl_in_span}_{task}"> **:red[{shown}]**</span>',
                '<br>Examples:<br>',
                '<blockquote><ol>',
            ]
            for original, replaced, context, prediction in grouped[_sl]:
                marked = replaced.replace('/', '\\')
                # Highlight the substitution inside the sentence only. Doing
                # this after wrapping in red[...] could also rewrite the
                # wrapper itself, and an empty substitution would insert the
                # tags between every character.
                if _sl:
                    marked = marked.replace(_sl, f"<i><b>{shown}</b></i>")
                parts.extend([
                    '<li> **Original ', sentence_label, coloured('blue', original),
                    '<br> **Replaced ', sentence_label, coloured('red', marked),
                    context_label, coloured('blue', context),
                    "<br>**Model's prediction:** :", coloured('blue', prediction),
                    '<br></li>',
                ])
            parts.append('</ol></blockquote>')
            # Backslash-prefix "<s>", "$" and "~" in the output. The
            # backslashes are doubled in the source because "\<", "\$" and
            # "\~" are not valid Python escape sequences.
            st.markdown(
                ''.join(parts).replace('<s>', '\\<s\\>').replace('$', '\\$').replace('~', '\\~'),
                unsafe_allow_html=True)
        st.markdown(hide_expander_border, unsafe_allow_html=True)


    # Fill the three tabs in order: NLI, QA, Paraphrase.
    for _tab, _tab_task in zip((tab1, tab2, tab3), ('NLI', 'QA', 'Paraphrase')):
        with _tab:
            present_dict(data_title, _tab_task)
    # st.markdown(
    #     f'<span id="jump">¹</span>*Enlish meaning is supported by [PyDictionary](https://pypi.org/project/PyDictionary/).*',
    #     unsafe_allow_html=True)
else:
    # The searched word is not a key of the loaded pickle data: report that
    # instead of rendering the word page.
    st.error(f'{title} is not in the dictionary of obstinate substituitions.', icon="⚠️")