tybrs committed
Commit 2f9157a
1 Parent(s): 2d2a68f

Update Space (evaluate main: e5933120)

Files changed (3)
  1. README.md +59 -1
  2. adversarial_glue.py +202 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -8,4 +8,62 @@ pinned: false
  license: apache-2.0
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Adversarial GLUE Evaluation Suite
+
+ ## Description
+
+ This evaluation suite compares GLUE results with Adversarial GLUE (AdvGLUE), a multi-task benchmark that evaluates the robustness of modern large-scale language models against various types of adversarial attacks.
+
+ ## How to use
+
+ This suite requires installation of the following fork of `evaluate`: [IntelAI/evaluate](https://github.com/IntelAI/evaluate/tree/develop).
+
+ After installation, there are two steps: (1) loading the Adversarial GLUE suite; and (2) calculating the metric.
+
+ 1. **Loading the relevant GLUE metric**: this suite loads evaluation subtasks for the following tasks on both the AdvGLUE and GLUE datasets: `sst2`, `mnli`, `qnli`, `rte`, and `qqp`.
+
+ More information about the different subsets of the GLUE dataset can be found on the [GLUE dataset page](https://huggingface.co/datasets/glue).
+
+ 2. **Calculating the metric**: the metric takes one input, the name of the model or pipeline to evaluate.
+
+ ```python
+ from evaluate import EvaluationSuite
+
+ suite = EvaluationSuite.load('intel/adversarial_glue')
+ mc_results = suite.run("gpt2")
+ ```
+
+ ## Output results
+
+ The output depends on the GLUE subset chosen: each subtask returns a dictionary containing one or several of the following metrics:
+
+ `accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
+
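As an aside (not part of the committed README): assuming `suite.run` returns one result dictionary per subtask, carrying the `task_name` and `accuracy` keys that the suite's `process_results` method reads, the scores from the usage example above could be inspected with a sketch like this.

```python
# Sketch only: print the accuracy reported for each GLUE/AdvGLUE subtask.
# Assumes `mc_results` is the list returned by suite.run("gpt2") above and
# that each entry carries "task_name" and "accuracy" keys.
for result in mc_results:
    name = result["task_name"].split("/")[-1]  # e.g. "sst2" or "adv_sst2"
    print(f"{name}: accuracy = {result['accuracy']:.3f}")
```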
+ ### Values from popular papers
+
+ The [original GLUE paper](https://huggingface.co/datasets/glue) reported average scores ranging from 58% to 64%, depending on the model used (with all evaluation values scaled by 100 to make computing the average possible).
+
+ For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/glue).
+
+ ## Examples
+
+ For a full example, see [HF Evaluate Adversarial Attacks.ipynb](https://github.com/IntelAI/evaluate/blob/develop/notebooks/HF%20Evaluate%20Adversarial%20Attacks.ipynb).
+
+ ## Limitations and bias
+ This metric works only with datasets that have the same format as the [GLUE dataset](https://huggingface.co/datasets/glue).
+
+ While the GLUE dataset is meant to represent "General Language Understanding", the tasks represented in it are not necessarily representative of language understanding, and should not be interpreted as such.
+
+ ## Citation
+
+ ```bibtex
+ @inproceedings{wang2021adversarial,
+   title={Adversarial GLUE: A Multi-Task Benchmark for Robustness Evaluation of Language Models},
+   author={Wang, Boxin and Xu, Chejian and Wang, Shuohang and Gan, Zhe and Cheng, Yu and Gao, Jianfeng and Awadallah, Ahmed Hassan and Li, Bo},
+   booktitle={Advances in Neural Information Processing Systems},
+   year={2021}
+ }
+ ```
+
adversarial_glue.py ADDED
@@ -0,0 +1,202 @@
+ from evaluate.evaluation_suite import SubTask
+ from evaluate.visualization import radar_plot
+
+ from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
+
+ _HEADER = "GLUE/AdvGlue Evaluation Results"
+
+ _DESCRIPTION = """
+ The suite compares the GLUE results with Adversarial GLUE (AdvGLUE), a
+ multi-task benchmark that tests the vulnerability of modern large-scale
+ language models against various adversarial attacks."""
+
+
+ class Suite(ModelCardSuiteResults):
+     def __init__(self, name):
+         super().__init__(name)
+         self.result_keys = ["accuracy", "f1"]
+         self.preprocessor = lambda x: {"text": x["text"].lower()}
+         self.suite = [
+             SubTask(
+                 task_type="text-classification",
+                 data="glue",
+                 subset="sst2",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "sentence",
+                     "label_column": "label",
+                     "config_name": "sst2",
+                     "label_mapping": {
+                         "LABEL_0": 0.0,
+                         "LABEL_1": 1.0
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="adv_glue",
+                 subset="adv_sst2",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "sentence",
+                     "label_column": "label",
+                     "config_name": "sst2",
+                     "label_mapping": {
+                         "LABEL_0": 0.0,
+                         "LABEL_1": 1.0
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="glue",
+                 subset="qqp",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "question1",
+                     "second_input_column": "question2",
+                     "label_column": "label",
+                     "config_name": "qqp",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="adv_glue",
+                 subset="adv_qqp",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "question1",
+                     "second_input_column": "question2",
+                     "label_column": "label",
+                     "config_name": "qqp",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="glue",
+                 subset="qnli",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "question",
+                     "second_input_column": "sentence",
+                     "label_column": "label",
+                     "config_name": "qnli",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="adv_glue",
+                 subset="adv_qnli",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "question",
+                     "second_input_column": "sentence",
+                     "label_column": "label",
+                     "config_name": "qnli",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="glue",
+                 subset="rte",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "sentence1",
+                     "second_input_column": "sentence2",
+                     "label_column": "label",
+                     "config_name": "rte",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="adv_glue",
+                 subset="adv_rte",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "sentence1",
+                     "second_input_column": "sentence2",
+                     "label_column": "label",
+                     "config_name": "rte",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="glue",
+                 subset="mnli",
+                 split="validation_mismatched[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "premise",
+                     "second_input_column": "hypothesis",
+                     "config_name": "mnli",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1,
+                         "LABEL_2": 2
+                     }
+                 }
+             ),
+             SubTask(
+                 task_type="text-classification",
+                 data="adv_glue",
+                 subset="adv_mnli",
+                 split="validation[:5]",
+                 args_for_task={
+                     "metric": "glue",
+                     "input_column": "premise",
+                     "second_input_column": "hypothesis",
+                     "config_name": "mnli",
+                     "label_mapping": {
+                         "LABEL_0": 0,
+                         "LABEL_1": 1,
+                         "LABEL_2": 2
+                     }
+                 }
+             ),
+         ]
+
+     def process_results(self, results):
+         radar_data = [
+             {"accuracy " + result["task_name"].split("/")[-1]:
+                 result["accuracy"] for result in results[::2]},
+             {"accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]:
+                 result["accuracy"] for result in results[1::2]}]
+         return radar_data
+
+     def plot_results(self, results, model_or_pipeline):
+         radar_data = self.process_results(results)
+         graphic = radar_plot(radar_data, ['GLUE ' + model_or_pipeline, 'AdvGLUE ' + model_or_pipeline])
+         return graphic
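A note on the pairing logic above: `process_results` assumes the GLUE and AdvGLUE results alternate in the order the subtasks are declared, pairing `results[::2]` with `results[1::2]`. Under that same assumption (an assumption about the fork's return format, not something stated in this commit), a per-task robustness gap could be derived with a sketch like the following; the `robustness_gaps` helper is illustrative and not part of the committed file.

```python
# Illustrative sketch, not part of this commit: compute how much accuracy
# drops from each GLUE subtask to its AdvGLUE counterpart, assuming the
# results list alternates GLUE/AdvGLUE entries and each entry carries
# "task_name" and "accuracy" keys, as process_results above expects.
def robustness_gaps(results):
    gaps = {}
    for glue_res, adv_res in zip(results[::2], results[1::2]):
        task = adv_res["task_name"].replace("adv_", "").split("/")[-1]
        gaps[task] = glue_res["accuracy"] - adv_res["accuracy"]
    return gaps
```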
requirements.txt ADDED
@@ -0,0 +1 @@
+ git+https://github.com/IntelAI/evaluate@develop