tybrs committed
Commit: b9e00cb
1 Parent(s): 4b8cc84

Update Space (evaluate main: 1a12c674)

Files changed (2)
  1. README.md +3 -4
  2. adversarial_glue.py +30 -60
README.md CHANGED
@@ -38,7 +38,7 @@ mc_results, = suite.run("gpt2")
 
  The output of the metric depends on the GLUE subset chosen, consisting of a dictionary that contains one or several of the following metrics:
 
- `accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
+ `accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
 
 
  ### Values from popular papers
@@ -47,14 +47,14 @@ The [original GLUE paper](https://huggingface.co/datasets/glue) reported average
 
  For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/glue).
 
- ## Examples
+ ## Examples
 
  For full example see [HF Evaluate Adversarial Attacks.ipynb](https://github.com/IntelAI/evaluate/blob/develop/notebooks/HF%20Evaluate%20Adversarial%20Attacks.ipynb)
 
  ## Limitations and bias
  This metric works only with datasets that have the same format as the [GLUE dataset](https://huggingface.co/datasets/glue).
 
- While the GLUE dataset is meant to represent "General Language Understanding", the tasks represented in it are not necessarily representative of language understanding, and should not be interpreted as such.
+ While the GLUE dataset is meant to represent "General Language Understanding", the tasks represented in it are not necessarily representative of language understanding, and should not be interpreted as such.
 
  ## Citation
 
@@ -66,4 +66,3 @@ While the GLUE dataset is meant to represent "General Language Understanding", t
   year={2021}
  }
  ```
-
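For context, the usage line quoted in the first hunk header (`mc_results, = suite.run("gpt2")`) comes from the README's example. A minimal, hedged sketch of how such a suite is typically loaded and run with the `evaluate` library is shown below; the Space path is a placeholder, not this repository's actual id.

```python
# Sketch only: the Space id below is a placeholder; substitute the real
# repository path of this Space when loading.
from evaluate import EvaluationSuite

suite = EvaluationSuite.load("user-or-org/adversarial-glue-space")  # placeholder id
mc_results, = suite.run("gpt2")  # unpacking follows the README's usage example
```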
adversarial_glue.py CHANGED
@@ -1,7 +1,8 @@
+ from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
+
  from evaluate.evaluation_suite import SubTask
  from evaluate.visualization import radar_plot
 
- from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
 
  _HEADER = "GLUE/AdvGlue Evaluation Results"
 
@@ -27,11 +28,8 @@ class Suite(ModelCardSuiteResults):
  "input_column": "sentence",
  "label_column": "label",
  "config_name": "sst2",
- "label_mapping": {
- "LABEL_0": 0.0,
- "LABEL_1": 1.0
- }
- }
+ "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -43,29 +41,22 @@ class Suite(ModelCardSuiteResults):
  "input_column": "sentence",
  "label_column": "label",
  "config_name": "sst2",
- "label_mapping": {
- "LABEL_0": 0.0,
- "LABEL_1": 1.0
- }
- }
+ "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
+ },
  ),
  SubTask(
  task_type="text-classification",
  data="glue",
  subset="qqp",
  split="validation[:5]",
-
  args_for_task={
  "metric": "glue",
  "input_column": "question1",
  "second_input_column": "question2",
  "label_column": "label",
  "config_name": "qqp",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -78,11 +69,8 @@ class Suite(ModelCardSuiteResults):
  "second_input_column": "question2",
  "label_column": "label",
  "config_name": "qqp",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -95,11 +83,8 @@ class Suite(ModelCardSuiteResults):
  "second_input_column": "sentence",
  "label_column": "label",
  "config_name": "qnli",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -112,11 +97,8 @@ class Suite(ModelCardSuiteResults):
  "second_input_column": "sentence",
  "label_column": "label",
  "config_name": "qnli",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -129,11 +111,8 @@ class Suite(ModelCardSuiteResults):
  "second_input_column": "sentence2",
  "label_column": "label",
  "config_name": "rte",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -146,11 +125,8 @@ class Suite(ModelCardSuiteResults):
  "second_input_column": "sentence2",
  "label_column": "label",
  "config_name": "rte",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -162,12 +138,8 @@ class Suite(ModelCardSuiteResults):
  "input_column": "premise",
  "second_input_column": "hypothesis",
  "config_name": "mnli",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1,
- "LABEL_2": 2
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
+ },
  ),
  SubTask(
  task_type="text-classification",
@@ -179,24 +151,22 @@ class Suite(ModelCardSuiteResults):
  "input_column": "premise",
  "second_input_column": "hypothesis",
  "config_name": "mnli",
- "label_mapping": {
- "LABEL_0": 0,
- "LABEL_1": 1,
- "LABEL_2": 2
- }
- }
+ "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
+ },
  ),
  ]
 
  def process_results(self, results):
  radar_data = [
- {"accuracy " + result["task_name"].split("/")[-1]:
- result["accuracy"] for result in results[::2]},
- {"accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]:
- result["accuracy"] for result in results[1::2]}]
- return radar_plot(radar_data, ['GLUE', 'AdvGLUE'])
+ {"accuracy " + result["task_name"].split("/")[-1]: result["accuracy"] for result in results[::2]},
+ {
+ "accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]: result["accuracy"]
+ for result in results[1::2]
+ },
+ ]
+ return radar_plot(radar_data, ["GLUE", "AdvGLUE"])
 
  def plot_results(self, results, model_or_pipeline):
  radar_data = self.process_results(results)
- graphic = radar_plot(radar_data, ['GLUE ' + model_or_pipeline, 'AdvGLUE ' + model_or_pipeline])
+ graphic = radar_plot(radar_data, ["GLUE " + model_or_pipeline, "AdvGLUE " + model_or_pipeline])
  return graphic
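The reformatted `process_results` keeps the original pairing logic: as the `adv_` stripping and the label list `["GLUE", "AdvGLUE"]` suggest, clean-GLUE results sit at even indices and their adversarial counterparts at odd indices, so each pair ends up sharing a radar axis. A minimal sketch of that pairing, using hypothetical task names and scores (not values from this repository):

```python
# Hypothetical inputs: one paired set of subtask results; names and numbers
# are illustrative only.
results = [
    {"task_name": "glue/sst2", "accuracy": 0.91},          # clean GLUE subtask
    {"task_name": "adv_glue/adv_sst2", "accuracy": 0.48},  # adversarial counterpart
]

# Same pairing logic as process_results: even indices -> GLUE series,
# odd indices -> AdvGLUE series with the "adv_" prefix stripped.
radar_data = [
    {"accuracy " + r["task_name"].split("/")[-1]: r["accuracy"] for r in results[::2]},
    {
        "accuracy " + r["task_name"].replace("adv_", "").split("/")[-1]: r["accuracy"]
        for r in results[1::2]
    },
]

print(radar_data)  # [{'accuracy sst2': 0.91}, {'accuracy sst2': 0.48}]
```

Because both dictionaries end up with identical keys, `radar_plot(radar_data, ["GLUE", "AdvGLUE"])` can overlay the clean and adversarial accuracies on shared axes.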