Spaces:

Intel
/

adversarial_glue

Running

App Files Community

tybrs committed on Dec 15, 2023

Commit

4b8cc84

•

1 Parent(s): 0b196bf

Update Space (evaluate main: 06fba808)

Browse files

Files changed (2) hide show

README.md +4 -3
adversarial_glue.py +60 -30

README.md CHANGED Viewed

@@ -38,7 +38,7 @@ mc_results,  = suite.run("gpt2")
 The output of the metric depends on the GLUE subset chosen, consisting of a dictionary that contains one or several of the following metrics:
-`accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
 ### Values from popular papers
@@ -47,14 +47,14 @@ The [original GLUE paper](https://huggingface.co/datasets/glue) reported average
 For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/glue).
-## Examples
 For a full example, see [HF Evaluate Adversarial Attacks.ipynb](https://github.com/IntelAI/evaluate/blob/develop/notebooks/HF%20Evaluate%20Adversarial%20Attacks.ipynb)
 ## Limitations and bias
 This metric works only with datasets that have the same format as the [GLUE dataset](https://huggingface.co/datasets/glue).
-While the GLUE dataset is meant to represent "General Language Understanding", the tasks represented in it are not necessarily representative of language understanding, and should not be interpreted as such.
 ## Citation
@@ -66,3 +66,4 @@ While the GLUE dataset is meant to represent "General Language Understanding", t
   year={2021}
 }
 ```

 The output of the metric depends on the GLUE subset chosen, consisting of a dictionary that contains one or several of the following metrics:
+`accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
 ### Values from popular papers
 For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/glue).
+## Examples
 For a full example, see [HF Evaluate Adversarial Attacks.ipynb](https://github.com/IntelAI/evaluate/blob/develop/notebooks/HF%20Evaluate%20Adversarial%20Attacks.ipynb)
 ## Limitations and bias
 This metric works only with datasets that have the same format as the [GLUE dataset](https://huggingface.co/datasets/glue).
+While the GLUE dataset is meant to represent "General Language Understanding", the tasks represented in it are not necessarily representative of language understanding, and should not be interpreted as such.
 ## Citation
   year={2021}
 }
 ```

adversarial_glue.py CHANGED Viewed

@@ -1,8 +1,7 @@
-from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
 from evaluate.evaluation_suite import SubTask
 from evaluate.visualization import radar_plot
 _HEADER = "GLUE/AdvGlue Evaluation Results"
@@ -28,8 +27,11 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
-                    "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -41,22 +43,29 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
-                    "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
-                },
             ),
             SubTask(
                 task_type="text-classification",
                 data="glue",
                 subset="qqp",
                 split="validation[:5]",
                 args_for_task={
                     "metric": "glue",
                     "input_column": "question1",
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -69,8 +78,11 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -83,8 +95,11 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -97,8 +112,11 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -111,8 +129,11 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -125,8 +146,11 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -138,8 +162,12 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
-                },
             ),
             SubTask(
                 task_type="text-classification",
@@ -151,22 +179,24 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
-                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
-                },
             ),
         ]
     def process_results(self, results):
         radar_data = [
-            {"accuracy " + result["task_name"].split("/")[-1]: result["accuracy"] for result in results[::2]},
-            {
-                "accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]: result["accuracy"]
-                for result in results[1::2]
-            },
-        ]
-        return radar_plot(radar_data, ["GLUE", "AdvGLUE"])
     def plot_results(self, results, model_or_pipeline):
         radar_data = self.process_results(results)
-        graphic = radar_plot(radar_data, ["GLUE " + model_or_pipeline, "AdvGLUE " + model_or_pipeline])
         return graphic

 from evaluate.evaluation_suite import SubTask
 from evaluate.visualization import radar_plot
+from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
 _HEADER = "GLUE/AdvGlue Evaluation Results"
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
+                    "label_mapping": {
+                        "LABEL_0": 0.0,
+                        "LABEL_1": 1.0
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
+                    "label_mapping": {
+                        "LABEL_0": 0.0,
+                        "LABEL_1": 1.0
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                 data="glue",
                 subset="qqp",
                 split="validation[:5]",
                 args_for_task={
                     "metric": "glue",
                     "input_column": "question1",
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1,
+                        "LABEL_2": 2
+                    }
+                }
             ),
             SubTask(
                 task_type="text-classification",
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1,
+                        "LABEL_2": 2
+                    }
+                }
             ),
         ]
     def process_results(self, results):
         radar_data = [
+            {"accuracy " + result["task_name"].split("/")[-1]:
+             result["accuracy"] for result in results[::2]},
+            {"accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]:
+             result["accuracy"] for result in results[1::2]}]
+        return radar_plot(radar_data, ['GLUE', 'AdvGLUE'])
     def plot_results(self, results, model_or_pipeline):
         radar_data = self.process_results(results)
+        graphic = radar_plot(radar_data, ['GLUE ' + model_or_pipeline,  'AdvGLUE ' + model_or_pipeline])
         return graphic