Oskar van der Wal committed on
Commit
f4a1b77
0 Parent(s):

Copy from the oskarvanderwal/evaluation GitHub repo

Files changed (1)
  1. crowspairs.py +124 -0
crowspairs.py ADDED
@@ -0,0 +1,124 @@
+import pandas as pd
+import torch
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+from evaluation.tasks.auto_task import AutoTask
+
+
+class CrowSPairsDataset(Dataset):
+    def __init__(self):
+        super().__init__()
+
+        # TODO: maybe implement using HuggingFace Datasets
+        # https://huggingface.co/datasets/crows_pairs
+
+        # Load the CrowS-Pairs dataset from URL
+        url = "https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/crows_pairs_anonymized.csv"
+        df = pd.read_csv(url)
+
+        # If direction is stereo, sent1 and sent2 are sent_more and sent_less respectively;
+        # otherwise it is the other way around.
+        df["direction"] = df["stereo_antistereo"]
+        df["sent1"] = df["sent_less"]
+        df["sent2"] = df["sent_more"]
+        df.loc[df["direction"] == "stereo", "sent1"] = df["sent_more"]
+        df.loc[df["direction"] == "stereo", "sent2"] = df["sent_less"]
+
+        # Convert the dataframe to a list of dictionaries
+        self.items = df[["sent1", "sent2", "direction", "bias_type"]].to_dict("records")
+
+    def __len__(self):
+        return len(self.items)
+
+    def __getitem__(self, index):
+        return self.items[index]
+
+
+class CrowSPairsTask(AutoTask):
+    @staticmethod
+    def get_display_name() -> str:
+        return "crowspairs"
+
+    @staticmethod
+    def metric_score(df_score):
+        """Returns the fraction of pairs for which the model prefers the stereotypical sentence."""
+        metric_score = df_score["sent_more_score"].gt(df_score["sent_less_score"]).sum()
+        metric_score /= len(df_score)
+        return metric_score
+
+    def evaluate(self) -> None:
+        """
+        All task-specific evaluation logic lives here.
+        Model and tokenizer are available as self.model and self.tokenizer, respectively.
+        For task-specific configurations, populate english.json or multilingual.json.
+        Configs are read at initialization and available in dict form as self.task_config.
+        For further details, refer to the AutoTask parent class in auto_task.py.
+        """
+        dataset = CrowSPairsDataset()
+
+        df_score = pd.DataFrame(
+            columns=[
+                "sent_more",
+                "sent_less",
+                "sent_more_score",
+                "sent_less_score",
+                "score",
+                "stereo_antistereo",
+                "bias_type",
+            ]
+        )
+
+        for item in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
+            sent1 = torch.LongTensor(self.tokenizer.encode(item["sent1"])).to(self.device)
+            sent2 = torch.LongTensor(self.tokenizer.encode(item["sent2"])).to(self.device)
+
+            with torch.no_grad():
+                output_sent1 = self.model(sent1, labels=sent1)
+                output_sent2 = self.model(sent2, labels=sent2)
+
+            # Calculate the negative perplexity, assuming the loss is the cross-entropy loss.
+            score_sent1 = -torch.exp(output_sent1["loss"])
+            score_sent2 = -torch.exp(output_sent2["loss"])
+
+            # Score this item following:
+            # https://github.com/nyu-mll/crows-pairs/blob/master/metric.py#L213
+
+            sent_more, sent_less = "", ""
+            if item["direction"] == "stereo":
+                sent_more = item["sent1"]
+                sent_less = item["sent2"]
+                sent_more_score = score_sent1
+                sent_less_score = score_sent2
+            else:
+                sent_more = item["sent2"]
+                sent_less = item["sent1"]
+                sent_more_score = score_sent2
+                sent_less_score = score_sent1
+
+            df_score = df_score.append(
+                {
+                    "sent_more": sent_more,
+                    "sent_less": sent_less,
+                    "sent_more_score": sent_more_score,
+                    "sent_less_score": sent_less_score,
+                    "stereo_antistereo": item["direction"],
+                    "bias_type": item["bias_type"],
+                },
+                ignore_index=True,
+            )
+
+        # Aggregate the item scores into the bias metric
+        metric_scores = {}
+        metric_scores["all"] = self.metric_score(df_score)
+
+        # Metric score per bias_type
+        bias_types = df_score["bias_type"].unique()
+        for bias_type in bias_types:
+            df_subset = df_score[df_score["bias_type"] == bias_type]
+            metric_scores[bias_type] = self.metric_score(df_subset)
+
+        # Save the aggregated bias metrics
+        self.metrics["crowspairs_bias"] = float(metric_scores["all"])
+        for bias_type in bias_types:
+            self.metrics[f"crowspairs_bias_{bias_type}"] = float(metric_scores[bias_type])
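
For reference, here is a minimal standalone sketch of the per-pair scoring step used in evaluate() above, with GPT-2 from Hugging Face transformers standing in for self.model and self.tokenizer. The model name, the helper pseudo_perplexity_score, and the example sentences are illustrative placeholders, not part of this repo.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model choice
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

def pseudo_perplexity_score(sentence: str) -> float:
    """Negative perplexity: a higher (less negative) score means the model finds the sentence more likely."""
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        # Passing the input ids as labels makes the model return the mean cross-entropy loss.
        outputs = model(**inputs, labels=inputs["input_ids"])
    return -torch.exp(outputs.loss).item()

# A pair counts towards the bias metric when the stereotypical sentence gets the higher score.
sent_more = "An example sentence expressing a stereotype."   # placeholder
sent_less = "An example sentence without the stereotype."    # placeholder
print(pseudo_perplexity_score(sent_more) > pseudo_perplexity_score(sent_less))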