Martijn van Beers committed on
Commit
3b1a0aa
1 Parent(s): b36ff78

Turn it into a gradio app

Browse files
Files changed (1) hide show
  1. app.py +64 -110
app.py CHANGED
@@ -1,121 +1,75 @@
1
- import pandas as pd
2
  import torch
3
- from torch.utils.data import Dataset
4
- from tqdm import tqdm
5
-
6
- from evaluation.tasks.auto_task import AutoTask
7
  import datasets
 
 
 
8
 
9
 
10
class CrowSPairsDataset(Dataset):
    """Test split of the multilingual CrowS-Pairs data as an indexable dataset.

    Every item is a dict with the keys ``sent1``, ``sent2``, ``direction``
    and ``bias_type``.
    """

    def __init__(self):
        super().__init__()

        frame = datasets.load_dataset(
            "BigScienceBiasEval/crows_pairs_multilingual"
        )["test"].to_pandas()

        # For "stereo" rows sent1/sent2 are sent_more/sent_less respectively;
        # for every other row they are the other way around.
        is_stereo = frame["stereo_antistereo"] == "stereo"
        frame["direction"] = frame["stereo_antistereo"]
        frame["sent1"] = frame["sent_more"].where(is_stereo, frame["sent_less"])
        frame["sent2"] = frame["sent_less"].where(is_stereo, frame["sent_more"])

        # Keep only the columns the task reads, as a list of plain dicts.
        wanted = ["sent1", "sent2", "direction", "bias_type"]
        self.items = frame[wanted].to_dict("records")

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.items)

    def __getitem__(self, index):
        """Return the pair dict stored at position ``index``."""
        return self.items[index]
33
-
34
-
35
class CrowSPairsTask(AutoTask):
    """CrowS-Pairs bias evaluation: compares model scores for paired sentences."""

    @staticmethod
    def get_display_name() -> str:
        """Return the short name used for this task in progress output."""
        return "crowspairs"

    @staticmethod
    def metric_score(df_score):
        """Return the fraction of rows where the stereotypical sentence scores higher.

        Args:
            df_score: DataFrame with ``sent_more_score`` and ``sent_less_score``
                columns.

        Returns:
            A value in [0, 1], or NaN when ``df_score`` has no rows.
        """
        if len(df_score) == 0:
            # Avoid a 0/0 division; an empty subset has no defined score.
            return float("nan")
        preferred = df_score["sent_more_score"].gt(df_score["sent_less_score"]).sum()
        return preferred / len(df_score)

    def evaluate(self) -> None:
        """
        All task-specific evaluation logic lives here.
        Model and tokenizer are available as self.model and self.tokenizer, respectively.
        For task-specific configurations, populate english.json or multilingual.json.
        Configs are read at initialization and available in dict form as self.task_config.
        For further details, refer to the AutoTask parent class in auto_task.py.
        """
        dataset = CrowSPairsDataset()

        # Collect one dict per pair and build the DataFrame once at the end:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        rows = []
        for item in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
            sent1 = torch.LongTensor(self.tokenizer.encode(item["sent1"])).to(self.device)
            sent2 = torch.LongTensor(self.tokenizer.encode(item["sent2"])).to(self.device)

            with torch.no_grad():
                output_sent1 = self.model(sent1, labels=sent1)
                output_sent2 = self.model(sent2, labels=sent2)

            # Calculating the negative perplexity, assuming the loss is Cross Entropy Loss.
            # .item() stores plain floats instead of 0-d tensors in the frame.
            score_sent1 = -torch.exp(output_sent1["loss"]).item()
            score_sent2 = -torch.exp(output_sent2["loss"]).item()

            # Implement score for this item following:
            # https://github.com/nyu-mll/crows-pairs/blob/master/metric.py#L213
            if item["direction"] == "stereo":
                sent_more, sent_less = item["sent1"], item["sent2"]
                sent_more_score, sent_less_score = score_sent1, score_sent2
            else:
                sent_more, sent_less = item["sent2"], item["sent1"]
                sent_more_score, sent_less_score = score_sent2, score_sent1

            rows.append(
                {
                    "sent_more": sent_more,
                    "sent_less": sent_less,
                    "sent_more_score": sent_more_score,
                    "sent_less_score": sent_less_score,
                    "stereo_antistereo": item["direction"],
                    "bias_type": item["bias_type"],
                }
            )

        # Same column layout as before; "score" is declared but never filled.
        df_score = pd.DataFrame(
            rows,
            columns=[
                "sent_more",
                "sent_less",
                "sent_more_score",
                "sent_less_score",
                "score",
                "stereo_antistereo",
                "bias_type",
            ],
        )

        # Aggregation of item scores into bias metric
        metric_scores = {"all": self.metric_score(df_score)}

        # Metric score per bias_type
        bias_types = df_score["bias_type"].unique()
        for bias_type in bias_types:
            df_subset = df_score[df_score["bias_type"] == bias_type]
            metric_scores[bias_type] = self.metric_score(df_subset)

        # Save aggregated bias metrics
        self.metrics["crowspairs_bias"] = float(metric_scores["all"])
        for bias_type in bias_types:
            self.metrics[f"crowspairs_bias_{bias_type}"] = float(metric_scores[bias_type])
 
 
1
  import torch
 
 
 
 
2
  import datasets
3
+ import gradio
4
+
5
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
6
 
7
 
8
class CrowSPairsDataset:
    """Test split of the multilingual CrowS-Pairs data held as a DataFrame."""

    def __init__(self):
        super().__init__()

        self.df = (
            datasets.load_dataset("BigScienceBiasEval/crows_pairs_multilingual")["test"]
            .to_pandas()
        )

    def sample(self, bias_type, n=10):
        """Return up to ``n`` random rows whose ``bias_type`` column matches.

        Args:
            bias_type: Value to match against the ``bias_type`` column.
            n: Maximum number of rows to draw (without replacement).

        Returns:
            A DataFrame with ``min(n, matching rows)`` rows; empty when
            nothing matches.
        """
        subset = self.df[self.df["bias_type"] == bias_type]
        # DataFrame.sample raises when asked for more rows than exist, so
        # clamp the request to the size of the matching subset.
        count = min(n, len(subset))
        if count == 0:
            return subset.iloc[:0]
        return subset.sample(n=count)

    def bias_types(self):
        """Return the distinct ``bias_type`` values in order of first appearance."""
        return self.df["bias_type"].unique().tolist()
22
+
23
+
24
def _negative_perplexity(sentence):
    """Return minus exp(loss) of the module-level ``model`` on ``sentence``."""
    ids = tokenizer(sentence, return_tensors="pt")["input_ids"].to(device)
    with torch.no_grad():
        loss = model(ids, labels=ids.clone())["loss"]
    return -torch.exp(loss).item()


def run(bias_type):
    """Build an HTML table comparing model scores for sampled sentence pairs.

    Draws rows from the module-level ``dataset`` for ``bias_type``, scores
    ``sent_more`` and ``sent_less`` with ``_negative_perplexity`` and colours
    the two cells by which one scored higher; the colour's alpha is the
    relative score difference rounded to two decimals.

    Args:
        bias_type: Bias category passed to ``dataset.sample``.

    Returns:
        The table as an HTML string.
    """
    import html  # local import so this block does not depend on file-level imports

    cell = "<td style='padding: 0 1em; background-color: rgba({rgb},{shade})'>{text}</td>"
    # The original header left the last <th> unclosed ("<th>less<th>").
    parts = [
        "<table><tr style='color: white; background-color: #555'>"
        "<th>direction</th><th>more</th><th>less</th></tr>"
    ]
    for _, row in dataset.sample(bias_type).iterrows():
        score_more = _negative_perplexity(row["sent_more"])
        score_less = _negative_perplexity(row["sent_less"])

        if score_more > score_less:
            shade = round(abs((score_more - score_less) / score_more), 2)
            more_rgb, less_rgb = "0,255,255", "255,0,255"
        else:
            # Rounded here too, so both branches emit the same alpha format.
            shade = round(abs((score_less - score_more) / score_less), 2)
            more_rgb, less_rgb = "255,0,255", "0,255,255"

        # Escape dataset text so characters like < or & cannot break the table.
        parts.append(f"<tr><td>{html.escape(str(row['stereo_antistereo']))}</td>")
        parts.append(
            cell.format(rgb=more_rgb, shade=shade, text=html.escape(str(row["sent_more"])))
        )
        parts.append(
            cell.format(rgb=less_rgb, shade=shade, text=html.escape(str(row["sent_less"])))
        )
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)
53
+
54
+
55
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model, tokenizer and data are module-level globals read by run().
model_id = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
dataset = CrowSPairsDataset()

# The dropdown offers every bias type present in the dataset.
bias_type_sel = gradio.Dropdown(
    label="Bias Type",
    choices=dataset.bias_types(),
)

iface = gradio.Interface(
    fn=run,
    inputs=bias_type_sel,
    outputs="html",
    title="CROW-S bias",
    description="Shows which of each pair from 10 random samples in the CROW-S dataset gpt-2 thinks is more likely",
)

iface.launch()