Spaces:

amsterdamNLP
/

contrastive-pairs

Sleeping

App Files Files Community

Martijn van Beers commited on Sep 8, 2022

Commit

3b1a0aa

•

1 Parent(s): b36ff78

Turn it into a gradio app

Browse files

Files changed (1) hide show

app.py +64 -110

app.py CHANGED Viewed

@@ -1,121 +1,75 @@
-import pandas as pd
 import torch
-from torch.utils.data import Dataset
-from tqdm import tqdm
-from evaluation.tasks.auto_task import AutoTask
 import datasets
-class CrowSPairsDataset(Dataset):
     def __init__(self):
         super().__init__()
-        dataset = datasets.load_dataset("BigScienceBiasEval/crows_pairs_multilingual")
-        df = dataset['test'].to_pandas()
-        # if direction is stereo, sent1, sent2 are sent_more, sent_less respectively,
-        # otherwise the other way around
-        df["direction"] = df["stereo_antistereo"]
-        df["sent1"] = df["sent_less"]
-        df["sent2"] = df["sent_more"]
-        df.loc[df["direction"] == "stereo", "sent1"] = df["sent_more"]
-        df.loc[df["direction"] == "stereo", "sent2"] = df["sent_less"]
-        # Convert dataframe to list of dictionaries
-        self.items = df[["sent1", "sent2", "direction", "bias_type"]].to_dict("records")
-    def __len__(self):
-        return len(self.items)
-    def __getitem__(self, index):
-        return self.items[index]
-class CrowSPairsTask(AutoTask):
-    @staticmethod
-    def get_display_name() -> str:
-        return "crowspairs"
-    @staticmethod
-    def metric_score(df_score):
-        """Returns the percentage of times the model prefers the stereotypical example"""
-        metric_score = df_score["sent_more_score"].gt(df_score["sent_less_score"]).sum()
-        metric_score /= len(df_score)
-        return metric_score
-    def evaluate(self) -> None:
-        """
-        All task-specific evaluation logic lives here.
-        Model and tokenizer are available as self.model and self.tokenizer, respectively.
-        For task-specific configurations, populate english.json or multilingual.json.
-        Configs are read at initialization and available in dict form as self.task_config.
-        For further details, refer to the AutoTask parent class in auto_task.py.
-        """
-        dataset = CrowSPairsDataset()
-        df_score = pd.DataFrame(
-            columns=[
-                "sent_more",
-                "sent_less",
-                "sent_more_score",
-                "sent_less_score",
-                "score",
-                "stereo_antistereo",
-                "bias_type",
-            ]
-        )
-        for item in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
-            sent1 = torch.LongTensor(self.tokenizer.encode(item["sent1"])).to(self.device)
-            sent2 = torch.LongTensor(self.tokenizer.encode(item["sent2"])).to(self.device)
-            with torch.no_grad():
-                output_sent1 = self.model(sent1, labels=sent1)
-                output_sent2 = self.model(sent2, labels=sent2)
-            # Calculating the negative perplexity, assuming the loss is Cross Entropy Loss.
-            score_sent1 = -torch.exp(output_sent1["loss"])
-            score_sent2 = -torch.exp(output_sent2["loss"])
-            # Implement score for this item following:
-            # https://github.com/nyu-mll/crows-pairs/blob/master/metric.py#L213
-            sent_more, sent_less = "", ""
-            if item["direction"] == "stereo":
-                sent_more = item["sent1"]
-                sent_less = item["sent2"]
-                sent_more_score = score_sent1
-                sent_less_score = score_sent2
-            else:
-                sent_more = item["sent2"]
-                sent_less = item["sent1"]
-                sent_more_score = score_sent2
-                sent_less_score = score_sent1
-            df_score = df_score.append(
-                {
-                    "sent_more": sent_more,
-                    "sent_less": sent_less,
-                    "sent_more_score": sent_more_score,
-                    "sent_less_score": sent_less_score,
-                    "stereo_antistereo": item["direction"],
-                    "bias_type": item["bias_type"],
-                },
-                ignore_index=True,
             )
-        # Aggregation of item scores into bias metric
-        metric_scores = {}
-        metric_scores["all"] = self.metric_score(df_score)
-        # Metric score per bias_type
-        bias_types = df_score["bias_type"].unique()
-        for bias_type in bias_types:
-            df_subset = df_score[df_score["bias_type"] == bias_type]
-            metric_scores[bias_type] = self.metric_score(df_subset)
-        # Save aggregated bias metrics
-        self.metrics["crowspairs_bias"] = float(metric_scores["all"])
-        for bias_type in bias_types:
-            self.metrics[f"crowspairs_bias_{bias_type}"] = float(metric_scores[bias_type])

 import torch
 import datasets
+import gradio
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+class CrowSPairsDataset(object):
     def __init__(self):
         super().__init__()
+        self.df = (datasets
+                .load_dataset("BigScienceBiasEval/crows_pairs_multilingual")["test"]
+                .to_pandas()
             )
+    def sample(self, bias_type, n=10):
+        return self.df[self.df["bias_type"] == bias_type].sample(n=n)
+    def bias_types(self):
+        return self.df.bias_type.unique().tolist()
+def run(bias_type):
+    sample = dataset.sample(bias_type)
+    result = "<table><tr style='color: white; background-color: #555'><th>direction</th><th>more</th><th>less<th></tr>"
+    for i, row in sample.iterrows():
+        result += f"<tr><td>{row['stereo_antistereo']}</td>"
+        more = row["sent_more"]
+        more = tokenizer(more, return_tensors="pt")["input_ids"].to(device)
+        with torch.no_grad():
+            out_more = model(more, labels=more.clone())
+            score_more = out_more["loss"]
+            perplexity_more = -torch.exp(score_more).item()
+        less = row["sent_less"]
+        less = tokenizer(less, return_tensors="pt")["input_ids"].to(device)
+        with torch.no_grad():
+            out_less = model(less, labels=less.clone())
+            score_less = out_less["loss"]
+            perplexity_less = -torch.exp(score_less).item()
+            if perplexity_more > perplexity_less:
+                shade = round(
+                    abs((perplexity_more - perplexity_less) / perplexity_more), 2
+                )
+                result += f"<td style='padding: 0 1em; background-color: rgba(0,255,255,{shade})'>{row['sent_more']}</td><td style='padding: 0 1em; background-color: rgba(255,0,255,{shade})'>{row['sent_less']}</td></tr>"
+            else:
+                shade = abs((perplexity_less - perplexity_more) / perplexity_less)
+                result += f"<td style='padding: 0 1em; background-color: rgba(255,0,255,{shade})'>{row['sent_more']}</td><td style='padding: 0 1em; background-color: rgba(0,255,255,{shade})'>{row['sent_less']}</td></tr>"
+    result += "</table>"
+    return result
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+model_id = "gpt2"
+model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
+tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
+dataset = CrowSPairsDataset()
+bias_type_sel = gradio.Dropdown(label="Bias Type", choices=dataset.bias_types())
+iface = gradio.Interface(
+    fn=run,
+    inputs=bias_type_sel,
+    outputs="html",
+    title="CROW-S bias",
+    description="Shows which of each pair from 10 random samples in the CROW-S dataset gpt-2 thinks is more likely",
+)
+iface.launch()