Chenxi Whitehouse committed
Commit • a6e9308
Parent(s): ce6cd35
update src

Files changed:
- README.md +1 -1
- src/prediction/evaluate_veracity.py +3 -8
- src/prediction/veracity_prediction.py +49 -5
README.md CHANGED

@@ -120,7 +120,7 @@ The result for dev and the test set below. We recommend using 0.25 as cut-off sc
 
 | Model             | Split | Q only | Q + A | Veracity @ 0.2 | @ 0.25 | @ 0.3 |
 |-------------------|-------|--------|-------|----------------|--------|-------|
-| AVeriTeC-BLOOM-7b | dev   |        |       |                |        |       |
+| AVeriTeC-BLOOM-7b | dev   | 0.24   | 0.19  | 0.19           | 0.09   | 0.05  |
 | AVeriTeC-BLOOM-7b | test  |        |       |                |        |       |
 
 ## Citation
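A note on reading the filled-in dev row: "Q only" and "Q + A" score the retrieved evidence, and each "Veracity @ λ" column counts a predicted label as correct only when the evidence score reaches the cut-off λ. A minimal sketch of that reading, with illustrative names only; the repository's actual scorer is `evaluate_averitec_veracity_by_type` in `src/prediction/evaluate_veracity.py`:

```python
# Illustrative only: one way to read a "Veracity @ cut-off" column. None of
# these names come from the repository.
def veracity_at_cutoff(pred_labels, gold_labels, evidence_scores, cutoff=0.25):
    """Count a predicted label as correct only if its evidence score reaches the cut-off."""
    correct = sum(
        pred == gold and score >= cutoff
        for pred, gold, score in zip(pred_labels, gold_labels, evidence_scores)
    )
    return correct / len(gold_labels)

# Three claims; the third has evidence below the 0.25 cut-off, so its
# (correct) label does not count.
print(veracity_at_cutoff(
    ["Supported", "Refuted", "Supported"],
    ["Supported", "Refuted", "Supported"],
    [0.31, 0.27, 0.12],
))  # 2/3 ≈ 0.667
```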
src/prediction/evaluate_veracity.py CHANGED

@@ -23,7 +23,7 @@ def compute_all_pairwise_scores(src_data, tgt_data, metric):
     return scores
 
 
-def print_with_space(left, right, left_space=
+def print_with_space(left, right, left_space=45):
     print_spaces = " " * (left_space - len(left))
     print(left + print_spaces + right)
 
@@ -303,14 +303,9 @@ if __name__ == "__main__":
             str(v_score[i]),
         )
     print("--------------------")
+    print("AVeriTeC scores by type @ 0.25:")
     type_scores = scorer.evaluate_averitec_veracity_by_type(
-        predictions, references, threshold=0.2
-    )
-    for t, v in type_scores.items():
-        print_with_space(" * Veracity scores (" + t + "):", str(v))
-    print("--------------------")
-    type_scores = scorer.evaluate_averitec_veracity_by_type(
-        predictions, references, threshold=0.3
+        predictions, references, threshold=0.25
     )
     for t, v in type_scores.items():
         print_with_space(" * Veracity scores (" + t + "):", str(v))
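The second hunk consolidates the per-type report to the recommended 0.25 cut-off and relies on the `print_with_space` helper from the first hunk for column alignment. A self-contained demonstration of the helper as it stands after this commit; the sample type and score are made up:

```python
def print_with_space(left, right, left_space=45):
    # Pad the left label with spaces up to the target column, then print the value.
    print_spaces = " " * (left_space - len(left))
    print(left + print_spaces + right)

# Report lines produced after this commit look like:
print("AVeriTeC scores by type @ 0.25:")
print_with_space(" * Veracity scores (Supported):", "0.19")
```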
src/prediction/veracity_prediction.py CHANGED

@@ -2,11 +2,9 @@ import argparse
 import json
 import tqdm
 import torch
+import pytorch_lightning as pl
 from transformers import BertTokenizer, BertForSequenceClassification
-from data_loaders.SequenceClassificationDataLoader import (
-    SequenceClassificationDataLoader,
-)
-from models.SequenceClassificationModule import SequenceClassificationModule
+from src.models.SequenceClassificationModule import SequenceClassificationModule
 
 
 LABEL = [
@@ -17,6 +15,50 @@ LABEL = [
 ]
 
 
+class SequenceClassificationDataLoader(pl.LightningDataModule):
+    def __init__(self, tokenizer, data_file, batch_size, add_extra_nee=False):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.data_file = data_file
+        self.batch_size = batch_size
+        self.add_extra_nee = add_extra_nee
+
+    def tokenize_strings(
+        self,
+        source_sentences,
+        max_length=512,
+        pad_to_max_length=False,
+        return_tensors="pt",
+    ):
+        encoded_dict = self.tokenizer(
+            source_sentences,
+            max_length=max_length,
+            padding="max_length" if pad_to_max_length else "longest",
+            truncation=True,
+            return_tensors=return_tensors,
+        )
+
+        input_ids = encoded_dict["input_ids"]
+        attention_masks = encoded_dict["attention_mask"]
+
+        return input_ids, attention_masks
+
+    def quadruple_to_string(self, claim, question, answer, bool_explanation=""):
+        if bool_explanation is not None and len(bool_explanation) > 0:
+            bool_explanation = ", because " + bool_explanation.lower().strip()
+        else:
+            bool_explanation = ""
+        return (
+            "[CLAIM] "
+            + claim.strip()
+            + " [QUESTION] "
+            + question.strip()
+            + " "
+            + answer.strip()
+            + bool_explanation
+        )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Given a claim and its 3 QA pairs as evidence, we use another pre-trained BERT model to predict the veracity label."
@@ -83,7 +125,9 @@ if __name__ == "__main__":
 
     tokenized_strings, attention_mask = dataLoader.tokenize_strings(example_strings)
     example_support = torch.argmax(
-        trained_model(
+        trained_model(
+            tokenized_strings.to(device), attention_mask=attention_mask.to(device)
+        ).logits,
         axis=1,
     )
 
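To make the inlined data loader concrete: `quadruple_to_string` flattens a claim and one QA pair into a single `[CLAIM] … [QUESTION] …` string, `tokenize_strings` encodes it, and the predicted label index is the argmax over the classifier logits, as in the third hunk. A minimal usage sketch; `bert-base-uncased`, the 4-class head, and the sample texts are stand-ins for the trained checkpoint the script actually loads through `SequenceClassificationModule`:

```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# The class added by this commit.
from src.prediction.veracity_prediction import SequenceClassificationDataLoader

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Stand-in for the trained veracity checkpoint; 4 classes, assuming the
# 4-way AVeriTeC label set in the script's LABEL list.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=4
)
model.eval()

loader = SequenceClassificationDataLoader(
    tokenizer=tokenizer, data_file=None, batch_size=1
)

# Flatten one claim + QA pair the same way the script does before predicting.
example = loader.quadruple_to_string(
    "Global wheat production fell in 2020.",      # claim (made up)
    "What was global wheat production in 2020?",  # question (made up)
    "It rose slightly compared to 2019.",         # answer (made up)
)

input_ids, attention_mask = loader.tokenize_strings([example])
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask).logits
predicted_label = torch.argmax(logits, axis=1).item()  # index into LABEL
```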