saxenarohit committed on
Commit
a0d8a50
·
1 Parent(s): 9563130
src/backend/tasks/cnndm/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CNN/DailyMail (`cnndm`)
2
+
3
+ ### Paper
4
+
5
+ Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD`
6
+ Abstract: https://arxiv.org/abs/1806.03822
7
+
8
+ Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
9
+ consisting of questions posed by crowdworkers on a set of Wikipedia articles,
10
+ where the answer to every question is a segment of text, or span, from the
11
+ corresponding reading passage, or the question might be unanswerable.
12
+ SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
13
+ questions written adversarially by crowdworkers to look similar to answerable ones.
14
+ To do well on SQuAD2.0, systems must not only answer questions when possible, but
15
+ also determine when no answer is supported by the paragraph and abstain from answering.
16
+
17
+ Homepage: https://rajpurkar.github.io/SQuAD-explorer/
18
+
19
+
20
+ ### Citation
21
+
22
+ ```
23
+ @misc{rajpurkar2018know,
24
+ title={Know What You Don't Know: Unanswerable Questions for SQuAD},
25
+ author={Pranav Rajpurkar and Robin Jia and Percy Liang},
26
+ year={2018},
27
+ eprint={1806.03822},
28
+ archivePrefix={arXiv},
29
+ primaryClass={cs.CL}
30
+ }
31
+ ```
32
+
33
+ ### Groups and Tasks
34
+
35
+ #### Groups
36
+
37
+ * Not part of a group yet
38
+
39
+ #### Tasks
40
+
41
+ * `cnndm`: summarization on the `cnn_dailymail` dataset (config `3.0.0`), scored with ROUGE-1, ROUGE-2 and ROUGE-L
42
+
43
+ ### Checklist
44
+
45
+ For adding novel benchmarks/datasets to the library:
46
+ * [ ] Is the task an existing benchmark in the literature?
47
+ * [ ] Have you referenced the original paper that introduced the task?
48
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
49
+
50
+
51
+ If other tasks on this dataset are already supported:
52
+ * [ ] Is the "Main" variant of this task clearly denoted?
53
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
54
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
src/backend/tasks/cnndm/__pycache__/task.cpython-39.pyc ADDED
Binary file (4.27 kB). View file
 
src/backend/tasks/cnndm/__pycache__/utils.cpython-39.pyc ADDED
Binary file (2.81 kB). View file
 
src/backend/tasks/cnndm/task.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lm_eval.api.task import Task
2
+ from lm_eval.api.instance import Instance
3
+ from lm_eval.api.registry import register_task
4
+ from lm_eval.api.metrics import mean
5
+ import datasets
6
+ from src.backend.tasks.cnndm import utils
7
+
8
+
9
@register_task("cnndm")
class CnnDm(Task):
    """Summarization task over CNN/DailyMail, scored with ROUGE.

    Each document is turned into a ``Document: ...\\nSummary:`` prompt, the
    model generates a continuation, and the continuation is compared against
    the document's ``highlights`` field by ``utils.process_results``.

    ``DATASET_PATH`` / ``DATASET_NAME`` identify the dataset and its
    configuration (presumably consumed by the base ``Task`` when it loads
    ``self.dataset`` -- the loading code is not part of this file).
    """

    VERSION = 0
    DATASET_PATH = "cnn_dailymail"
    DATASET_NAME = "3.0.0"

    # Names of the per-document metrics this task reports. Shared by
    # ``aggregation`` and ``higher_is_better`` so the two cannot drift apart.
    _ROUGE_METRICS = ("rouge1", "rouge2", "rougeL")

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
        """Forward all construction arguments unchanged to the base ``Task``."""
        super().__init__(
            data_dir=data_dir,
            cache_dir=cache_dir,
            download_mode=download_mode,
            config=config,
        )

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that a test split is available."""
        return True

    def training_docs(self):
        """Return the ``"train"`` split of the loaded dataset."""
        return self.dataset["train"]

    def validation_docs(self):
        """Return the ``"validation"`` split of the loaded dataset."""
        return self.dataset["validation"]

    def test_docs(self):
        """Return the ``"test"`` split of the loaded dataset."""
        return self.dataset["test"]

    def doc_to_text(self, doc):
        """Build the prompt: the article followed by a ``Summary:`` cue."""
        return f'Document: {doc["article"]}\nSummary:'

    @staticmethod
    def should_decontaminate():
        """Report that this task supports decontamination."""
        return True

    def doc_to_decontamination_query(self, doc):
        """Return the text used for decontamination checks: the article."""
        return doc["article"]

    def doc_to_target(self, doc):
        """Return the reference summary (the ``highlights`` field)."""
        return doc["highlights"]

    def construct_requests(self, doc, ctx, **kwargs):
        """Build the single generation request for one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :param kwargs:
            Extra keyword arguments passed through unchanged to ``Instance``.
        :returns:
            A one-element list holding a ``generate_until`` ``Instance``.
        """
        # NOTE(review): generation stops at the first newline or period, so a
        # completion can be at most one sentence, while the reference
        # ``highlights`` may span several -- confirm this is intended.
        stop_sequences = {"until": ["\n", "."]}
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, stop_sequences),
                idx=0,
                **kwargs
            )
        ]

    def process_results(self, doc, results):
        """Score the model output for ``doc``; delegates to ``utils.process_results``."""
        return utils.process_results(doc, results)

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {metric: mean for metric in self._ROUGE_METRICS}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {metric: True for metric in self._ROUGE_METRICS}
90
+
src/backend/tasks/cnndm/utils.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sacrebleu
2
+ import numpy as np
3
+
4
+ from rouge_score import rouge_scorer, scoring
5
+
6
+
7
def process_results(doc, results):
    """Score one generated summary against the document's reference summary.

    :param doc:
        A dataset record; its ``"highlights"`` entry is used as the single
        reference summary.
    :param results:
        The model outputs for this document; only the first element is scored.
    :returns:
        A dict with keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``. The
        ``"rougeL"`` entry carries the ``rougeLsum`` value computed by
        ``rouge``.
    """
    completion = results[0]
    reference = doc["highlights"]

    # ``rouge`` takes parallel lists; there is exactly one reference and one
    # prediction per document here.
    scores = rouge([reference], [completion])

    return {
        "rouge1": scores["rouge1"],
        "rouge2": scores["rouge2"],
        "rougeL": scores["rougeLsum"],
    }
39
+
40
+
41
def bleu(refs, preds):
    """
    Compute a `t5`-style corpus BLEU score. Related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    :returns:
        The ``score`` attribute of the result of ``sacrebleu.corpus_bleu``.
    """
    # Scoring options, gathered in one place and passed through unchanged.
    bleu_options = {
        "smooth_method": "exp",
        "smooth_value": 0.0,
        "force": False,
        "lowercase": False,
        "tokenize": "intl",
        "use_effective_order": False,
    }
    corpus_result = sacrebleu.corpus_bleu(preds, refs, **bleu_options)
    return corpus_result.score
62
+
63
+
64
def rouge(refs, preds):
    """
    Compute `t5`-style ROUGE scores. Related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    :returns:
        A dict keyed by ``"rouge1"``, ``"rouge2"`` and ``"rougeLsum"``; each
        value is the aggregated mid F-measure multiplied by 100.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)
    # Collects the per-pair scores and aggregates them at the end.
    aggregator = scoring.BootstrapAggregator()

    for reference, prediction in zip(refs, preds):
        # Put each sentence on its own line so `rougeLsum` is computed
        # correctly.
        reference = reference.replace(" . ", ".\n")
        prediction = prediction.replace(" . ", ".\n")
        pair_scores = scorer.score(reference, prediction)
        aggregator.add_scores(pair_scores)

    aggregated = aggregator.aggregate()
    return {
        rouge_type: aggregated[rouge_type].mid.fmeasure * 100
        for rouge_type in rouge_types
    }