"""
A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
https://arxiv.org/abs/2105.03011

QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.

Homepage: https://allenai.org/data/qasper
"""
from collections import Counter
import re
import string

from lm_eval.base import rf, Task
from lm_eval.metrics import f1_score, mean
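
# Rough sketch of one record in the "qasper" dataset as this task consumes it.
# Only the fields used below are shown, and the example values are made up:
#
#   {
#       "title": "...",
#       "abstract": "...",
#       "qas": {
#           "question": ["What baselines do they compare against?", ...],
#           "answers": [
#               {"answer": [{"unanswerable": False, "yes_no": None,
#                            "free_form_answer": "", "extractive_spans": [...]}]},
#               ...,
#           ],
#       },
#   }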


_CITATION = """
@article{DBLP:journals/corr/abs-2105-03011,
    author     = {Pradeep Dasigi and
                  Kyle Lo and
                  Iz Beltagy and
                  Arman Cohan and
                  Noah A. Smith and
                  Matt Gardner},
    title      = {A Dataset of Information-Seeking Questions and Answers Anchored in
                  Research Papers},
    journal    = {CoRR},
    volume     = {abs/2105.03011},
    year       = {2021},
    url        = {https://arxiv.org/abs/2105.03011},
    eprinttype = {arXiv},
    eprint     = {2105.03011},
    timestamp  = {Fri, 14 May 2021 12:13:30 +0200},
    biburl     = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
    bibsource  = {dblp computer science bibliography, https://dblp.org}
}
"""


def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
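

# For example, normalize_answer("The F1 score, averaged.") lowercases the text,
# strips punctuation and articles, collapses whitespace, and returns
# "f1 score averaged".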


def categorise_answer(answer_blob):
    """Map one QASPER answer annotation to an (answer, answer_type) pair.

    The checks are ordered so that "unanswerable" wins first, then an explicit
    "yes", then free-form and extractive answers, with a bare "no" used only as
    a last resort; a blob with no usable annotation falls through and returns
    None.
    """
    if answer_blob["unanswerable"]:
        answer = "unanswerable"
        answer_type = "unanswerable"
        return answer, answer_type
    elif answer_blob["yes_no"]:
        answer = "yes"
        answer_type = "bool"
        return answer, answer_type
    elif answer_blob["free_form_answer"]:
        answer = answer_blob["free_form_answer"]
        answer_type = "free form answer"
        return answer, answer_type
    elif answer_blob["extractive_spans"]:
        answer = answer_blob["extractive_spans"]
        answer_type = "extractive_spans"
        return answer, answer_type
    elif answer_blob["yes_no"] is False:
        answer = "no"
        answer_type = "bool"
        return answer, answer_type
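

# Illustrative mapping (the field values are made up for the example):
#   {"unanswerable": False, "yes_no": None, "free_form_answer": "",
#    "extractive_spans": ["BERT", "BiLSTM-CRF"]}
#   -> (["BERT", "BiLSTM-CRF"], "extractive_spans")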


def token_f1_score(prediction, ground_truth):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Computes token-level F1 between a predicted and a gold answer string.
    """
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
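

# Worked example: token_f1_score("BERT base", "BERT large") has a one-token
# overlap ("bert"), so precision = recall = 1/2 and F1 = 0.5.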


class QASPER(Task):
    VERSION = 0
    DATASET_PATH = "qasper"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def doc_to_text(self, doc):
        return (
            "TITLE: "
            + doc["title"]
            + "\n"
            + "ABSTRACT: "
            + doc["abstract"]
            + "\n\n"
            + "Q: "
            + doc["question"]
            + "\n\n"
            + "A:"
        )
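
    # The prompt therefore looks like (placeholder values):
    #   TITLE: <title>
    #   ABSTRACT: <abstract>
    #
    #   Q: <question>
    #
    #   A: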

    def doc_to_target(self, doc):
        answer = doc["answer"]
        if isinstance(answer, list):
            answer = ", ".join(answer)
        return " " + answer
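
    # Extractive answers arrive as lists of spans, so e.g. ["BERT", "BiLSTM-CRF"]
    # (made-up values) becomes the target string " BERT, BiLSTM-CRF".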

    def training_docs(self):
        for doc in self.dataset["train"]:
            yield from self._process_doc(doc)

    def validation_docs(self):
        for doc in self.dataset["validation"]:
            yield from self._process_doc(doc)

    def _process_doc(self, doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = []
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = categorise_answer(answer_blob)
                obs_list.append(
                    {
                        "title": doc["title"],
                        "abstract": doc["abstract"],
                        "question": question,
                        "answer": answer,
                        "answer_type": answer_type,
                    }
                )
        return obs_list
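
    # E.g. a paper whose one question carries two annotator answers (a "yes" and
    # a free-form answer) is flattened into two observations, each repeating the
    # paper's title and abstract.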

    def process_results(self, doc, results):
        # `results` mirrors `construct_requests`: one greedy completion for
        # free-form answers, a pair of loglikelihoods for yes/no questions, and
        # nothing for unanswerable or extractive-span instances, which therefore
        # contribute no metrics.
        if not results:
            return {}
        elif len(results) == 1:
            [res] = results
        elif len(results) == 2:
            [ll_yes, ll_no] = results

        res_dict = {}

        if doc["answer_type"] == "bool":
            gold = 1 if doc["answer"] == "yes" else 0
            pred = ll_yes > ll_no
            res_dict["f1_yesno"] = (gold, pred)

        if doc["answer_type"] == "free form answer":
            res_dict["f1_abstractive"] = token_f1_score(res, doc["answer"])

        return res_dict

    def aggregation(self):
        return {
            "f1_yesno": f1_score,
            "f1_abstractive": mean,
        }
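
    # "f1_yesno" hands the collected (gold, pred) pairs to lm_eval.metrics.f1_score,
    # which reduces them to a single F1 over all yes/no instances, while
    # "f1_abstractive" is the mean of the per-instance token F1 scores.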

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        if doc["answer_type"] == "free form answer":
            return [rf.greedy_until(ctx, ["\n"])]
        elif doc["answer_type"] == "bool":
            ll_yes, _ = rf.loglikelihood(ctx, " yes")
            ll_no, _ = rf.loglikelihood(ctx, " no")
            return [ll_yes, ll_no]
        else:
            return []
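
    # For a "bool" instance the two requests score the continuations " yes" and
    # " no" against the same context; for a "free form answer" instance the model
    # generates greedily until the first newline.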

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "f1_yesno": True,
            "f1_abstractive": True,
        }
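

# Minimal smoke test of the helper functions above, run only when this file is
# executed directly; all input strings below are made up for illustration.
if __name__ == "__main__":
    blob = {
        "unanswerable": False,
        "yes_no": None,
        "free_form_answer": "a span-based extraction model",
        "extractive_spans": [],
    }
    print(categorise_answer(blob))  # ('a span-based extraction model', 'free form answer')
    print(normalize_answer("The F1 score, averaged."))  # f1 score averaged
    print(token_f1_score("BERT base", "BERT large"))  # 0.5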