"""
Similarity of Semantic Relations
https://arxiv.org/pdf/cs/0608100.pdf

SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374
multiple-choice analogy questions; 5 choices per question.

Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)
"""
import inspect
import lm_eval.datasets.sat_analogies.sat_analogies
from lm_eval.base import MultipleChoiceTask


_CITATION = """
@article{article,
    author = {Turney, Peter},
    year = {2006},
    month = {09},
    pages = {379-416},
    title = {Similarity of Semantic Relations},
    volume = {32},
    journal = {Computational Linguistics},
    doi = {10.1162/coli.2006.32.3.379}
}
"""


class SATAnalogies(MultipleChoiceTask):
    """Multiple-choice task built on the SAT Analogy Questions dataset.

    Each document is turned into a prompt of the form
    ``"<w1> is to <w2> as"`` and a list of candidate continuations of the
    form ``"<c1> is to <c2>"``. Only a validation split is exposed.
    """

    VERSION = 0
    # Filesystem path of the dataset loading script bundled with the package.
    DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies)
    DATASET_NAME = None

    def __init__(self, data_dir: str):
        """Create the task from a locally stored copy of the data.

        SAT Analogy Questions is not publicly available. The data has to be
        requested by emailing Peter Turney and then saved to a local
        directory.

        Args:
            data_dir: Path of the local directory that holds the downloaded
                data; forwarded unchanged to the parent constructor.
        """
        super().__init__(data_dir=data_dir)

    def has_training_docs(self) -> bool:
        """Report that this task has no training split."""
        return False

    def has_validation_docs(self) -> bool:
        """Report that this task has a validation split."""
        return True

    def has_test_docs(self) -> bool:
        """Report that this task has no test split."""
        return False

    def training_docs(self):
        """Return an empty list; there are no training documents."""
        return []

    def validation_docs(self):
        """Return a lazy iterator over the processed validation documents."""
        # NOTE(review): `self.dataset` is not assigned in this file; it is
        # presumably populated by the parent class -- confirm.
        raw_docs = self.dataset["validation"]
        return map(self._process_doc, raw_docs)

    def test_docs(self):
        """Return an empty list; there are no test documents."""
        return []

    def _process_doc(self, doc):
        """Convert one raw record into the multiple-choice document layout.

        Args:
            doc: Mapping with the keys ``"source"``, ``"stem"``,
                ``"choices"`` and ``"solution"``.

        Returns:
            A dict with ``"source"`` (copied through), ``"query"`` (the
            first two space-separated tokens of the stem), ``"choices"``
            (one ``"<w1> is to <w2>"`` string per raw choice) and ``"gold"``
            (zero-based position of the solution letter among ``a``-``e``).

        Raises:
            ValueError: If the stripped solution is not one of ``a``-``e``.
        """
        answer_letters = ["a", "b", "c", "d", "e"]

        source = doc["source"]
        # Anything after the first two tokens of the stem is discarded.
        stem_pair = doc["stem"].split(" ")[:2]

        choices = []
        for option in doc["choices"]:
            option_pair = option.split(" ")[:2]
            choices.append("{} is to {}".format(*option_pair))

        gold = answer_letters.index(doc["solution"].strip())

        return {
            "source": source,
            "query": stem_pair,
            "choices": choices,
            "gold": gold,
        }

    def doc_to_text(self, doc):
        """Render the prompt ``"<w1> is to <w2> as"`` from the query pair."""
        prompt_template = "{} is to {} as"
        return prompt_template.format(*doc["query"])

    def should_decontaminate(self) -> bool:
        """Report that decontamination is enabled for this task."""
        return True

    def doc_to_decontamination_query(self, doc):
        """Return the source and the space-joined query on separate lines."""
        query_text = " ".join(doc["query"])
        return "\n".join([doc["source"], query_text])