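# NOTE(review): the first lines of this file originally read "Spaces: Paused Paused" -- apparently
# page-capture residue rather than part of the script; kept here only as a comment.
"""
This script reads DPR retriever training data and parses each datapoint. We save one line per datapoint
in each of two output files: the evaluation set file receives the query, and the gold data file receives
a tab-separated list of Wikipedia page titles constituting positive contexts for that query.
"""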
import argparse | |
import json | |
from tqdm import tqdm | |
def main():
    """Split DPR retriever data into a question file and a gold-titles file.

    Reads the JSON file given by ``--src_path`` and, for every record in it,
    writes the record's ``question`` as one line of ``--evaluation_set`` and
    the tab-joined ``title`` values of its ``positive_ctxs`` as the matching
    line of ``--gold_data_path``. Records are written in source order, so
    line *i* of both outputs refers to the same record.

    Raises:
        SystemExit: raised by argparse when ``--evaluation_set`` or
            ``--gold_data_path`` is not supplied.
        OSError: if any of the three files cannot be opened.
        json.JSONDecodeError: if the source file is not valid JSON.
        KeyError: if a record lacks ``question`` or ``positive_ctxs``, or a
            positive context lacks ``title``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--src_path",
        type=str,
        default="biencoder-nq-dev.json",
        help="Path to raw DPR training data",
    )
    # Both output paths are passed straight to open() and have no sensible
    # default, so let argparse reject a missing one with a usage message
    # instead of failing later with a TypeError on open(None, "w").
    parser.add_argument(
        "--evaluation_set",
        type=str,
        required=True,
        help="where to store parsed evaluation_set file",
    )
    parser.add_argument(
        "--gold_data_path",
        type=str,
        required=True,
        help="where to store parsed gold_data_path file",
    )
    args = parser.parse_args()

    # Parse the source completely before touching the outputs: opening a file
    # with "w" truncates it, and an unreadable or malformed source should not
    # wipe out results from a previous run.
    # The encoding is explicit so that non-ASCII text does not depend on the
    # platform's default locale encoding.
    with open(args.src_path, "r", encoding="utf-8") as src_file:
        dpr_records = json.load(src_file)

    with open(args.evaluation_set, "w", encoding="utf-8") as eval_file, open(
        args.gold_data_path, "w", encoding="utf-8"
    ) as gold_file:
        for dpr_record in tqdm(dpr_records):
            question = dpr_record["question"]
            contexts = [context["title"] for context in dpr_record["positive_ctxs"]]
            # One line per record in each file keeps the two outputs aligned.
            eval_file.write(question + "\n")
            gold_file.write("\t".join(contexts) + "\n")
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()