|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
def construct_negatives(input_file, output_file, num_passages, num_negatives):
    """Write one line of randomly sampled negative passage ids per qrels row.

    ``input_file`` is read as a header-less, tab-separated table. Column 0 is
    used as the query id and column 2 as the relevant (positive) passage id.
    For every row, ``num_negatives`` ids are drawn with ``np.random.randint``
    from ``[0, num_passages)`` and written to ``output_file`` as::

        query_id <TAB> rel_passage_id <TAB> neg_1 <TAB> ... <TAB> neg_k

    A drawn id that equals the row's relevant passage id is re-drawn, so the
    positive never appears among its own negatives. The draws are otherwise
    independent, so the same negative id may occur more than once in a row.

    Args:
        input_file: Path to the tab-separated qrels file.
        output_file: Path of the file to create (overwritten if it exists).
        num_passages: Exclusive upper bound of the passage id range.
        num_negatives: Number of negative ids to emit per row.
    """
    qrels = pd.read_csv(input_file, delimiter="\t", header=None)

    # With at most one candidate id there is nothing else to draw, so
    # re-drawing a collision could never terminate; keep the raw draw then.
    can_redraw = num_passages > 1

    with open(output_file, "w") as f:
        for query_id, rel_passage_id in zip(qrels[0], qrels[2]):
            negatives = np.random.randint(num_passages, size=num_negatives).tolist()

            # Collisions are rare for realistic id ranges, so test membership
            # once before walking the list.
            if can_redraw and rel_passage_id in negatives:
                for pos, candidate in enumerate(negatives):
                    while candidate == rel_passage_id:
                        candidate = int(np.random.randint(num_passages))
                    negatives[pos] = candidate

            output_ids = [query_id, rel_passage_id, *negatives]
            print("\t".join(str(id_) for id_ in output_ids), file=f)
|
|
|
|
|
def main():
    """Parse the command-line options and build negatives for both splits.

    For each of the "train" and "dev" splits, reads
    ``<data>/qrels.<split>.tsv`` and writes
    ``<data>/query2passages.<split>.tsv`` via ``construct_negatives``.
    """
    cli = argparse.ArgumentParser(description="Negative passages construction")

    # (flag, type, default, help) for every supported option.
    option_specs = (
        ("--data", str, "msmarco_dataset", "path to folder with data"),
        ("--num_passages", int, 8841823, "total number of passages"),
        ("--num_negatives", int, 10, "number of negatives per positive"),
    )
    for flag, kind, default, help_text in option_specs:
        cli.add_argument(flag, type=kind, default=default, help=help_text)

    opts = cli.parse_args()

    for split in ("train", "dev"):
        qrels_path = "{}/qrels.{}.tsv".format(opts.data, split)
        negatives_path = "{}/query2passages.{}.tsv".format(opts.data, split)
        construct_negatives(
            input_file=qrels_path,
            output_file=negatives_path,
            num_passages=opts.num_passages,
            num_negatives=opts.num_negatives,
        )
|
|
|
|
|
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|
|