"""Build ``full_train_dataset.csv`` from per-dataset training id files.

For every sub-directory of ``./datasets`` (except ``fiqa``) the script reads
``training_ids.jsonl``, keeps at most ``MAX_QUERIES_PER_DATASET`` queries,
expands each query into one row per labelled document id, attaches the
document ``title``/``text`` from the matching ``corpus.jsonl`` and finally
concatenates all datasets into a single CSV file.
"""
import json
import os

import pandas as pd

# Upper bound on the number of training queries kept per dataset.
MAX_QUERIES_PER_DATASET = 1000

# Label names written to the ``label`` column; the ids for each label are
# read from the ``<label>_ids`` field of a training record.
LABELS = ('easy_positive', 'hard_positive', 'hard_negative', 'easy_negative')

# Datasets that are deliberately left out of the combined file.
SKIPPED_DATASETS = {'fiqa'}


def _read_jsonl(path):
    """Return the JSON-lines file at *path* as a DataFrame, one row per record."""
    # The context manager closes the handle deterministically; blank lines
    # (e.g. a trailing newline at end of file) are skipped instead of
    # crashing json.loads.
    with open(path, encoding='utf-8') as handle:
        return pd.DataFrame([json.loads(line) for line in handle if line.strip()])


dfs = []
for fold in os.listdir('./datasets'):
    if fold in SKIPPED_DATASETS:
        continue
    print(fold)
    corpus = _read_jsonl(f"../beir_data/{fold}/corpus.jsonl")
    df = _read_jsonl(f"./datasets/{fold}/training_ids.jsonl")

    # Compare the number of ROWS with the cap. ``df.size`` is rows * columns,
    # which triggered sampling far too early, and sampling with replacement
    # then duplicated queries; sample without replacement instead.
    if len(df) > MAX_QUERIES_PER_DATASET:
        df = df.sample(MAX_QUERIES_PER_DATASET)

    # Flatten: one output row per (query, document id, label).
    rows = []
    for _, row in df.iterrows():
        query_id = row['query_id']
        query_text = row['query_text']
        for label in LABELS:
            for did in row[f"{label}_ids"]:
                rows.append({
                    "query_id": query_id,
                    "query_text": query_text,
                    'doc_id': did,
                    'label': label,
                })

    # Explicit columns keep the merge key present even when ``rows`` is empty.
    df = pd.DataFrame(rows, columns=['query_id', 'query_text', 'doc_id', 'label'])
    # Default (inner) merge: document ids absent from the corpus are dropped.
    df = pd.merge(df, corpus[['_id', 'title', 'text']], left_on='doc_id', right_on='_id')
    df['dataset'] = fold
    dfs.append(df)

pd.concat(dfs).to_csv('full_train_dataset.csv')