train-mbed / train_datasets_creation /make_dataset_for_train.py
amos1088's picture
no
cae25d0
raw
history blame contribute delete
998 Bytes
import json,pandas as pd,os
# Build one flat (query, document, label) training table from every dataset
# folder found under ./datasets and write it to full_train_dataset.csv.
#
# For each dataset folder <fold> the script reads:
#   ../beir_data/<fold>/corpus.jsonl       - documents ('_id', 'title', 'text')
#   ./datasets/<fold>/training_ids.jsonl   - one query per line, with four
#                                            lists of document ids
# and emits one output row per (query, document id) pair.

# Upper bound on the number of queries kept per dataset.
MAX_QUERIES_PER_DATASET = 1000

# Columns of training_ids.jsonl that hold lists of document ids.
LABEL_COLUMNS = ['easy_positive_ids', 'hard_positive_ids',
                 'hard_negative_ids', 'easy_negative_ids']
ID_SUFFIX = '_ids'


def _read_jsonl(path):
    """Load a JSON-lines file into a DataFrame, one row per non-blank line."""
    # The context manager closes the handle; the original leaked one file
    # object per dataset.  JSON text is UTF-8, so do not rely on the locale.
    with open(path, encoding="utf-8") as handle:
        return pd.DataFrame([json.loads(line) for line in handle if line.strip()])


def _label_name(column):
    """Return the label for an id-list column: the name without '_ids'."""
    # str.rstrip('_ids') removes any trailing run of the characters
    # '_', 'i', 'd', 's' rather than the literal suffix; it only worked by
    # luck for the current column names.  Remove the suffix explicitly.
    if column.endswith(ID_SUFFIX):
        return column[:-len(ID_SUFFIX)]
    return column


def _explode_queries(queries):
    """Turn each query row into one dict per (query, document id, label)."""
    rows = []
    for _, row in queries.iterrows():
        query_id = row['query_id']
        query_text = row['query_text']
        for column in LABEL_COLUMNS:
            label = _label_name(column)
            for did in row[column]:
                rows.append({"query_id": query_id, "query_text": query_text,
                             'doc_id': did, 'label': label})
    return rows


dfs = []
for fold in os.listdir('./datasets'):
    # 'fiqa' is deliberately excluded (the reason is not stated in this file).
    if fold == 'fiqa':
        continue
    # Ignore stray files (e.g. .DS_Store); only directories are datasets.
    if not os.path.isdir(os.path.join('./datasets', fold)):
        continue
    print(fold)
    corpus = _read_jsonl(f"../beir_data/{fold}/corpus.jsonl")
    df = _read_jsonl(f"./datasets/{fold}/training_ids.jsonl")
    # Cap the number of *queries*.  DataFrame.size is rows * columns, so the
    # original `df.size > 1000` fired for much smaller files and then
    # upsampled them with duplicates via replace=True.
    if len(df) > MAX_QUERIES_PER_DATASET:
        df = df.sample(MAX_QUERIES_PER_DATASET)
    rows = _explode_queries(df)
    if not rows:
        # Without rows there is no 'doc_id' column and the merge would raise.
        print(f"{fold}: no training ids found, skipping")
        continue
    df = pd.DataFrame(rows)
    # Inner join: pairs whose doc_id is absent from the corpus are dropped.
    df = pd.merge(df, corpus[['_id', 'title', 'text']], left_on='doc_id', right_on='_id')
    df['dataset'] = fold
    dfs.append(df)
# The index column is still written, as before, so the CSV layout is unchanged.
pd.concat(dfs).to_csv('full_train_dataset.csv')