Spaces:
Paused
Paused
"""Build ``full_train_dataset.csv`` from per-dataset training id files.

For every directory entry under ``./datasets`` (except the skipped ones) the
script reads ``training_ids.jsonl``, keeps at most ``MAX_QUERIES`` queries,
turns each (query, document id, label) combination into one row, attaches the
document ``title``/``text`` from the matching ``corpus.jsonl`` and finally
concatenates everything into a single CSV file.
"""
import json
import os

import pandas as pd

DATASETS_DIR = './datasets'
CORPUS_DIR = '../beir_data'
OUTPUT_CSV = 'full_train_dataset.csv'
SKIPPED_DATASETS = ('fiqa',)
MAX_QUERIES = 1000

# Source column -> value written to the ``label`` column.  Spelled out
# explicitly because ``str.rstrip('_ids')`` strips a *set of characters*, not
# a suffix, and only produced these values by coincidence.
LABEL_COLUMNS = {
    'easy_positive_ids': 'easy_positive',
    'hard_positive_ids': 'hard_positive',
    'hard_negative_ids': 'hard_negative',
    'easy_negative_ids': 'easy_negative',
}

ROW_COLUMNS = ['query_id', 'query_text', 'doc_id', 'label']


def read_jsonl(path):
    """Load a JSON-lines file into a DataFrame (one JSON object per line).

    Blank lines are ignored.  The file is closed as soon as it has been read.
    """
    with open(path, encoding='utf-8') as handle:
        return pd.DataFrame(
            [json.loads(line) for line in handle if line.strip()]
        )


def cap_queries(df, limit=MAX_QUERIES):
    """Return ``df`` unchanged, or a random ``limit``-row subset if larger.

    The row count (``len``) is compared, not ``DataFrame.size`` (rows times
    columns), and rows are drawn without replacement, so a query is never
    duplicated and a small frame is never inflated to ``limit`` rows.
    """
    if len(df) > limit:
        return df.sample(n=limit)
    return df


def expand_labels(df):
    """Explode the per-label id lists of ``df`` into one row per document.

    Each input row must provide ``query_id``, ``query_text`` and an iterable
    of document ids under every key of ``LABEL_COLUMNS``.  The result has the
    columns ``query_id``, ``query_text``, ``doc_id`` and ``label``; they are
    present even when no row is produced, so a later merge on ``doc_id``
    keeps working.
    """
    rows = []
    for _, row in df.iterrows():
        query_id = row['query_id']
        query_text = row['query_text']
        for column, label in LABEL_COLUMNS.items():
            for doc_id in row[column]:
                rows.append({
                    'query_id': query_id,
                    'query_text': query_text,
                    'doc_id': doc_id,
                    'label': label,
                })
    return pd.DataFrame(rows, columns=ROW_COLUMNS)


def build_dataset_frame(fold):
    """Build the labelled (query, document) table for one dataset folder."""
    corpus = read_jsonl(f"{CORPUS_DIR}/{fold}/corpus.jsonl")
    queries = read_jsonl(f"{DATASETS_DIR}/{fold}/training_ids.jsonl")
    pairs = expand_labels(cap_queries(queries))
    # Inner join: document ids that are missing from the corpus are dropped.
    merged = pd.merge(
        pairs,
        corpus[['_id', 'title', 'text']],
        left_on='doc_id',
        right_on='_id',
    )
    merged['dataset'] = fold
    return merged


def main():
    """Process every dataset folder and write the combined CSV."""
    frames = []
    for fold in os.listdir(DATASETS_DIR):
        if fold in SKIPPED_DATASETS:
            continue
        print(fold)
        frames.append(build_dataset_frame(fold))
    if not frames:
        # pd.concat([]) would fail with an unhelpful "No objects to
        # concatenate" error; say what is actually wrong instead.
        raise SystemExit(f"no datasets to process under {DATASETS_DIR}")
    # The index column is still written (it restarts at 0 for each dataset)
    # so the output layout matches what the script has always produced.
    pd.concat(frames).to_csv(OUTPUT_CSV)


if __name__ == "__main__":
    main()