File size: 998 Bytes
cae25d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import json,pandas as pd,os
# Columns of training_ids.jsonl that hold lists of document ids, in the order
# their rows are emitted for each query.
LABEL_COLUMNS = ('easy_positive_ids', 'hard_positive_ids', 'hard_negative_ids', 'easy_negative_ids')
# Suffix removed from a column name to obtain the value stored in 'label'.
LABEL_SUFFIX = '_ids'
# Upper bound on the number of queries kept per dataset.
MAX_QUERIES_PER_DATASET = 1000
# Dataset folders that are excluded from the combined file.
SKIPPED_DATASETS = ('fiqa',)


def _read_jsonl(path):
    """Load a JSON-lines file into a DataFrame with one row per non-blank line."""
    # The context manager closes the handle; the original left it open.
    with open(path, encoding="utf-8") as handle:
        return pd.DataFrame([json.loads(line) for line in handle if line.strip()])


def _label_name(column):
    """Return *column* without its trailing '_ids' suffix.

    Slicing is used instead of ``str.rstrip('_ids')`` because ``rstrip``
    removes any trailing run of the characters '_', 'i', 'd', 's' rather than
    the literal suffix, which would mangle other column names.
    """
    if column.endswith(LABEL_SUFFIX):
        return column[:-len(LABEL_SUFFIX)]
    return column


def _expand_query_labels(queries):
    """Return one row per (query, document id, label) found in *queries*.

    Each row of *queries* is read for 'query_id', 'query_text' and the four
    LABEL_COLUMNS, each of which is iterated as a collection of document ids.
    The result always has the columns query_id, query_text, doc_id and label,
    even when there is nothing to emit, so a later merge on 'doc_id' works.
    """
    rows = []
    for _, query in queries.iterrows():
        query_id = query['query_id']
        query_text = query['query_text']
        for column in LABEL_COLUMNS:
            label = _label_name(column)
            for doc_id in query[column]:
                rows.append({"query_id": query_id, "query_text": query_text, 'doc_id': doc_id, 'label': label})
    # Built once, after the loop: the original rebuilt the frame on every row.
    return pd.DataFrame(rows, columns=['query_id', 'query_text', 'doc_id', 'label'])


def _build_dataset_frame(fold):
    """Build the labelled (query, document) table for the dataset folder *fold*.

    Reads ../beir_data/<fold>/corpus.jsonl and
    ./datasets/<fold>/training_ids.jsonl, keeps at most
    MAX_QUERIES_PER_DATASET queries, and joins every labelled document id to
    the corpus entry with the same '_id' to attach its title and text.
    """
    corpus = _read_jsonl(f"../beir_data/{fold}/corpus.jsonl")
    queries = _read_jsonl(f"./datasets/{fold}/training_ids.jsonl")
    # len() counts rows; DataFrame.size counts cells (rows * columns), which
    # made the original trigger far below 1000 queries. Sampling is without
    # replacement so the cap never duplicates a query.
    # NOTE: the sample is unseeded, so the selection differs between runs.
    if len(queries) > MAX_QUERIES_PER_DATASET:
        queries = queries.sample(MAX_QUERIES_PER_DATASET)
    pairs = _expand_query_labels(queries)
    # Default inner join: document ids missing from the corpus are dropped.
    merged = pd.merge(pairs, corpus[['_id', 'title', 'text']], left_on='doc_id', right_on='_id')
    merged['dataset'] = fold
    return merged


def main():
    """Combine every dataset under ./datasets into full_train_dataset.csv.

    Each sub-directory of ./datasets (except those in SKIPPED_DATASETS) is
    turned into a table of labelled query/document pairs; the tables are
    concatenated and written to full_train_dataset.csv in the current
    directory. Nothing is written when no dataset is found.
    """
    dfs = []
    for fold in os.listdir('./datasets'):
        if fold in SKIPPED_DATASETS:
            continue
        # Ignore stray files (e.g. editor or OS metadata) next to the folders.
        if not os.path.isdir(os.path.join('./datasets', fold)):
            continue
        print(fold)
        dfs.append(_build_dataset_frame(fold))

    # Written once at the end instead of after every dataset; pd.concat
    # raises on an empty list, hence the guard.
    if dfs:
        pd.concat(dfs).to_csv('full_train_dataset.csv')


if __name__ == "__main__":
    main()