File size: 2,056 Bytes
a638e43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import h5py
import lmdb
import numpy as np
import msgpack
from utils.basic_utils import load_json, save_json
from tqdm import tqdm
import os

# Build a filtered copy of one TVR-Ranking split:
#   1. load the query annotations from ``data_path``;
#   2. keep only the queries whose ``query_id`` has an entry in the
#      consolidated video-retrieval LMDB;
#   3. copy those entries into a new LMDB at ``new_vr_path``;
#   4. save the surviving annotations to ``new_data_path``.
#
# To process another split, switch the commented alternatives for
# ``data_path``, ``new_data_path`` and ``new_vr_path`` together.

# Source annotations: an iterable of records, each carrying a "query_id".
data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/train_top40.json"
# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/val.json"
# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/test.json"
old_data = load_json(data_path)

# Destination JSON for the filtered annotations.
new_data_path = "./data/TVR_Ranking_CONQUER/train_top40.json"
# new_data_path = "./data/TVR_Ranking_CONQUER/val.json"
# new_data_path = "./data/TVR_Ranking_CONQUER/test.json"

# Destination LMDB directory (for writing).
new_vr_path = "data/TVR_Ranking_train_top100_hero"
# new_vr_path = "data/TVR_Ranking_val_top100_hero"
# new_vr_path = "data/TVR_Ranking_test_top100_hero"

# Create every output location up front, so a missing directory cannot make
# save_json fail after the whole copy has already been committed.
os.makedirs(new_vr_path, exist_ok=True)
os.makedirs(os.path.dirname(new_data_path) or ".", exist_ok=True)

# Source LMDB holding the retrieval results for all queries (read-only).
consolidated_path = "/home/renjie.liang/datasets/tvr_feature_release/data/consolidated_vr_results"

clean_data = []

# The environments and transactions are context managers: both environments
# are closed even if the copy fails, and the write transaction is committed
# only when the loop finishes without an exception.
with lmdb.open(
    consolidated_path,
    readonly=True,
    create=False,
    max_readers=4096 * 8,
    readahead=False,
) as vr_pool, lmdb.open(
    new_vr_path,
    readonly=False,
    create=True,
    max_dbs=0,
    map_size=10 * 1024**3,  # 10 GiB
) as new_vr_pool:
    with vr_pool.begin(buffers=True) as vr_txn, \
            new_vr_pool.begin(write=True) as new_vr_txn:
        for query in tqdm(old_data):
            key = str(query["query_id"]).encode()
            vr_data = vr_txn.get(key)
            if vr_data is None:
                # No retrieval result for this query: drop it from the split.
                continue
            clean_data.append(query)
            # Decode and re-encode the payload. This rejects values that are
            # not valid msgpack and yields a bytes object that is independent
            # of the source transaction's buffer.
            vr_res = msgpack.loads(vr_data)
            new_vr_txn.put(key, msgpack.dumps(vr_res))

# Persist the annotations that survived the filter and report the shrinkage.
save_json(clean_data, new_data_path)
print(len(old_data), "->", len(clean_data))