File size: 1,130 Bytes
5282eae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
import shutil
import glob
import random
from pprint import pprint

# Root data directory; the COCO ("karpathy_coco_wds_full_ground") and
# VG ("vg_wds_full_ground") tar directories are looked up directly under it.
DIR_COCO_VG = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw"
# Sub-directory of the root that holds the CCS and LAION tar directories.
DIR = f"{DIR_COCO_VG}/blip2_pretraining"
# Directory that receives the numbered symlinks to all selected tar files.
OUT_DIR = f"{DIR_COCO_VG}/blip2_all_data_ground"


def collect_tars(sources, seed=None):
    """Return a shuffled list of ``*.tar`` paths gathered from *sources*.

    Args:
        sources: Iterable of ``(directory, repeat)`` pairs. Every ``*.tar``
            file found directly inside ``directory`` is added ``repeat``
            times, so a source with ``repeat > 1`` is over-represented in
            the result.
        seed: Optional seed for the shuffle. ``None`` (the default) gives a
            different order on every call; any other value makes the order
            reproducible.

    Returns:
        A new list of tar paths in shuffled order (duplicates included).
    """
    tars = []
    for directory, repeat in sources:
        # glob returns entries in filesystem order; sort so that a fixed
        # seed always shuffles the same starting sequence.
        shards = sorted(glob.glob(os.path.join(directory, "*.tar")))
        tars.extend(shards * repeat)
    random.Random(seed).shuffle(tars)
    return tars


def link_tars(tars, out_dir):
    """Create numbered symlinks ``000000.tar``, ``000001.tar``, ... in *out_dir*.

    The i-th link points at the absolute path of ``tars[i]``.

    Args:
        tars: Sequence of tar file paths; may contain duplicates.
        out_dir: Existing directory in which the links are created.

    Returns:
        The list of link paths, in the same order as *tars*.

    Raises:
        FileExistsError: If any of the link names already exists in
            *out_dir*. The check happens before any link is created, so a
            collision never leaves the directory half-populated.
    """
    links = [os.path.join(out_dir, f"{i:06d}.tar") for i in range(len(tars))]
    # lexists (not exists) so that dangling symlinks are detected as well;
    # os.symlink would fail on those too.
    clashes = [link for link in links if os.path.lexists(link)]
    if clashes:
        raise FileExistsError(
            f"{len(clashes)} link name(s) already exist in {out_dir!r}, "
            f"e.g. {clashes[0]!r}; clear the directory before re-running"
        )
    for tar, link in zip(tars, links):
        # A relative target would be resolved relative to the link's own
        # directory, so always link to the absolute path.
        os.symlink(os.path.abspath(tar), link)
    return links


def main(coco_repeat=5, seed=None):
    """Symlink the CCS, COCO, VG and LAION tar files into ``OUT_DIR``.

    Args:
        coco_repeat: How many times each COCO tar is included in the mix.
        seed: Optional shuffle seed, forwarded to :func:`collect_tars`.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    sources = [
        (os.path.join(DIR, "ccs_synthetic_filtered_large_ground"), 1),
        (os.path.join(DIR_COCO_VG, "karpathy_coco_wds_full_ground"), coco_repeat),
        (os.path.join(DIR_COCO_VG, "vg_wds_full_ground"), 1),
        (os.path.join(DIR, "laion_synthetic_filtered_large", "all_ground"), 1),
    ]
    tars = collect_tars(sources, seed=seed)
    # Quick sanity output: total count and a sample of the shuffled order.
    print(len(tars))
    pprint(tars[:20])
    link_tars(tars, OUT_DIR)


if __name__ == "__main__":
    main()