# Web-page header text captured along with this script (not Python code):
#   sail / PTP
#   Model card | Files and versions | Community
#   File size: 1,678 Bytes
#   457d89d

import os
from csv import reader
import wget
from multiprocessing import Pool, Value

# Integer tallies guarded by their own locks: image_dl bumps dl_count for
# every image that is present/downloaded and wl_count for every failed one.
# They are created at import time, before the worker Pool is started.
dl_count, wl_count = Value("i", 0), Value("i", 0)

# Root directory under which images are stored as <sub_dir>/<file_name>.
data_source = "/SBU/dataset"

# Presumably the final progress line of an earlier full run:
# correct:851000 wrong: 148957, 999957/1000000 finished

def image_dl(info):
    """Download one image URL into ``data_source/<sub_dir>/<file_name>``.

    The last two ``/``-separated components of the URL are used as the
    sub-directory and the file name. The module-level counters ``dl_count``
    (image present or downloaded) and ``wl_count`` (failed) are updated
    under their locks. A progress line is printed each time a fresh
    download brings the success count to a multiple of 1000.

    Args:
        info: Image URL as a string.

    Returns:
        1 when the image was already on disk and the download was skipped,
        otherwise None (both for a fresh download and for a failure).
    """
    row_count = 1000000  # expected total; only used in the progress message

    parts = info.split('/')
    if len(parts) < 2:
        # Blank or slash-less entry: there is no sub-directory component.
        # Count it as failed instead of raising IndexError, which would
        # abort the whole pool.map() run.
        with wl_count.get_lock():
            wl_count.value += 1
        return None
    sub_dir, file_name = parts[-2], parts[-1]
    image_dir = os.path.join(data_source, sub_dir)
    image_path = os.path.join(image_dir, file_name)

    # Already fetched by an earlier run: count it and skip the download.
    if os.path.exists(image_path):
        with dl_count.get_lock():
            dl_count.value += 1
        return 1

    # exist_ok avoids the check-then-create race between workers that share
    # a sub-directory; makedirs also creates a missing data_source root.
    os.makedirs(image_dir, exist_ok=True)

    try:
        wget.download(info, out=image_path)
    except Exception:
        # Worker boundary: any failure for a single URL (HTTP error, bad URL,
        # broken connection, ...) is recorded as "wrong" so that one bad
        # entry cannot terminate the whole download job.
        with wl_count.get_lock():
            wl_count.value += 1
        return None

    # Capture the post-increment value inside the lock so that each multiple
    # of 1000 is reported once, by the worker that reached it.
    with dl_count.get_lock():
        dl_count.value += 1
        downloaded = dl_count.value

    if downloaded % 1000 == 0:
        failed = wl_count.value
        print("\n")
        print("correct:{} wrong: {}, {}/{} finished".format(
            downloaded, failed, downloaded + failed, row_count))
    return None


# train / val dataset
# The driver is guarded so that worker processes which re-import this module
# (spawn / forkserver start methods) do not re-read the list or start pools.
if __name__ == "__main__":
    with open('/mnt/aiops/common/wangjp/SBU/SBU_captioned_photo_dataset_urls.txt', 'r') as fh:
        # One URL per line; blank lines are dropped so that image_dl never
        # receives an empty string.
        urls = [url for url in (line.rstrip() for line in fh) if url]
    print("{} imgs to be downloaded".format(len(urls)))

    num_processes = 16
    # The context manager releases the worker processes once map() returns.
    with Pool(num_processes) as pool:
        pool.map(image_dl, urls)