|
import os |
|
from csv import reader |
|
import wget |
|
from multiprocessing import Pool, Value |
|
|
|
# Process-shared counters ('i' = signed C int) for successful (dl_count)
# and failed (wl_count) downloads, updated by Pool worker processes.
dl_count = Value('i', 0)

wl_count = Value('i', 0)

# Root directory images are saved under, one sub-directory per URL.
# NOTE(review): absolute path "/SBU/dataset" — confirm it matches the
# deployment's filesystem layout.
data_source = "/SBU/dataset"
|
|
|
|
|
|
|
def image_dl(info):
    """Download one image URL into ``data_source/<sub_dir>/<file_name>``.

    The last two path components of *info* determine the destination
    sub-directory and file name.  Updates the process-shared counters
    ``dl_count`` (successes / cache hits) and ``wl_count`` (failures).

    Returns 1 when the image is already present on disk; otherwise
    returns None after attempting the download.
    """
    global dl_count, wl_count

    # Total expected number of URLs; used only in the progress message.
    row_count = 1000000

    parts = info.split('/')
    sub_dir = parts[-2]
    file_name = parts[-1]
    image_dir = os.path.join(data_source, sub_dir)
    image_path = os.path.join(image_dir, file_name)

    # Skip images already on disk (image_path existing implies image_dir
    # exists, so a single check suffices).
    if os.path.exists(image_path):
        with dl_count.get_lock():
            dl_count.value += 1
        return 1

    # exist_ok=True avoids the check-then-create race where several
    # worker processes create the same sub-directory at once (a bare
    # os.mkdir would raise FileExistsError in the losing workers).
    os.makedirs(image_dir, exist_ok=True)

    try:
        wget.download(info, out=image_path)
        with dl_count.get_lock():
            dl_count.value += 1
    except IOError:
        # urllib's URLError/HTTPError are OSError (IOError) subclasses,
        # so network failures land here.  The original built this
        # message but never emitted it; print it so failures are visible.
        print("image {} not found".format(info))
        with wl_count.get_lock():
            wl_count.value += 1

    # Periodic progress report.  The unlocked reads may be slightly
    # stale across processes, which is acceptable for logging.
    if dl_count.value % 1000 == 0:
        print("\n")
        msg2 = "correct:{} wrong: {}, {}/{} finished".format(
            dl_count.value, wl_count.value,
            dl_count.value + wl_count.value, row_count)
        print(msg2)
|
|
|
|
|
|
|
|
|
|
|
urls = [] |
|
with open('/mnt/aiops/common/wangjp/SBU/SBU_captioned_photo_dataset_urls.txt', 'r') as fh: |
|
for line in fh: |
|
url = line.rstrip() |
|
urls.append(url) |
|
print("{} imgs to be downloaded".format(len(urls))) |
|
|
|
num_processes = 16 |
|
pool = Pool(num_processes) |
|
pool.map(image_dl, tuple(list(urls))) |