File size: 2,267 Bytes
b2b0303 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import json
import requests
from io import BytesIO
from PIL import Image
from tqdm import tqdm
from multiprocessing import Process
import os
import argparse
def download_subprocess(dii, save_dir):
for image in tqdm(dii):
key, value = image.popitem()
try:
img_data = requests.get(value).content
img = Image.open(BytesIO(img_data)).convert('RGB')
h = img.size[0]
w = img.size[1]
if min(h, w) > 512:
img = img.resize((int(h / (w / 512)), 512) if h > w else (512, int(w / (h / 512))))
img.save('{}/{}.jpg'.format(save_dir, key))
except:
print(key, value)
def main(args):
train_data = json.load(open(os.path.join(args.json_dir, 'train.description-in-isolation.json')))
val_data = json.load(open(os.path.join(args.json_dir, 'val.description-in-isolation.json')))
test_data = json.load(open(os.path.join(args.json_dir, 'test.description-in-isolation.json')))
dii = []
for subset in [train_data, val_data, test_data]:
for image in subset["images"]:
try:
dii.append({image['id']: image['url_o']})
except:
dii.append({image['id']: image['url_m']})
dii = [image for image in dii if not os.path.exists('{}/{}.jpg'.format(args.save_dir, list(image)[0]))]
print('total images: {}'.format(len(dii)))
def splitlist(inlist, chunksize):
return [inlist[x:x + chunksize] for x in range(0, len(inlist), chunksize)]
dii_splitted = splitlist(dii, int((len(dii) / args.num_process)))
process_list = []
for dii_sub_list in dii_splitted:
p = Process(target=download_subprocess, args=(dii_sub_list,))
process_list.append(p)
p.Daemon = True
p.start()
for p in process_list:
p.join()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='arguments for vist images downloading')
parser.add_argument('--json_dir', type=str, required=True, help='dii json file directory')
parser.add_argument('--img_dir', type=str, required=True, help='images saving directory')
parser.add_argument('--num_process', type=int, default=32)
args = parser.parse_args()
main(args)
|