"""Download the images listed in the VIST description-in-isolation JSON files.

The script reads the train/val/test ``*.description-in-isolation.json`` files
from ``--json_dir``, collects one URL per image id, skips ids whose JPEG is
already present in ``--img_dir`` and downloads the remainder with up to
``--num_process`` worker processes.
"""
import argparse
import json
import os
from io import BytesIO
from multiprocessing import Process

import requests
from PIL import Image
from tqdm import tqdm

# Annotation files that are scanned for image ids and URLs.
SUBSET_FILES = (
    'train.description-in-isolation.json',
    'val.description-in-isolation.json',
    'test.description-in-isolation.json',
)

# Images whose shorter side exceeds this many pixels are scaled down to it.
SHORT_SIDE = 512

# Seconds to wait on a single HTTP request before giving up on that image,
# so one stalled server cannot hang a worker forever.
REQUEST_TIMEOUT = 30


def _image_path(save_dir, key):
    """Return the path under *save_dir* where image *key* is stored."""
    return os.path.join(save_dir, '{}.jpg'.format(key))


def _resized_dimensions(width, height, short_side=SHORT_SIDE):
    """Return ``(width, height)`` scaled so the shorter side is *short_side*.

    The aspect ratio is preserved; the longer side is truncated to an int.
    """
    if width > height:
        return int(width / (height / short_side)), short_side
    return short_side, int(height / (width / short_side))


def _split_list(items, chunk_size):
    """Split *items* into consecutive slices of at most *chunk_size* items."""
    return [items[start:start + chunk_size]
            for start in range(0, len(items), chunk_size)]


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def download_subprocess(dii, save_dir):
    """Download every image in *dii* and store it as a JPEG in *save_dir*.

    Args:
        dii: list of single-entry dicts mapping an image id to its URL.
        save_dir: directory the ``<id>.jpg`` files are written to.

    Downloading is best-effort: a failure on one image is reported on stdout
    together with its id and URL, and the loop moves on to the next image.
    """
    for entry in tqdm(dii):
        # Read the single (id, url) pair without mutating the caller's dict.
        key, value = next(iter(entry.items()))
        try:
            response = requests.get(value, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert('RGB')
            # Pillow reports size as (width, height).
            width, height = img.size
            if min(width, height) > SHORT_SIDE:
                img = img.resize(_resized_dimensions(width, height))
            img.save(_image_path(save_dir, key))
        except Exception as exc:
            # Per-image boundary: report and continue, but unlike a bare
            # ``except`` let KeyboardInterrupt/SystemExit propagate.
            print(key, value, exc)


def main(args):
    """Collect the image URLs that are still missing and download them.

    Args:
        args: namespace providing ``json_dir`` (directory holding the
            annotation files), ``img_dir`` (output directory; a ``save_dir``
            attribute is honoured if present) and ``num_process`` (maximum
            number of worker processes).
    """
    # The command line defines --img_dir; accept save_dir as well for callers
    # that build the namespace themselves.
    save_dir = getattr(args, 'save_dir', None) or args.img_dir
    os.makedirs(save_dir, exist_ok=True)

    dii = []
    for file_name in SUBSET_FILES:
        subset = _load_json(os.path.join(args.json_dir, file_name))
        for image in subset['images']:
            # Prefer the 'url_o' entry and fall back to 'url_m'.
            url = image['url_o'] if 'url_o' in image else image.get('url_m')
            if url is None:
                print('no url for image: {}'.format(image['id']))
                continue
            dii.append({image['id']: url})

    # Skip images that are already on disk so the script can be re-run.
    dii = [image for image in dii
           if not os.path.exists(_image_path(save_dir, list(image)[0]))]
    print('total images: {}'.format(len(dii)))
    if not dii:
        return

    # Ceiling division keeps the chunk size at least 1 and the number of
    # chunks (and therefore processes) at most num_process.
    num_process = max(1, args.num_process)
    chunk_size = (len(dii) + num_process - 1) // num_process

    process_list = []
    for dii_sub_list in _split_list(dii, chunk_size):
        p = Process(target=download_subprocess,
                    args=(dii_sub_list, save_dir),
                    daemon=True)
        process_list.append(p)
        p.start()
    for p in process_list:
        p.join()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='arguments for vist images downloading')
    parser.add_argument('--json_dir', type=str, required=True,
                        help='dii json file directory')
    parser.add_argument('--img_dir', type=str, required=True,
                        help='images saving directory')
    parser.add_argument('--num_process', type=int, default=32)
    args = parser.parse_args()
    main(args)