File size: 2,267 Bytes
b2b0303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
import requests
from io import BytesIO
from PIL import Image
from tqdm import tqdm
from multiprocessing import Process
import os
import argparse


def download_subprocess(dii, save_dir):
    """Download every image listed in ``dii`` and save it as a JPEG.

    Args:
        dii: List of single-entry dicts, each mapping an image id to the URL
            to fetch. Each dict is emptied as it is consumed (``popitem``).
        save_dir: Directory that receives the ``<id>.jpg`` files.

    Images whose shorter side is larger than 512 pixels are scaled down so
    that the shorter side becomes 512 (the longer side is scaled by the same
    factor and truncated to an int). Any failure for a single image is
    reported by printing its id and URL, and the loop moves on to the next
    image.
    """
    for image in tqdm(dii):
        key, value = image.popitem()
        try:
            # A timeout keeps one stalled connection from blocking this
            # worker forever (requests waits indefinitely by default).
            response = requests.get(value, timeout=30)
            # Reject HTTP error pages before trying to decode them as images.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert('RGB')
            width, height = img.size  # PIL reports size as (width, height)
            if min(width, height) > 512:
                if width > height:
                    new_size = (int(width / (height / 512)), 512)
                else:
                    new_size = (512, int(height / (width / 512)))
                img = img.resize(new_size)
            img.save('{}/{}.jpg'.format(save_dir, key))
        except Exception:
            # Best-effort download: report the failed item and continue.
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit
            # still stop the worker.
            print(key, value)


def main(args):
    """Collect the DII image URLs and download the missing ones in parallel.

    Args:
        args: Parsed command-line namespace. Reads ``json_dir`` (directory
            holding the train/val/test ``*.description-in-isolation.json``
            files), ``num_process`` (upper bound on the number of worker
            processes) and the output directory, taken from ``save_dir``
            when that attribute is set and from ``img_dir`` otherwise.

    Images that already exist as ``<id>.jpg`` in the output directory are
    skipped, so the script can be re-run to resume a partial download.
    """
    # The command line defines ``--img_dir``; ``save_dir`` is honoured too so
    # callers that build the namespace with that name keep working.
    save_dir = getattr(args, 'save_dir', None) or args.img_dir
    os.makedirs(save_dir, exist_ok=True)

    dii = []
    for split in ('train', 'val', 'test'):
        json_path = os.path.join(
            args.json_dir, '{}.description-in-isolation.json'.format(split))
        with open(json_path) as f:
            subset = json.load(f)
        for image in subset["images"]:
            # Prefer the original-size URL, fall back to the medium one.
            try:
                url = image['url_o']
            except KeyError:
                url = image['url_m']
            dii.append({image['id']: url})

    dii = [
        image for image in dii
        if not os.path.exists('{}/{}.jpg'.format(save_dir, next(iter(image))))
    ]
    print('total images: {}'.format(len(dii)))
    if not dii:
        return

    # Ceiling division gives at most ``num_process`` chunks and never a zero
    # chunk size (which would make ``range`` raise ValueError).
    num_process = max(1, args.num_process)
    chunk_size = max(1, -(-len(dii) // num_process))
    chunks = [dii[start:start + chunk_size]
              for start in range(0, len(dii), chunk_size)]

    process_list = []
    for chunk in chunks:
        p = Process(target=download_subprocess, args=(chunk, save_dir))
        # Must be set before start(); the attribute name is lowercase.
        p.daemon = True
        p.start()
        process_list.append(p)
    for p in process_list:
        p.join()


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    cli = argparse.ArgumentParser(
        description='arguments for vist images downloading',
    )
    cli.add_argument(
        '--json_dir',
        type=str,
        required=True,
        help='dii json file directory',
    )
    cli.add_argument(
        '--img_dir',
        type=str,
        required=True,
        help='images saving directory',
    )
    cli.add_argument(
        '--num_process',
        type=int,
        default=32,
    )
    args = cli.parse_args()
    main(args)