Awiny committed on
Commit
1432af7
1 Parent(s): 2a4130d

Upload download_cc3m.py

Browse files
Files changed (1) hide show
  1. download_cc3m.py +157 -0
download_cc3m.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import requests
4
+ import zlib
5
+ import os
6
+ import shelve
7
+ import magic #pip install python-magic
8
+ from multiprocessing import Pool
9
+ from tqdm import tqdm
10
+
11
+ # headers = {
12
+ # #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
13
+ # 'User-Agent':'Googlebot-Image/1.0', # Pretend to be googlebot
14
+ # 'X-Forwarded-For': '64.18.15.200'
15
+ # }
16
+
17
+ def _df_split_apply(tup_arg):
18
+ split_ind, subset, func = tup_arg
19
+ r = subset.apply(func, axis=1)
20
+ return (split_ind, r)
21
+
22
def df_multiprocess(df, processes, chunk_size, func, dataset_name):
    """Apply `func` to every row of `df` across a process pool, persisting
    per-chunk results in a shelve file so an interrupted run can resume.

    df: DataFrame of rows to process (one image per row).
    processes: number of worker processes in the Pool.
    chunk_size: rows per chunk; NOTE changing it changes the shelve
        filename and therefore resets resume progress.
    func: row-wise function applied with axis=1 (e.g. download_image).
    dataset_name: prefix for the shelve results file.

    Returns None; results stay in '<dataset>_<func>_<chunk>_results.tmp'
    for df_from_shelve to reassemble.
    """
    print("Generating parts...")
    # Shelve is keyed by stringified chunk index; it doubles as the resume log.
    with shelve.open('%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)) as results:

        pbar = tqdm(total=len(df), position=0)
        # Resume: collect indices of chunks already completed in a prior run
        # and credit their rows to the progress bar up front.
        finished_chunks = set([int(k) for k in results.keys()])
        pbar.desc = "Resuming"
        for k in results.keys():
            pbar.update(len(results[str(k)][1]))

        # Lazily generate (chunk_index, chunk, func) work items, skipping
        # chunks that are already in the shelve.
        pool_data = ((index, df[i:i + chunk_size], func) for index, i in enumerate(range(0, len(df), chunk_size)) if index not in finished_chunks)
        print(int(len(df) / chunk_size), "parts.", chunk_size, "per part.", "Using", processes, "processes")

        pbar.desc = "Downloading"
        with Pool(processes) as pool:
            # imap_unordered(..., 2): hand workers 2 items at a time; each
            # finished chunk is written back immediately so progress
            # survives a crash mid-run.
            for i, result in enumerate(pool.imap_unordered(_df_split_apply, pool_data, 2)):
                results[str(result[0])] = result
                pbar.update(len(result[1]))
        pbar.close()

    print("Finished Downloading.")
    return
45
+
46
+ # Unique name based on url
47
+ def _file_name(row):
48
+ row.name = str(int(row.name) // 1000)
49
+ return "%s/%s_%s.jpg" % (row['folder'], row.name, (zlib.crc32(row['url'].encode('utf-8')) & 0xffffffff))
50
+
51
# For checking mimetypes separately without download
def check_mimetype(row):
    """Fill in 'mimetype' and 'size' for a row whose file already exists.

    Purely local: no network access. Rows whose 'file' path is missing
    (or NaN, which str() turns into a nonexistent path) pass through
    unchanged.
    """
    if not os.path.isfile(str(row['file'])):
        return row
    row['mimetype'] = magic.from_file(row['file'], mime=True)
    row['size'] = os.stat(row['file']).st_size
    return row
57
+
58
# Don't download image, just check with a HEAD request, can't resume.
# Can use this instead of download_image to get HTTP status codes.
def check_download(row):
    """HEAD-check row['url'] without downloading the body.

    Sets row['status'] (408 on any request failure), row['headers'] on a
    completed request, and row['file'] (the would-be local path) when the
    response was OK. Always returns the row.
    """
    fname = _file_name(row)
    sub_dir = fname.split('_')[0]
    # makedirs+exist_ok: the original `if not exists: mkdir` races between
    # pool workers creating the same bucket dir (FileExistsError crash),
    # and also creates any missing parent directories.
    os.makedirs(sub_dir, exist_ok=True)
    fname = '/'.join(fname.split('_'))
    try:
        # not all sites will support HEAD
        response = requests.head(row['url'], stream=False, timeout=5, allow_redirects=True)  # , headers=headers)
        row['status'] = response.status_code
        row['headers'] = dict(response.headers)
    except Exception:
        # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate. Log errors later; record failure as 408 timeout.
        row['status'] = 408
        return row
    if response.ok:
        row['file'] = fname
    return row
78
+
79
def download_image(row):
    """Download row['url'] into a bucketed local path and annotate the row.

    On success sets row['status'], row['file'], row['mimetype'],
    row['size']; any request or write failure is recorded as
    row['status'] = 408. Already-downloaded files are re-used (status 200)
    so interrupted runs can resume. Always returns the row.
    """
    fname = _file_name(row)
    sub_dir = fname.split('_')[0]
    # makedirs+exist_ok: plain `if not exists: mkdir` races between pool
    # workers creating the same bucket dir (FileExistsError crash).
    os.makedirs(sub_dir, exist_ok=True)
    fname = '/'.join(fname.split('_'))

    # Skip already downloaded, retry others later
    if os.path.isfile(fname):
        row['status'] = 200
        row['file'] = fname
        row['mimetype'] = magic.from_file(row['file'], mime=True)
        row['size'] = os.stat(row['file']).st_size
        return row

    try:
        # use smaller timeout to skip errors, but can result in failed downloads
        response = requests.get(row['url'], stream=False, timeout=10, allow_redirects=True)  # , headers=headers)
        row['status'] = response.status_code
        # row['headers'] = dict(response.headers)
    except Exception:
        # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate. Log errors later; record failure as 408 timeout.
        row['status'] = 408
        return row

    if response.ok:
        try:
            with open(fname, 'wb') as out_file:
                # With stream=False, .content is already fully read and
                # content-decoded by requests; the original
                # `response.raw.decode_content = True` had no effect here
                # and was removed.
                out_file.write(response.content)
            row['mimetype'] = magic.from_file(fname, mime=True)
            row['size'] = os.stat(fname).st_size
        except Exception:
            # This is if it fails during the write or mimetype sniffing
            row['status'] = 408
            return row
        row['file'] = fname
    return row
119
+
120
def open_tsv(fname, folder):
    """Load a Conceptual Captions TSV (caption<TAB>url per line).

    Only the url column is kept (usecols selects column index 1, named
    'url'); a constant 'folder' column records the download destination.
    Returns the resulting DataFrame.
    """
    print("Opening %s Data File..." % fname)
    frame = pd.read_csv(fname, sep='\t', names=["caption", "url"], usecols=range(1, 2))
    frame['folder'] = folder
    print("Processing", len(frame), " Images:")
    return frame
126
+
127
def df_from_shelve(chunk_size, func, dataset_name):
    """Reassemble the per-chunk shelve results into one DataFrame.

    Opens the same '<dataset>_<func>_<chunk>_results.tmp' shelve that
    df_multiprocess wrote and concatenates the chunk results in ascending
    chunk-index order.
    """
    print("Generating Dataframe from results...")
    shelf_path = '%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)
    with shelve.open(shelf_path) as results:
        ordered_keys = sorted(int(k) for k in results.keys())
        df = pd.concat([results[str(k)][1] for k in ordered_keys], sort=True)
    return df
133
+
134
# number of processes in the pool can be larger than cores
num_processes = 32
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
images_per_part = 100


def main():
    """Download the CC3M validation split and write a per-image report TSV."""
    # should download 15840
    data_name = "/CC3M/images/validation"
    df = open_tsv("/CC3M/validation.tsv", data_name)
    df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
    df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
    df.to_csv("%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
    # print("Saved.")

    # # should download 3318333
    # data_name = "CC3M/images/train"
    # df = open_tsv("CC3M/Train_GCC-training.tsv",data_name)
    # df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
    # df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
    # df.to_csv("%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
    # print("Saved.")

    # # 3334173 images in total


# The guard is required for a multiprocessing script: under the spawn/
# forkserver start methods (default on Windows and macOS) each Pool worker
# re-imports this module, and without the guard the download would
# recursively relaunch itself on import.
if __name__ == "__main__":
    main()