khalidsaifullaah committed • 75b01a0
Parent(s): 182f15a
Added CC3M data downloader script
Files changed: data/CC3M_downloader.py +156 -0
data/CC3M_downloader.py
ADDED
@@ -0,0 +1,156 @@
# It expects you to have the train and validation `.tsv` files downloaded in the current directory.
# Head over to this link to download the `.tsv` files:
# https://ai.google.com/research/ConceptualCaptions/download

'''
This script was adapted from https://github.com/igorbrigadir/DownloadConceptualCaptions
A few changes were made after that (excluding the post-processing of the data). We only
provide a csv file with image urls and captions written in different languages, but not
the images themselves, as we do not own any of the images in the dataset and hence
cannot legally provide them to you.
'''
import pandas as pd
import numpy as np
import requests
import zlib
import os
import shelve
import magic  # provided by the python-magic package, used to detect mimetypes of downloaded files
from multiprocessing import Pool
from tqdm import tqdm

headers = {
    'User-Agent': 'Googlebot-Image/1.0',  # Pretend to be googlebot
    'X-Forwarded-For': '64.18.15.200'
}

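# Helper run inside each worker process: applies `func` to every row of one
# DataFrame chunk and returns (chunk index, per-row results).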
def _df_split_apply(tup_arg):
    split_ind, subset, func = tup_arg
    r = subset.apply(func, axis=1)
    return (split_ind, r)

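# Splits the DataFrame into chunks and applies `func` to each chunk in a process pool.
# Per-chunk results are persisted in a shelve file so an interrupted run can be resumed.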
def df_multiprocess(df, processes, chunk_size, func, dataset_name):
    print("Generating parts...")
    with shelve.open('%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)) as results:

        pbar = tqdm(total=len(df), position=0)
        # Resume:
        finished_chunks = set([int(k) for k in results.keys()])
        pbar.desc = "Resuming"
        for k in results.keys():
            pbar.update(len(results[str(k)][1]))

        pool_data = ((index, df[i:i + chunk_size], func) for index, i in enumerate(range(0, len(df), chunk_size)) if index not in finished_chunks)
        print(int(len(df) / chunk_size), "parts.", chunk_size, "per part.", "Using", processes, "processes")

        pbar.desc = "Downloading"
        with Pool(processes) as pool:
            for i, result in enumerate(pool.imap_unordered(_df_split_apply, pool_data, 2)):
                results[str(result[0])] = result
                pbar.update(len(result[1]))
        pbar.close()

    print("Finished Downloading.")
    return

# Unique name based on url
def _file_name(row):
    return "%s/%s_%s" % (row['folder'], row.name, (zlib.crc32(row['url'].encode('utf-8')) & 0xffffffff))

# For checking mimetypes separately without download
def check_mimetype(row):
    if os.path.isfile(str(row['file'])):
        row['mimetype'] = magic.from_file(row['file'], mime=True)
        row['size'] = os.stat(row['file']).st_size
    return row

# Don't download the image, just check with a HEAD request; can't resume.
# Can use this instead of download_image to get HTTP status codes.
def check_download(row):
    fname = _file_name(row)
    try:
        # not all sites will support HEAD
        response = requests.head(row['url'], stream=False, timeout=5, allow_redirects=True, headers=headers)
        row['status'] = response.status_code
        row['headers'] = dict(response.headers)
    except:
        # log errors later, set error as 408 timeout
        row['status'] = 408
        return row
    if response.ok:
        row['file'] = fname
    return row

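# Downloads a single image into row['folder'] and records its HTTP status, mimetype and file size.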
def download_image(row):
    fname = _file_name(row)
    # Skip Already downloaded, retry others later
    if os.path.isfile(fname):
        row['status'] = 200
        row['file'] = fname
        row['mimetype'] = magic.from_file(row['file'], mime=True)
        row['size'] = os.stat(row['file']).st_size
        return row

    try:
        # use smaller timeout to skip errors, but can result in failed downloads
        response = requests.get(row['url'], stream=False, timeout=10, allow_redirects=True, headers=headers)
        row['status'] = response.status_code
        #row['headers'] = dict(response.headers)
    except Exception as e:
        # log errors later, set error as 408 timeout
        row['status'] = 408
        return row

    if response.ok:
        try:
            with open(fname, 'wb') as out_file:
                # some sites respond with gzip transport encoding
                response.raw.decode_content = True
                out_file.write(response.content)
            row['mimetype'] = magic.from_file(fname, mime=True)
            row['size'] = os.stat(fname).st_size
        except:
            # This is if it times out during a download or decode
            row['status'] = 408
            return row
        row['file'] = fname
    return row

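# Reads a Conceptual Captions .tsv file; only the url column is kept, since that is all the downloader needs.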
def open_tsv(fname, folder):
    print("Opening %s Data File..." % fname)
    df = pd.read_csv(fname, sep='\t', names=["caption", "url"], usecols=range(1, 2))
    df['folder'] = folder
    print("Processing", len(df), "Images:")
    return df

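# Stitches the per-chunk results saved in the shelve file back into a single DataFrame, in chunk order.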
def df_from_shelve(chunk_size, func, dataset_name):
    print("Generating Dataframe from results...")
    with shelve.open('%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)) as results:
        keylist = sorted([int(k) for k in results.keys()])
        df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
    return df

# number of processes in the pool can be larger than cores
num_processes = 256
# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
images_per_part = 200

'''
A number of the downloads will fail or return web pages instead of images; these will
need to be cleaned up later. See downloaded_validation_report.tsv after the download
finishes for HTTP errors. Around 10-11% of images are gone, based on validation set
results. Setting the user agent may fix some errors too - it is unclear whether any
requests are rejected by sites because of it.
'''
# Note: the output folders ("validation" and "training") must already exist;
# otherwise every write fails and is recorded as status 408.
data_name = "validation"
df = open_tsv("Validation_GCC-1.1.0-Validation.tsv", data_name)
df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df.to_csv("downloaded_%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
print("Saved.")

data_name = "training"
df = open_tsv("Train-GCC-training.tsv", data_name)
df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df.to_csv("downloaded_%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
print("Saved.")