Upload download_sbu.py
Browse files- download_sbu.py +56 -0
download_sbu.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from csv import reader
|
3 |
+
import wget
|
4 |
+
from multiprocessing import Pool, Value
|
5 |
+
|
6 |
+
# Process-shared integer counters ('i' typecode, initial value 0) updated by
# image_dl: dl_count tallies images that are available on disk, wl_count
# tallies downloads that failed.
dl_count, wl_count = Value('i', 0), Value('i', 0)

# Root directory under which images are stored as <sub_dir>/<file_name>.
data_source = "/SBU/dataset"

# Sample progress line (presumably the last one printed by an earlier run):
# correct:851000 wrong: 148957, 999957/1000000 finished
|
11 |
+
|
12 |
+
def image_dl(info):
    """Download one image URL into ``data_source/<sub_dir>/<file_name>``.

    The last two ``/``-separated components of the URL are used as the
    sub-directory and the file name. A file that is already on disk is
    counted as a success and is not fetched again, so an interrupted run
    can simply be restarted. Every call updates exactly one of the shared
    counters ``dl_count`` (success) or ``wl_count`` (failure).

    Args:
        info: URL of the image, as a string.

    Returns:
        1 if the image was already present on disk, otherwise None (both
        after a fresh download and after a failure).
    """
    global dl_count, wl_count
    # Nominal number of URLs; only used as the denominator in progress output.
    row_count = 1000000

    already_present = False
    succeeded = False

    parts = info.split('/')
    # A blank line or a URL without "<dir>/<file>" at the end cannot be mapped
    # to a local path; count it as a failure instead of raising IndexError and
    # aborting the whole pool.map call.
    if len(parts) >= 2 and parts[-1]:
        image_dir = os.path.join(data_source, parts[-2])
        image_path = os.path.join(image_dir, parts[-1])
        if os.path.exists(image_path):
            already_present = True
            succeeded = True
        else:
            # exist_ok avoids the check-then-create race between worker
            # processes that target the same sub-directory at the same time;
            # makedirs also creates data_source itself if it is missing.
            os.makedirs(image_dir, exist_ok=True)
            try:
                wget.download(info, out=image_path)
                succeeded = True
            except Exception:
                # Best-effort bulk download: any per-URL failure (HTTP error,
                # malformed URL, broken connection, ...) is tallied rather
                # than allowed to abort the remaining downloads.
                succeeded = False

    counter = dl_count if succeeded else wl_count
    with counter.get_lock():
        counter.value += 1

    # Report roughly every 1000 processed URLs. The two counters are read
    # separately, so under concurrency a multiple of 1000 can occasionally be
    # skipped or reported twice; this is only progress output.
    correct = dl_count.value
    wrong = wl_count.value
    finished = correct + wrong
    if finished % 1000 == 0:
        print("\n")
        print("correct:{} wrong: {}, {}/{} finished".format(
            correct, wrong, finished, row_count))

    return 1 if already_present else None
|
44 |
+
|
45 |
+
|
46 |
+
# train / val dataset
|
47 |
+
def read_urls(path):
    """Return the non-empty lines of the text file at *path*.

    Trailing whitespace (including the newline) is removed from every line,
    and lines that are empty afterwards are dropped so that workers never
    receive a blank URL.

    Args:
        path: Path of a text file containing one URL per line.

    Returns:
        A list of URL strings in file order.
    """
    with open(path, 'r') as fh:
        stripped = (line.rstrip() for line in fh)
        return [url for url in stripped if url]


num_processes = 16

# The guard keeps the download from starting when this module is imported,
# which is what multiprocessing does in every worker under the "spawn" and
# "forkserver" start methods.
if __name__ == "__main__":
    urls = read_urls('/mnt/aiops/common/wangjp/SBU/SBU_captioned_photo_dataset_urls.txt')
    print("{} imgs to be downloaded".format(len(urls)))

    # The context manager shuts the worker processes down once map() has
    # returned (or raised) instead of leaving the pool open.
    with Pool(num_processes) as pool:
        pool.map(image_dl, urls)
|