Awiny committed on
Commit
457d89d
1 Parent(s): 1432af7

Upload download_sbu.py

Browse files
Files changed (1) hide show
  1. download_sbu.py +56 -0
download_sbu.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from csv import reader
3
+ import wget
4
+ from multiprocessing import Pool, Value
5
+
6
# Process-shared integer counters updated by image_dl under their own locks:
# dl_count counts images that are present on disk (downloaded now or earlier),
# wl_count counts URLs whose download failed.
dl_count = Value("i", 0)
wl_count = Value("i", 0)

# Root directory; each image is stored in a sub-directory named after the
# second-to-last component of its URL.
data_source = "/SBU/dataset"

# Presumably the last progress line printed by an earlier full run:
# correct:851000 wrong: 148957, 999957/1000000 finished
11
+
12
def image_dl(info):
    """Download one image URL into ``data_source/<sub_dir>/<file_name>``.

    ``sub_dir`` and ``file_name`` are the last two ``/``-separated components
    of the URL. The shared counters ``dl_count`` (image available on disk) and
    ``wl_count`` (download failed) are updated under their locks so that the
    function can be run from several worker processes at once.

    Args:
        info: URL of the image to fetch.

    Returns:
        1 if the image already existed or was downloaded, 0 if the URL was
        malformed or the download failed.
    """
    row_count = 1000000  # expected number of URLs; only used in the progress line

    parts = info.rsplit('/', 2)
    if len(parts) < 2:
        # No '/' at all (e.g. a blank line): there is no sub-directory to
        # derive, so count it as a failure instead of raising IndexError.
        with wl_count.get_lock():
            wl_count.value += 1
        return 0
    sub_dir, file_name = parts[-2], parts[-1]
    image_dir = os.path.join(data_source, sub_dir)
    image_path = os.path.join(image_dir, file_name)

    # Already fetched by a previous run: count it and skip the download.
    if os.path.exists(image_path):
        with dl_count.get_lock():
            dl_count.value += 1
        return 1

    # exist_ok avoids the check-then-create race between worker processes
    # that target the same sub-directory at the same time.
    os.makedirs(image_dir, exist_ok=True)

    try:
        wget.download(info, out=image_path)
    except (OSError, ValueError):
        # OSError (IOError is its alias) covers network/HTTP/file errors;
        # ValueError is presumably what the underlying urllib call raises for
        # a URL without a usable scheme -- verify against the wget package.
        with wl_count.get_lock():
            wl_count.value += 1
        return 0

    # Capture the new total while holding the lock so that every multiple of
    # 1000 is reported exactly once, and failures never re-trigger the report.
    with dl_count.get_lock():
        dl_count.value += 1
        downloaded = dl_count.value
    if downloaded % 1000 == 0:
        failed = wl_count.value
        print("\n")
        print("correct:{} wrong: {}, {}/{} finished".format(
            downloaded, failed, downloaded + failed, row_count))
    return 1
44
+
45
+
46
def main():
    """Read the SBU URL list and download every image with a process pool."""
    # train / val dataset
    with open('/mnt/aiops/common/wangjp/SBU/SBU_captioned_photo_dataset_urls.txt', 'r') as fh:
        # Drop blank lines: they are not URLs and cannot be split into
        # a sub-directory and a file name by image_dl.
        urls = [url for url in (line.rstrip() for line in fh) if url]
    print("{} imgs to be downloaded".format(len(urls)))

    num_processes = 16
    pool = Pool(num_processes)
    try:
        pool.map(image_dl, urls)
    finally:
        # Let the workers finish cleanly and release them even if a task raised.
        pool.close()
        pool.join()


# The guard keeps worker processes that re-import this module (spawn start
# method) from opening the URL list and creating their own pools.
if __name__ == "__main__":
    main()