sail
/

PTP

Model card Files Files and versions Community

Awiny committed on Dec 8, 2022

Commit

457d89d

•

1 Parent(s): 1432af7

Upload download_sbu.py

Browse files

Files changed (1) hide show

download_sbu.py +56 -0

download_sbu.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+from csv import reader
+import wget
+from multiprocessing import Pool, Value
+# Process-shared integer counter: bumped by image_dl() when an image is
+# already on disk or has just been downloaded successfully.
+dl_count = Value('i', 0)
+# Process-shared integer counter: bumped by image_dl() when a download fails.
+wl_count = Value('i', 0)
+# Root directory under which image_dl() creates one sub-directory per URL folder.
+data_source = "/SBU/dataset"
+# NOTE(review): looks like the progress line from an earlier run, kept for
+# reference (it matches the format image_dl() prints):
+# correct:851000 wrong: 148957, 999957/1000000 finished
+def image_dl(info):
+    global dl_count, wl_count
+    row_count = 1000000
+    sub_dir = info.split('/')[-2]
+    file_name = info.split('/')[-1]
+    image_dir = os.path.join(data_source, sub_dir)
+    image_path = os.path.join(image_dir, file_name)
+    # if not correct download
+    if os.path.exists(image_dir) and os.path.exists(image_path):
+        with dl_count.get_lock():
+            dl_count.value += 1
+        return 1
+    if not os.path.exists(image_dir):
+        os.mkdir(image_dir)
+    msg1 = ""
+    try:
+        wget.download(info, out=image_path)
+        with dl_count.get_lock():
+            dl_count.value += 1
+    except IOError:
+        msg1 = "image {} not found".format(info)
+        with wl_count.get_lock():
+            wl_count.value += 1
+    # print(info[0], info[3])
+    # video_dir
+    if dl_count.value % 1000 == 0:
+        print("\n")
+        msg2 = "correct:{} wrong: {}, {}/{} finished".format(dl_count.value, wl_count.value,
+        dl_count.value+wl_count.value, row_count)
+        print(msg2)
+    # with open('webvid_data/download_logs.txt','a') as f:
+    #     f.write(msg1 + "\n" + msg2)
+# train / val dataset
+urls = []
+with open('/mnt/aiops/common/wangjp/SBU/SBU_captioned_photo_dataset_urls.txt', 'r') as fh:
+    for line in fh:
+        url = line.rstrip()
+        urls.append(url)
+print("{} imgs to be downloaded".format(len(urls)))
+num_processes = 16
+pool = Pool(num_processes)
+pool.map(image_dl, tuple(list(urls)))