sail
/

PTP

Model card Files Files and versions Community

PTP / download_sbu.py

Awiny's picture

Upload download_sbu.py

457d89d over 1 year ago

No virus

1.68 kB

	import os
	from csv import reader
	import wget
	from multiprocessing import Pool, Value

	# Download tallies kept in multiprocessing.Value objects (typecode 'i', a
	# signed int, starting at 0). image_dl updates each one under its own lock.
	dl_count, wl_count = Value('i', 0), Value('i', 0)  # successes, failures

	# Root directory under which image_dl stores the downloaded images.
	data_source = "/SBU/dataset"

	# Apparently the final progress line of an earlier run (TODO confirm):
	# correct:851000 wrong: 148957, 999957/1000000 finished

	def image_dl(info):
	    """Download one image URL below ``data_source`` and update the counters.

	    The last two ``/``-separated components of ``info`` are used as the
	    sub-directory and the file name, so the image is stored at
	    ``data_source/<sub_dir>/<file_name>``. ``dl_count`` is incremented for
	    a file that is already present or downloads successfully; ``wl_count``
	    is incremented when the download raises ``IOError``.

	    Args:
	        info: URL of the image to fetch.

	    Returns:
	        1 if the target file already exists (nothing is downloaded);
	        otherwise None, whether or not the download succeeded.

	    Raises:
	        IndexError: if ``info`` contains no ``/``.
	    """
	    row_count = 1000000  # denominator shown in the progress line
	    parts = info.split('/')
	    sub_dir = parts[-2]
	    file_name = parts[-1]
	    image_dir = os.path.join(data_source, sub_dir)
	    image_path = os.path.join(image_dir, file_name)

	    # Already on disk: count it as a success and skip the download.
	    if os.path.exists(image_path):
	        with dl_count.get_lock():
	            dl_count.value += 1
	        return 1

	    # exist_ok avoids the check-then-create race when several pool workers
	    # reach the same new sub-directory at once; a plain os.mkdir would raise
	    # FileExistsError in the losing worker and abort the whole pool.map call.
	    os.makedirs(image_dir, exist_ok=True)

	    try:
	        wget.download(info, out=image_path)
	    except IOError:
	        with wl_count.get_lock():
	            wl_count.value += 1
	    else:
	        with dl_count.get_lock():
	            dl_count.value += 1

	    # NOTE: this check looks at the success counter only, so it also fires
	    # after a failed download while that counter sits on a multiple of 1000
	    # (including 0). The counters are read here without holding their locks.
	    if dl_count.value % 1000 == 0:
	        print("\n")
	        msg2 = "correct:{} wrong: {}, {}/{} finished".format(
	            dl_count.value, wl_count.value,
	            dl_count.value + wl_count.value, row_count)
	        print(msg2)


	# train / val dataset
	# The entry-point guard keeps the download from starting when this module is
	# imported, which is what every worker does under the "spawn" start method.
	if __name__ == "__main__":
	    # One URL per line. Blank lines are skipped because image_dl cannot
	    # derive a target path from an empty string.
	    with open('/mnt/aiops/common/wangjp/SBU/SBU_captioned_photo_dataset_urls.txt', 'r') as fh:
	        urls = [url for url in (line.rstrip() for line in fh) if url]
	    print("{} imgs to be downloaded".format(len(urls)))

	    num_processes = 16
	    # The context manager shuts the worker pool down once map() has returned.
	    with Pool(num_processes) as pool:
	        pool.map(image_dl, urls)