catalpa-bungei committed on
Commit
8acd4b0
1 Parent(s): 996b6e4

Upload download_images.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. download_images.py +91 -0
download_images.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import json
2
+ # import requests
3
+ # import os
4
+
5
+ # # Create a directory to save the images
6
+ # os.makedirs('images', exist_ok=True)
7
+
8
+ # # Function to download an image from a URL
9
+ # def download_image(url, save_path):
10
+ # response = requests.get(url)
11
+ # if response.status_code == 200:
12
+ # with open(save_path, 'wb') as file:
13
+ # file.write(response.content)
14
+ # else:
15
+ # print(f"Failed to download image from {url}")
16
+
17
+ # # Read the JSONL file and download images
18
+ # # download the ith to jth images
19
+ # def download_itoj_images(i, j):
20
+ # with open('transformed_data.jsonl', 'r') as file:
21
+ # for idx, line in enumerate(file):
22
+ # if idx < i:
23
+ # continue
24
+ # if idx > j:
25
+ # break
26
+ # item = json.loads(line)
27
+ # image_path = item['image_path'].split('/')[-1]
28
+ # # image_id = image_path.split('_')[-1].split('.')[0]
29
+ # url = f"http://images.cocodataset.org/train2014/{image_path}"
30
+ # save_path = os.path.join('images', f"{image_path}")
31
+ # download_image(url, save_path)
32
+ # print(f"Downloaded {image_path}, id: {idx}")
33
+ # # save the message to a file
34
+ # with open('downloaded_images_log.txt', 'a') as f:
35
+ # f.write(f"Downloaded {image_path}, id: {idx}\n")
36
+
37
+ # # read i and j from the arguments
38
+ # import sys
39
+ # i = int(sys.argv[1])
40
+ # j = int(sys.argv[2])
41
+ # download_itoj_images(i, j)
42
+
43
+ import json
44
+ import requests
45
+ import os
46
+ from concurrent.futures import ThreadPoolExecutor
47
+
48
+ # Create a directory to save the images
49
+ os.makedirs('images', exist_ok=True)
50
+
51
+ # Function to download an image from a URL
52
def download_image(item):
    """Download the COCO train2014 image referenced by one JSONL record.

    The file name is the last ``/``-separated component of
    ``item['image_path']``. The image is fetched from
    ``http://images.cocodataset.org/train2014/<file name>`` and written to
    the ``images`` directory under the same name. The outcome (success,
    non-200 response, or exception) is appended to the log through
    ``log_message``.

    Args:
        item: Mapping with an ``image_path`` string. Its ``id`` entry, if
            present, is only used in the success log line.

    Returns:
        None. Network and file errors are logged instead of raised so that
        one bad record does not stop the other downloads.
    """
    image_path = item['image_path'].split('/')[-1]
    url = f"http://images.cocodataset.org/train2014/{image_path}"
    save_path = os.path.join('images', image_path)

    try:
        # Without a timeout a stalled connection would block this worker
        # thread forever; 30 seconds bounds the connect and read waits.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(save_path, 'wb') as file:
                file.write(response.content)
            # .get(): a record without an 'id' must not turn a successful
            # download into a logged "exception".
            log_message(f"Downloaded {image_path}, id: {item.get('id')}")
        else:
            log_message(f"Failed to download image from {url}")
    except Exception as e:
        # Broad on purpose: this is the per-task boundary of a worker
        # thread, and every failure should end up in the log file.
        log_message(f"Exception occurred while downloading {url}: {e}")
68
+
69
+ # Function to log a message to log.txt
70
def log_message(message):
    """Append ``message`` as one line to ``downloaded_images_log.txt``.

    The file is opened in append mode on every call and is created in the
    current working directory if it does not exist yet.

    Args:
        message: Text to record; a trailing newline is added.
    """
    # Explicit UTF-8 so messages containing non-ASCII text (for example
    # exception details) do not fail or vary with the platform's default
    # encoding.
    with open('downloaded_images_log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write(message + '\n')
73
+
74
+ # Function to download images from index i to j using ThreadPoolExecutor
75
def download_itoj_images(i, j):
    """Download the images for JSONL records with line index ``i`` to ``j``.

    Reads ``transformed_data.jsonl`` line by line, parses every line whose
    0-based index lies in the inclusive range ``[i, j]``, and hands each
    parsed record to ``download_image`` on a pool of 10 worker threads.

    Args:
        i: 0-based index of the first line to process.
        j: 0-based index of the last line to process (inclusive).

    Note:
        Indices start at 0. According to the original author's note the
        record id is ``idx + 1``, so the log shows ``i + 1`` .. ``j + 1``.
    """
    items = []
    with open('transformed_data.jsonl', 'r') as file:
        for idx, line in enumerate(file):
            if idx > j:
                # Past the requested range: stop here rather than scanning
                # the rest of the file.
                break
            if idx >= i:
                items.append(json.loads(line))

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(download_image, item) for item in items]
        for item, future in zip(items, futures):
            # Record any exception that escaped download_image (e.g. a
            # record without 'image_path') so it does not vanish silently.
            error = future.exception()
            if error is not None:
                log_message(f"Unhandled error for record {item!r}: {error!r}")
83
+
84
# Read the inclusive 0-based index range i..j from the command line and
# download the corresponding images.
import sys

if len(sys.argv) < 3:
    # Exit with a usage hint instead of an IndexError traceback.
    sys.exit(f"Usage: {sys.argv[0]} <start_index> <end_index>")
i = int(sys.argv[1])
j = int(sys.argv[2])
download_itoj_images(i, j)
89
+
90
+
91
+