catalpa-bungei
committed on
Commit
•
8acd4b0
1
Parent(s):
996b6e4
Upload download_images.py with huggingface_hub
Browse files- download_images.py +91 -0
download_images.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import json
|
2 |
+
# import requests
|
3 |
+
# import os
|
4 |
+
|
5 |
+
# # Create a directory to save the images
|
6 |
+
# os.makedirs('images', exist_ok=True)
|
7 |
+
|
8 |
+
# # Function to download an image from a URL
|
9 |
+
# def download_image(url, save_path):
|
10 |
+
# response = requests.get(url)
|
11 |
+
# if response.status_code == 200:
|
12 |
+
# with open(save_path, 'wb') as file:
|
13 |
+
# file.write(response.content)
|
14 |
+
# else:
|
15 |
+
# print(f"Failed to download image from {url}")
|
16 |
+
|
17 |
+
# # Read the JSONL file and download images
|
18 |
+
# # download the ith to jth images
|
19 |
+
# def download_itoj_images(i, j):
|
20 |
+
# with open('transformed_data.jsonl', 'r') as file:
|
21 |
+
# for idx, line in enumerate(file):
|
22 |
+
# if idx < i:
|
23 |
+
# continue
|
24 |
+
# if idx > j:
|
25 |
+
# break
|
26 |
+
# item = json.loads(line)
|
27 |
+
# image_path = item['image_path'].split('/')[-1]
|
28 |
+
# # image_id = image_path.split('_')[-1].split('.')[0]
|
29 |
+
# url = f"http://images.cocodataset.org/train2014/{image_path}"
|
30 |
+
# save_path = os.path.join('images', f"{image_path}")
|
31 |
+
# download_image(url, save_path)
|
32 |
+
# print(f"Downloaded {image_path}, id: {idx}")
|
33 |
+
# # save the message to a file
|
34 |
+
# with open('downloaded_images_log.txt', 'a') as f:
|
35 |
+
# f.write(f"Downloaded {image_path}, id: {idx}\n")
|
36 |
+
|
37 |
+
# # read i and j from the arguments
|
38 |
+
# import sys
|
39 |
+
# i = int(sys.argv[1])
|
40 |
+
# j = int(sys.argv[2])
|
41 |
+
# download_itoj_images(i, j)
|
42 |
+
|
43 |
+
import json
|
44 |
+
import requests
|
45 |
+
import os
|
46 |
+
from concurrent.futures import ThreadPoolExecutor
|
47 |
+
|
48 |
+
# Create a directory to save the images
|
49 |
+
# exist_ok=True makes this a no-op when 'images' already exists, so the
# script can be re-run without failing on the existing directory.
os.makedirs('images', exist_ok=True)
|
50 |
+
|
51 |
+
# Function to download an image from a URL
|
52 |
+
def download_image(item, timeout=30):
    """Download the COCO train2014 image described by *item* into ``images/``.

    The file name is the last ``/``-separated component of
    ``item['image_path']``. It is fetched from
    ``http://images.cocodataset.org/train2014/<file name>`` and written to
    ``images/<file name>``. The outcome (success, non-200 status, or
    exception) is recorded through ``log_message``; nothing is returned.

    Args:
        item: Mapping with an ``'image_path'`` key and an ``'id'`` key (the
            latter is only used in the success log line).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block the calling
            worker thread indefinitely.
    """
    image_path = item['image_path'].split('/')[-1]
    url = f"http://images.cocodataset.org/train2014/{image_path}"
    save_path = os.path.join('images', image_path)

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            with open(save_path, 'wb') as file:
                file.write(response.content)
            log_message(f"Downloaded {image_path}, id: {item['id']}")
        else:
            log_message(f"Failed to download image from {url}")
    except Exception as e:
        # Deliberately broad: this runs as a pool worker, and a single bad
        # download (including a timeout) should be logged, not abort the batch.
        log_message(f"Exception occurred while downloading {url}: {e}")
|
68 |
+
|
69 |
+
# Function to append a message to downloaded_images_log.txt
|
70 |
+
def log_message(message):
    """Append *message* plus a trailing newline to ``downloaded_images_log.txt``.

    The file is opened in append mode on every call and closed again before
    returning.
    """
    with open('downloaded_images_log.txt', mode='a') as sink:
        entry = message + '\n'
        sink.write(entry)
|
73 |
+
|
74 |
+
# Function to download images from index i to j using ThreadPoolExecutor
|
75 |
+
def download_itoj_images(i, j):
    """Download the images for lines ``i`` through ``j`` of the JSONL file.

    Reads ``transformed_data.jsonl``, parses every line whose 0-based index
    ``idx`` satisfies ``i <= idx <= j`` as JSON, and hands the parsed items to
    ``download_image`` on a pool of 10 worker threads.

    Args:
        i: 0-based index of the first line to process (inclusive).
        j: 0-based index of the last line to process (inclusive).
    """
    items = []
    with open('transformed_data.jsonl', 'r') as file:
        for idx, line in enumerate(file):
            if idx > j:
                # Past the requested range: stop instead of scanning the
                # remainder of the file.
                break
            if idx >= i:
                items.append(json.loads(line))
    # note that idx starts from 0. id = idx + 1
    # i and j are idxes, so in the log it will show i+1 and j+1

    with ThreadPoolExecutor(max_workers=10) as executor:
        # The map results are not consumed, so an exception that escapes
        # download_image is not re-raised here.
        executor.map(download_image, items)
|
83 |
+
|
84 |
+
# read i and j from the arguments
|
85 |
+
import sys

# Both indexes are required; exit with a usage hint instead of an IndexError
# traceback when they are missing.
if len(sys.argv) < 3:
    sys.exit(f"Usage: {sys.argv[0]} <start_idx> <end_idx>")

# i and j are 0-based, inclusive line indexes into transformed_data.jsonl.
i = int(sys.argv[1])
j = int(sys.argv[2])
download_itoj_images(i, j)
|
89 |
+
|
90 |
+
|
91 |
+
|