human-detector / item2pic.py
George
upl all codes
b5f33fd
raw
history blame
No virus
2.57 kB
import os
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from utils import *
def download_image(url, save_path, timeout=30):
    """Download the resource at *url* and write it to *save_path*.

    Failures are reported on stdout and swallowed so that a batch caller can
    carry on with the next image.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path. Missing parent directories are
            created.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        None.
    """
    # Helper from utils (not visible here); presumably a randomized delay to
    # throttle requests -- TODO confirm.
    rand_sleep()
    try:
        # Send the GET request for the image.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Make sure the target directory exists. dirname() is '' for a bare
        # file name, and os.makedirs('') raises, so only create when non-empty.
        directory = os.path.dirname(save_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Save the image bytes.
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image downloaded and saved to {save_path}")
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("OOps: Something Else", err)
    except OSError as err:
        # Must come after the requests handlers: RequestException is itself
        # an OSError subclass. This catches filesystem failures while saving.
        print("File Error:", err)
def _to_jpg_url(src):
    """Return *src* cut off right after its first '.jpg', as an https URL."""
    trimmed = src.split('.jpg')[0] + '.jpg'
    # Only protocol-relative / scheme-less values need the prefix; adding it
    # to an absolute URL would yield 'https:https://...'.
    if trimmed.startswith(('http://', 'https://')):
        return trimmed
    return 'https:' + trimmed


def get_pics(id):
    """Collect image URLs from a Taobao item page.

    Loads the item page in a Selenium-driven Chrome window, calls
    ``skip_captcha()``, then parses the rendered HTML for ``<img>`` tags with
    class ``item-thumbnail`` or ``property-img`` and for every ``<img>`` inside
    the first ``div`` with class ``detail-content``.

    Args:
        id: Item identifier interpolated into the page URL.

    Returns:
        A set of URL strings, each truncated after its first ``.jpg``. Images
        without a ``src`` attribute are skipped; the set is empty when nothing
        matches.
    """
    # Helper from utils (not visible here); presumably a randomized delay to
    # throttle requests -- TODO confirm.
    rand_sleep()

    # Selenium: start Chrome with the automation switches turned off.
    # NOTE: the window is visible; the original kept --headless commented out.
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")
    browser = webdriver.Chrome(options=option)
    try:
        browser.get(f'https://www.taobao.com/list/item/{id}.htm')
        browser.maximize_window()
        # Helper from utils; presumably dismisses a captcha -- TODO confirm.
        skip_captcha()
        page_source = browser.page_source
    finally:
        # Always shut the browser down; otherwise every call leaks a Chrome
        # process, even when loading the page fails.
        browser.quit()

    # BeautifulSoup: gather every candidate <img> tag.
    soup = BeautifulSoup(page_source, 'html.parser')
    images = list(soup.find_all('img', class_='item-thumbnail'))
    images.extend(soup.find_all('img', class_='property-img'))
    detail = soup.find('div', class_='detail-content')
    if detail is not None:
        images.extend(detail.find_all('img'))

    srcs = set()
    for image in images:
        src = image.get('src')
        # A tag without src must not abort collection of the remaining images.
        if src:
            srcs.add(_to_jpg_url(src))
    return srcs
if __name__ == "__main__":
    # Script entry point: read item records from ./output/items.jsonl and
    # download every image found for each item into ./images.
    create_dir('./images')
    with open('./output/items.jsonl', 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Skip blank lines (e.g. a trailing empty line); json.loads would
            # raise on them and abort the whole run.
            line = line.strip()
            if not line:
                continue
            # Convert the JSON string into a Python object.
            data = json.loads(line)
            # Records without an 'id' key (or with a null id) are ignored.
            id_value = data.get('id')
            if id_value is None:
                continue
            for url in get_pics(id_value):
                download_image(url, f'./images/{os.path.basename(url)}')