MuGeminorum
/

human-detector

Object Detection

Inference Endpoints

Model card Files Files and versions

human-detector / item2pic.py

George

upl all codes

b5f33fd 11 months ago

No virus

2.57 kB

	import os
	import json
	import requests
	from bs4 import BeautifulSoup
	from selenium import webdriver
	from utils import *


	def download_image(url, save_path, timeout=30):
	    """Download the image at ``url`` and write it to ``save_path``.

	    ``rand_sleep()`` is called before the request (presumably a random
	    delay helper from ``utils`` -- confirm there). Errors raised by
	    ``requests`` are printed instead of propagated, so a batch run moves on
	    to the next image.

	    Args:
	        url: Address of the image to fetch.
	        save_path: Destination file path. Its parent directory is created
	            when it does not exist yet.
	        timeout: Seconds to wait for the server before giving up. Without
	            a timeout ``requests.get`` can block indefinitely.
	    """
	    rand_sleep()
	    try:
	        # Send the GET request for the image.
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()

	        # Make sure the target directory exists. os.makedirs('') raises, so
	        # skip it when save_path is a bare file name.
	        parent_dir = os.path.dirname(save_path)
	        if parent_dir:
	            os.makedirs(parent_dir, exist_ok=True)

	        # Write the image bytes to disk.
	        with open(save_path, 'wb') as file:
	            file.write(response.content)

	        print(f"Image downloaded and saved to {save_path}")
	    except requests.exceptions.HTTPError as errh:
	        print("Http Error:", errh)
	    except requests.exceptions.ConnectionError as errc:
	        print("Error Connecting:", errc)
	    except requests.exceptions.Timeout as errt:
	        print("Timeout Error:", errt)
	    except requests.exceptions.RequestException as err:
	        print("OOps: Something Else", err)


	def _absolute_jpg_url(src):
	    """Cut ``src`` after its first ``.jpg`` and return it as an absolute URL."""
	    trimmed = src.split('.jpg')[0] + '.jpg'
	    # Only protocol-relative sources need the scheme; prefixing an already
	    # absolute URL would yield 'https:https://...'.
	    if trimmed.startswith(('http://', 'https://')):
	        return trimmed
	    return 'https:' + trimmed


	def get_pics(id):
	    """Collect the image URLs shown on the Taobao page of item ``id``.

	    The page is opened in a Chrome instance driven by selenium, and
	    ``skip_captcha()`` is called once it has loaded. The rendered HTML is
	    then parsed for ``<img>`` tags with class ``item-thumbnail`` or
	    ``property-img``, plus every ``<img>`` inside the ``detail-content``
	    div.

	    Args:
	        id: Item identifier interpolated into the item page URL.

	    Returns:
	        A set of ``.jpg`` URLs. Images without a ``src`` attribute are
	        skipped.
	    """
	    rand_sleep()

	    # selenium
	    option = webdriver.ChromeOptions()
	    option.add_experimental_option('excludeSwitches', ['enable-automation'])
	    option.add_argument("--disable-blink-features=AutomationControlled")
	    # option.add_argument('--headless')
	    browser = webdriver.Chrome(options=option)
	    try:
	        browser.get(f'https://www.taobao.com/list/item/{id}.htm')
	        # browser.minimize_window()
	        browser.maximize_window()

	        skip_captcha()

	        page_source = browser.page_source
	    finally:
	        # Always release the browser; otherwise each item leaves a Chrome
	        # process behind.
	        browser.quit()

	    # bs4
	    soup = BeautifulSoup(page_source, 'html.parser')
	    images = list(soup.find_all('img', class_='item-thumbnail'))
	    images.extend(soup.find_all('img', class_='property-img'))

	    detail = soup.find('div', class_='detail-content')
	    if detail is not None:
	        images.extend(detail.find_all('img'))

	    srcs = set()
	    for image in images:
	        src = image.get('src')
	        # An <img> without src must not discard the images that follow it.
	        if src:
	            srcs.add(_absolute_jpg_url(src))

	    return srcs


	if __name__ == "__main__":
	    # Read item records from ./output/items.jsonl (one JSON object per line)
	    # and download every picture found for each item into ./images.
	    create_dir('./images')

	    with open('./output/items.jsonl', 'r', encoding='utf-8') as jsonl_file:
	        for line in jsonl_file:
	            # A blank line is not valid JSON; skip it rather than abort the run.
	            if not line.strip():
	                continue

	            # Convert the JSON string into a Python object.
	            data = json.loads(line)

	            # Read the record's 'id' value; records without one are skipped.
	            id_value = data.get('id')
	            if id_value is None:
	                continue

	            for url in get_pics(id_value):
	                download_image(url, f'./images/{os.path.basename(url)}')