import os
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from utils import *


def download_image(url, save_path, timeout=30):
    """Download the image at *url* and write it to *save_path*.

    Request failures are reported on stdout rather than raised, so one bad
    URL does not stop a batch of downloads.

    Args:
        url: Absolute URL of the image.
        save_path: Destination file path; missing parent directories are
            created.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely and the ``Timeout``
            handler below could never run.
    """
    # rand_sleep comes from utils; presumably a randomized delay between
    # requests -- confirm against utils.
    rand_sleep()
    try:
        # Send the GET request that fetches the image bytes.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Make sure the target directory exists. A bare file name has an
        # empty dirname, and os.makedirs('') raises, so skip it in that case.
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Save the image.
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image downloaded and saved to {save_path}")
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("OOps: Something Else", err)


def _normalize_img_url(src):
    """Turn an ``<img src>`` value into an absolute ``.jpg`` URL, or None.

    Everything after the first ``.jpg`` is cut off and ``.jpg`` is
    re-appended. Values without an explicit scheme get ``https:`` prefixed,
    as the original code did for every value.
    """
    if not src:
        return None
    trimmed = src.split('.jpg')[0] + '.jpg'
    if trimmed.startswith(('http://', 'https://')):
        # Already absolute: prefixing 'https:' would corrupt the URL.
        return trimmed
    return 'https:' + trimmed


def get_pics(id):
    """Collect the image URLs found on a Taobao item page.

    Opens the item page in Chrome via Selenium, parses the rendered HTML and
    gathers the ``src`` of thumbnail, property and detail-section images.

    Args:
        id: Item identifier interpolated into the page URL. (The name
            shadows the builtin but is kept for caller compatibility.)

    Returns:
        A set of normalized image URL strings; empty if none were found.
    """
    rand_sleep()

    # selenium: options that make the automated browser less detectable.
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")
    browser = webdriver.Chrome(options=option)
    try:
        browser.get(f'https://www.taobao.com/list/item/{id}.htm')
        browser.maximize_window()
        # skip_captcha comes from utils; presumably it handles the
        # anti-bot challenge -- confirm against utils.
        skip_captcha()
        page_source = browser.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Chrome process running.
        browser.quit()

    # bs4: gather every candidate <img> element.
    soup = BeautifulSoup(page_source, 'html.parser')
    images = list(soup.find_all('img', class_='item-thumbnail'))
    images.extend(soup.find_all('img', class_='property-img'))
    detail = soup.find('div', class_='detail-content')
    if detail is not None:
        images.extend(detail.find_all('img'))

    # Skip images without a src instead of aborting the whole collection.
    srcs = set()
    for img in images:
        img_url = _normalize_img_url(img.get('src'))
        if img_url is not None:
            srcs.add(img_url)
    return srcs


if __name__ == "__main__":
    create_dir('./images')
    with open('./output/items.jsonl', 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not valid JSON.
                continue
            # Convert the JSON string into a Python object.
            data = json.loads(line)
            # Read the value stored under the 'id' key.
            id_value = data.get('id')
            if id_value is None:
                continue
            for url in get_pics(id_value):
                download_image(url, f'./images/{os.path.basename(url)}')