File size: 2,571 Bytes
b5f33fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import os
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from utils import *
def download_image(url, save_path, timeout=30):
    """Download the image at *url* and write it to *save_path*.

    Errors from the HTTP layer are reported to stdout rather than raised,
    so a single failed download does not abort a batch run.

    Args:
        url: Absolute URL of the image to fetch.
        save_path: Destination file path; parent directories are created
            as needed.
        timeout: Seconds to wait for the server before giving up.
            The original code passed no timeout, so the Timeout handler
            below could never fire and the request could hang forever.
    """
    rand_sleep()
    try:
        # Fetch the image; raise_for_status turns 4xx/5xx into HTTPError.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Only create the directory when save_path actually has one —
        # os.makedirs('') raises FileNotFoundError for bare filenames.
        directory = os.path.dirname(save_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"Image downloaded and saved to {save_path}")
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("Oops: Something else", err)
def get_pics(id):
    """Collect image URLs from a Taobao item page via Selenium + bs4.

    Args:
        id: The Taobao item id used to build the listing URL.
            (Name kept for caller compatibility even though it shadows
            the builtin ``id``.)

    Returns:
        A set of normalized ``https://...jpg`` image URLs; may be partial
        or empty if extraction fails part-way.
    """
    rand_sleep()
    # Reduce automation fingerprints that tend to trigger anti-bot checks.
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")
    # option.add_argument('--headless')
    browser = webdriver.Chrome(options=option)
    srcs = set()
    try:
        browser.get(f'https://www.taobao.com/list/item/{id}.htm')
        # browser.minimize_window()
        browser.maximize_window()
        skip_captcha()
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        def _add_srcs(links):
            # Prefix the scheme (page srcs appear to be protocol-relative —
            # TODO confirm) and strip any sizing suffix after '.jpg'.
            for link in links:
                src = link.get('src')
                if src:  # skip <img> tags with no src attribute
                    srcs.add('https:' + src.split('.jpg')[0] + '.jpg')

        try:
            _add_srcs(soup.find_all('img', class_='item-thumbnail'))
            _add_srcs(soup.find_all('img', class_='property-img'))
            # find() returns None when the section is absent; guard so a
            # missing detail block doesn't abort the whole extraction.
            detail = soup.find('div', class_='detail-content')
            if detail is not None:
                _add_srcs(detail.find_all('img'))
        except Exception as err:
            # Best-effort: report and return whatever was collected so far.
            print("Error: ", err)
    finally:
        # The original leaked one Chrome process per call — always quit.
        browser.quit()
    return srcs
if __name__ == "__main__":
    # Prepare the output directory, then walk the scraped item list
    # (one JSON object per line) and download every image found.
    create_dir('./images')
    with open('./output/items.jsonl', 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            record = json.loads(line)
            item_id = record.get('id')
            if item_id is None:
                # Malformed record without an id — nothing to fetch.
                continue
            for pic_url in get_pics(item_id):
                # Keep the remote filename when saving locally.
                download_image(pic_url, f'./images/{os.path.basename(pic_url)}')
|