|
import os |
|
import json |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from selenium import webdriver |
|
from utils import * |
|
|
|
|
|
def download_image(url, save_path, timeout=30):
    """Download *url* and write the response body to *save_path*.

    Request failures are reported on stdout and swallowed so that one bad
    URL does not abort a batch; filesystem errors (``OSError``) propagate
    to the caller.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path. Missing parent directories are
            created.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.
    """
    # rand_sleep is provided by the ``utils`` star import; presumably a
    # randomized delay used for throttling -- confirm against utils.
    rand_sleep()

    try:
        # requests applies no timeout by default, so without one a stalled
        # server would block the whole run indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("OOps: Something Else", err)
    else:
        # os.makedirs('') raises FileNotFoundError, so only create the
        # parent when save_path actually has a directory component.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(save_path, 'wb') as file:
            file.write(response.content)

        print(f"Image downloaded and saved to {save_path}")
|
|
|
|
|
def _absolute_jpg_url(src):
    """Cut *src* after its first '.jpg' and make it an absolute https URL."""
    trimmed = src.split('.jpg')[0] + '.jpg'
    # Only scheme-less (e.g. protocol-relative '//host/...') sources need the
    # prefix; prepending it to a full URL would yield 'https:https://...'.
    if trimmed.startswith(('http://', 'https://')):
        return trimmed
    return 'https:' + trimmed


def get_pics(id):
    """Collect image URLs from the Taobao item page for *id*.

    Opens the item page in a Chrome session, reads the rendered HTML and
    gathers the ``src`` of every ``img.item-thumbnail``, ``img.property-img``
    and every ``<img>`` inside ``div.detail-content``.

    Args:
        id: Item identifier interpolated into the page URL. (The name
            shadows the builtin but is kept for caller compatibility.)

    Returns:
        A set of URL strings, each truncated after its first ``.jpg``.
    """
    # rand_sleep / skip_captcha come from the ``utils`` star import.
    rand_sleep()

    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")

    browser = webdriver.Chrome(options=option)
    try:
        browser.get(f'https://www.taobao.com/list/item/{id}.htm')
        browser.maximize_window()
        # NOTE(review): called without the browser handle, as in the
        # original -- confirm skip_captcha does not need it.
        skip_captcha()
        page_source = browser.page_source
    finally:
        # Always release the browser; otherwise every item processed leaves
        # a Chrome process running.
        browser.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    images = list(soup.find_all('img', class_='item-thumbnail'))
    images.extend(soup.find_all('img', class_='property-img'))

    detail = soup.find('div', class_='detail-content')
    if detail is None:
        print("Error: ", "detail-content container not found")
    else:
        images.extend(detail.find_all('img'))

    srcs = set()
    for img in images:
        src = img.get('src')
        # Skip tags without a src instead of letting one of them abort the
        # collection of all remaining images.
        if src:
            srcs.add(_absolute_jpg_url(src))

    return srcs
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: for every record in ./output/items.jsonl that has
    # an 'id', fetch the item's picture URLs and save each one under
    # ./images using the URL's basename as the file name.
    # create_dir is provided by the ``utils`` star import.
    create_dir('./images')

    with open('./output/items.jsonl', 'r', encoding='utf-8') as jsonl_file:
        for line_no, line in enumerate(jsonl_file, start=1):
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not
            # valid JSON and would otherwise crash json.loads.
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as err:
                # One corrupt record should not abort the whole batch.
                print(f"Skipping malformed JSON on line {line_no}: {err}")
                continue

            # Only JSON objects can carry an 'id' key.
            if not isinstance(data, dict):
                continue

            id_value = data.get('id')
            if id_value is None:
                continue

            for url in get_pics(id_value):
                download_image(url, f'./images/{os.path.basename(url)}')
|
|