File size: 2,571 Bytes
b5f33fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from utils import *


def download_image(url, save_path, timeout=30):
    """Download the image at *url* and write it to *save_path*.

    Best-effort: network and HTTP errors are printed rather than raised,
    so one failed image does not abort a batch download.

    Args:
        url: Direct URL of the image to fetch.
        save_path: Destination file path; parent directories are created
            as needed.
        timeout: Seconds to wait for the server (default 30). The original
            call had no timeout, so it could hang forever and the Timeout
            handler below was effectively unreachable.
    """
    rand_sleep()
    try:
        # Send a GET request for the image; raise_for_status turns
        # 4xx/5xx responses into HTTPError so they hit the handler below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Ensure the destination directory exists. dirname is empty when
        # save_path is a bare filename — makedirs('') would raise, so skip.
        parent = os.path.dirname(save_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # Write the raw response bytes to disk.
        with open(save_path, 'wb') as file:
            file.write(response.content)

        print(f"Image downloaded and saved to {save_path}")
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("OOps: Something Else", err)


def get_pics(id):
    """Scrape a Taobao item page and collect its image URLs.

    Opens the item page in a Chrome browser via selenium, waits for any
    captcha to be skipped, then parses the rendered HTML for product
    images.

    Args:
        id: Taobao item id used to build the product page URL.

    Returns:
        A set of https image URLs, each truncated at the first '.jpg'
        (strips resize/quality suffixes) and deduplicated.
    """
    rand_sleep()
    # Configure Chrome to look less like an automated browser so the
    # page is less likely to trigger anti-bot measures.
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")
    # option.add_argument('--headless')
    browser = webdriver.Chrome(options=option)
    try:
        browser.get(f'https://www.taobao.com/list/item/{id}.htm')
        # browser.minimize_window()
        browser.maximize_window()

        skip_captcha()

        # Parse the fully-rendered page with BeautifulSoup.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        srcs = set()

        try:
            for link in soup.find_all('img', class_='item-thumbnail'):
                srcs.add('https:' + link.get('src').split('.jpg')[0] + '.jpg')

            for link in soup.find_all('img', class_='property-img'):
                srcs.add('https:' + link.get('src').split('.jpg')[0] + '.jpg')

            for link in soup.find('div', class_='detail-content').find_all('img'):
                srcs.add('https:' + link.get('src').split('.jpg')[0] + '.jpg')

        except Exception as err:
            print("Error: ", err)

        return srcs
    finally:
        # Always release the browser — the original never called quit(),
        # leaking one Chrome process per call (always on exception).
        browser.quit()


if __name__ == "__main__":
    create_dir('./images')

    # items.jsonl holds one JSON object per line; each record may carry
    # an 'id' field identifying a Taobao item to scrape.
    with open('./output/items.jsonl', 'r', encoding='utf-8') as jsonl_file:
        for raw_line in jsonl_file:
            record = json.loads(raw_line)
            item_id = record.get('id')
            if item_id is None:
                continue
            # Download every picture for this item, naming each file
            # after the last path component of its URL.
            for pic_url in get_pics(item_id):
                download_image(pic_url, f'./images/{os.path.basename(pic_url)}')