import json
import os
from urllib.parse import quote

from bs4 import BeautifulSoup
from selenium import webdriver

from utils import create_dir, rand_sleep, skip_captcha
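# `utils` is this repo's helper module; based on how it is used below it is
# assumed to provide at least:
#   create_dir(path)  - create a directory if it does not already exist
#   rand_sleep()      - sleep for a random interval between page actions
#   skip_captcha()    - pause for / handle Taobao's captcha before scraping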


def append_dict_to_jsonl(dictionary, file_path='output/items.jsonl'):
    """Append a single dict as one JSON line to the given JSONL file."""
    with open(file_path, 'a', encoding='utf-8') as jsonl_file:
        json.dump(dictionary, jsonl_file)
        jsonl_file.write('\n')
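# Each call appends one object per line, e.g. (illustrative values):
#   {"keyword": "phone case", "id": "123456789"}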


def get_second_links(keyword):
    """Open the Taobao product-list page for `keyword` and collect item links."""
    # Configure Chrome so the usual automation fingerprints are hidden.
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument('--disable-blink-features=AutomationControlled')
    # option.add_argument('--headless')
    browser = webdriver.Chrome(options=option)
    browser.get(f'https://www.taobao.com/list/product/{quote(keyword)}.htm')
    # browser.minimize_window()
    browser.maximize_window()

    skip_captcha()

    # Scroll down the product page step by step until every item is loaded.
    i = 1
    while True:
        browser.execute_script(f'window.scrollTo(0, {i * 500})')
        i += 1
        rand_sleep()
        page_str = str(browser.page_source)

        # "taobao | 淘寶" is the title of the anti-bot placeholder page;
        # give up on this keyword if we land there.
        if '<title>taobao | 淘寶</title>' in page_str:
            browser.quit()
            return []

        # "已加载全部商品" = "all items loaded";
        # "加载错误,请重试" = "loading error, please retry".
        if '已加载全部商品' in page_str or '加载错误,请重试' in page_str:
            break

    html_content = browser.page_source
    browser.quit()

    # Parse the fully loaded page and pull the href from every item link.
    soup = BeautifulSoup(html_content, 'html.parser')
    return [link.get('href') for link in soup.find_all('a', class_='item')]
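# Example call (assumes a local ChromeDriver and that the captcha is solved):
#   get_second_links('手机壳')
#   -> ['//www.taobao.com/list/item/123456789.htm?spm=...', ...]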


def read_lines_to_array(file_path):
    """Read a text file and return a list of its lines, stripped of whitespace."""
    # Ensure the file's directory exists before attempting to read it.
    create_dir('./' + os.path.dirname(file_path))
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]


def save_to_file(data_list, file_path='output/items.jsonl'):
    """Overwrite `file_path` with one JSON document per line of `data_list`."""
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for data in data_list:
            # Keep non-ASCII text (e.g. Chinese keywords) readable in the main
            # items file; escape it everywhere else.
            json.dump(data, jsonl_file,
                      ensure_ascii=(file_path != 'output/items.jsonl'))
            jsonl_file.write('\n')


def rm_duplicates_by_key(file_path='output/items.jsonl', key_to_check='id'):
    """Rewrite the JSONL file, keeping only the first record for each key value."""
    data_set = set()
    unique_data = []
    duplicates = set()

    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)

            # Use the value of the given key as the deduplication identifier.
            key_value = data.get(key_to_check)

            # If the identifier has been seen before, the record is a duplicate.
            if key_value in data_set:
                duplicates.add(key_value)
            else:
                data_set.add(key_value)
                unique_data.append(data)

    save_to_file(unique_data)
    save_to_file(duplicates, file_path='output/duplicates.txt')
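# Note: only the first occurrence of each id survives in output/items.jsonl;
# the duplicated id values themselves are written to output/duplicates.txt,
# one JSON-encoded value per line.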


if __name__ == "__main__":
    keywords = read_lines_to_array('input/keywords.txt')
    create_dir('./output')

    for key in keywords:
        for url in get_second_links(key):
            # Item links look like //www.taobao.com/list/item/<id>.htm?spm=...;
            # slice the item id out of the URL.
            append_dict_to_jsonl({
                'keyword': key,
                'id': url.split('.htm?spm=')[0].split('//www.taobao.com/list/item/')[1]
            })

    rm_duplicates_by_key()
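# To run (a sketch, assuming ChromeDriver and utils.py are in place):
#   1. put one search keyword per line in input/keywords.txt
#   2. run this script with Python 3
#   3. deduplicated results land in output/items.jsonl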