|
import json |
|
from urllib.parse import quote |
|
from bs4 import BeautifulSoup |
|
from selenium import webdriver |
|
from utils import * |
|
|
|
|
|
def append_dict_to_jsonl(dictionary, file_path='output/items.jsonl'):
    """Append *dictionary* as a single JSON line to *file_path*.

    Mirrors save_to_file(): non-ASCII characters are written verbatim
    only for the default 'output/items.jsonl' target; any other path
    gets ASCII-escaped JSON.
    """
    with open(file_path, 'a', encoding='utf-8') as jsonl_file:
        # CONSISTENCY FIX: match save_to_file's ensure_ascii policy, so
        # appending and the later dedup rewrite of the same file do not
        # produce a mix of escaped and unescaped lines.
        json.dump(dictionary, jsonl_file,
                  ensure_ascii=(file_path != 'output/items.jsonl'))
        jsonl_file.write('\n')
|
|
|
|
|
def get_second_links(keyword):
    """Open Taobao's product-list page for *keyword*, scroll until all
    items are loaded, and return the href of every <a class="item">.

    Returns an empty list when Taobao serves its block/landing page
    (detected via the page <title>). The browser is always closed,
    whatever path is taken.
    """
    option = webdriver.ChromeOptions()
    # Hide the usual automation fingerprints so the site behaves as it
    # would for a regular browser session.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")

    browser = webdriver.Chrome(options=option)
    try:
        browser.get(f'https://www.taobao.com/list/product/{quote(keyword)}.htm')
        browser.maximize_window()
        skip_captcha()

        # Scroll down in 500px steps to trigger lazy loading, pausing a
        # random interval between steps, until the page reports that all
        # items are loaded or that loading failed.
        step = 1
        while True:
            browser.execute_script(
                f'window.scrollTo(0, {step * 500})')
            step += 1
            rand_sleep()
            page_str = str(browser.page_source)
            if "<title>taobao | 淘寶</title>" in page_str:
                # Redirected to the block/landing page: give up on this
                # keyword.
                return []

            if "已加载全部商品" in page_str:
                break

            if "加载错误,请重试" in page_str:
                break

        html_content = browser.page_source
    finally:
        # BUG FIX: the original never quit the driver, leaking a Chrome
        # process per call (including on the early return above).
        browser.quit()

    soup = BeautifulSoup(html_content, 'html.parser')
    return [link.get('href') for link in soup.find_all('a', class_='item')]
|
|
|
|
|
def read_lines_to_array(file_path):
    """Read *file_path* and return a list of its lines, each stripped of
    surrounding whitespace.

    The parent directory is created first (presumably so a fresh
    checkout has a place to put the input file — the open() below still
    raises if the file itself is missing).
    """
    create_dir('./' + os.path.dirname(file_path))
    with open(file_path, 'r', encoding='utf-8') as file:
        # Comprehension replaces the original manual append loop.
        return [line.strip() for line in file]
|
|
|
|
|
def save_to_file(data_list, file_path='output/items.jsonl'):
    """Write every item of *data_list* as one JSON line to *file_path*,
    overwriting any existing content.

    Non-ASCII characters are kept verbatim only for the default
    'output/items.jsonl' target; any other path gets ASCII-escaped JSON.
    """
    keep_unicode = (file_path == 'output/items.jsonl')
    with open(file_path, 'w', encoding='utf-8') as out:
        for entry in data_list:
            serialized = json.dumps(entry, ensure_ascii=not keep_unicode)
            out.write(serialized + '\n')
|
|
|
|
|
def rm_duplicates_by_key(file_path='output/items.jsonl', key_to_check='id'):
    """Deduplicate the JSONL records in *file_path* by *key_to_check*,
    keeping the first occurrence of each key value.

    The deduplicated records are written back to *file_path*; the set of
    key values that occurred more than once is dumped (one JSON value
    per line) to 'output/duplicates.txt'.
    """
    seen_keys = set()
    unique_data = []
    duplicates = set()

    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            key_value = data.get(key_to_check)

            if key_value in seen_keys:
                duplicates.add(key_value)
            else:
                seen_keys.add(key_value)
                unique_data.append(data)

    # BUG FIX: the original always wrote the result to the DEFAULT path,
    # silently ignoring a caller-supplied file_path.
    save_to_file(unique_data, file_path=file_path)
    # sorted() makes the duplicates report deterministic across runs
    # (set iteration order is not); key=str tolerates a None key value.
    save_to_file(sorted(duplicates, key=str),
                 file_path='output/duplicates.txt')
|
|
|
|
|
if __name__ == "__main__":
    # Driver: for each keyword, scrape its product links and append one
    # {'keyword', 'id'} record per item, then dedupe the output by id.
    keywords = read_lines_to_array('input/keywords.txt')
    create_dir('./output')

    for key in keywords:
        for url in get_second_links(key):
            # hrefs are expected to look like
            #   //www.taobao.com/list/item/<id>.htm?spm=...
            # ROBUSTNESS FIX: link.get('href') may be None, and an
            # unexpected href shape made the chained split()[1] raise
            # IndexError and kill the whole crawl — skip such links.
            if not url or '//www.taobao.com/list/item/' not in url:
                continue
            item_id = url.split('.htm?spm=')[0].split(
                '//www.taobao.com/list/item/')[1]
            append_dict_to_jsonl({
                'keyword': key,
                'id': item_id,
            })

    rm_duplicates_by_key()
|
|