# human-detector / product2item.py
# Snapshot metadata from the original file-hosting page:
#   uploader: George ("upl all codes"), commit b5f33fd, 3.17 kB
#   (page chrome: raw / history blame / "No virus")
import json
from urllib.parse import quote
from bs4 import BeautifulSoup
from selenium import webdriver
from utils import *
def append_dict_to_jsonl(dictionary, file_path='output/items.jsonl'):
    """Append *dictionary* as a single JSON line to *file_path*.

    The file is opened in append mode, so repeated calls accumulate records.
    CONSISTENCY FIX: write with ensure_ascii=False to match save_to_file's
    handling of the default items file — keywords contain Chinese text, and
    the original default (ensure_ascii=True) escaped it on append only to
    have rm_duplicates_by_key rewrite it unescaped later.
    """
    with open(file_path, 'a', encoding='utf-8') as jsonl_file:
        json.dump(dictionary, jsonl_file, ensure_ascii=False)
        jsonl_file.write('\n')
def get_second_links(keyword):
    """Open the Taobao product-list page for *keyword*, scroll until every
    item is loaded, and return the href of each ``<a class="item">`` link.

    Returns an empty list when the site redirects to its captcha/landing
    page instead of the product list.
    """
    # Hide the usual automation fingerprints from the page's bot detection.
    opts = webdriver.ChromeOptions()
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    opts.add_argument("--disable-blink-features=AutomationControlled")
    # opts.add_argument('--headless')
    browser = webdriver.Chrome(options=opts)
    browser.get(f'https://www.taobao.com/list/product/{quote(keyword)}.htm')
    # browser.minimize_window()
    browser.maximize_window()
    skip_captcha()
    # Scroll down in 500px steps until the page reports that all items are
    # loaded (or a load error shows up) — the list is lazily populated.
    step = 1
    while True:
        browser.execute_script(f'window.scrollTo(0, {step * 500})')
        step += 1
        rand_sleep()
        page = str(browser.page_source)
        if "<title>taobao | 淘寶</title>" in page:
            # Bounced to the captcha/landing page: nothing to harvest.
            return []
        if "已加载全部商品" in page or "加载错误,请重试" in page:
            break
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    return [anchor.get('href') for anchor in soup.find_all('a', class_='item')]
def read_lines_to_array(file_path):
    """Read *file_path* and return a list of its lines, whitespace-stripped."""
    # NOTE(review): creating the parent directory before *reading* looks odd —
    # presumably utils.create_dir is a no-op when the dir exists; confirm.
    create_dir('./' + os.path.dirname(file_path))
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]
def save_to_file(data_list, file_path='output/items.jsonl'):
    """Write each element of *data_list* as one JSON line to *file_path*.

    Only the default items file keeps raw (non-escaped) unicode; any other
    target path gets ASCII-escaped JSON — mirroring the original contract.
    """
    keep_unicode = (file_path == 'output/items.jsonl')
    with open(file_path, 'w', encoding='utf-8') as out:
        for record in data_list:
            out.write(json.dumps(record, ensure_ascii=not keep_unicode))
            out.write('\n')
def rm_duplicates_by_key(file_path='output/items.jsonl', key_to_check='id'):
    """Drop records from *file_path* whose *key_to_check* value repeats.

    Keeps the first occurrence of each key value, rewrites *file_path* with
    the unique records, and writes the duplicated key values (one per line)
    to output/duplicates.txt.
    """
    seen = set()
    unique_data = []
    dup_seen = set()
    duplicates = []  # first-seen order, so the report file is deterministic
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            # The key's value is the identity used for duplicate detection.
            key_value = data.get(key_to_check)
            if key_value in seen:
                if key_value not in dup_seen:
                    dup_seen.add(key_value)
                    duplicates.append(key_value)
            else:
                seen.add(key_value)
                unique_data.append(data)
    # BUG FIX: the deduplicated records were previously written to the
    # *default* path even when a custom file_path was supplied.
    save_to_file(unique_data, file_path=file_path)
    save_to_file(duplicates, file_path='output/duplicates.txt')
if __name__ == "__main__":
    # Crawl every keyword's product page, record each item id, then dedup.
    keywords = read_lines_to_array('input/keywords.txt')
    create_dir('./output')
    for key in keywords:
        for url in get_second_links(key):
            # hrefs look like //www.taobao.com/list/item/<id>.htm?spm=...
            if not url:
                continue  # <a class="item"> with no href yields None
            try:
                item_id = url.split('.htm?spm=')[0].split(
                    '//www.taobao.com/list/item/')[1]
            except IndexError:
                # ROBUSTNESS FIX: an unexpected link format used to raise
                # and abort the entire crawl — skip the link instead.
                continue
            append_dict_to_jsonl({
                'keyword': key,
                'id': item_id
            })
    rm_duplicates_by_key()