# Crawl Taobao product-list pages for each keyword in input/keywords.txt,
# collect item ids into output/items.jsonl, then drop duplicate ids.
import json
import os
from urllib.parse import quote

from bs4 import BeautifulSoup
from selenium import webdriver

# utils is expected to provide create_dir, skip_captcha and rand_sleep
from utils import *


def append_dict_to_jsonl(dictionary, file_path='output/items.jsonl'):
    with open(file_path, 'a', encoding='utf-8') as jsonl_file:
        json.dump(dictionary, jsonl_file)
        jsonl_file.write('\n')


def get_second_links(keyword):
    # Selenium setup: hide the usual automation fingerprints
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument('--disable-blink-features=AutomationControlled')
    # option.add_argument('--headless')
    browser = webdriver.Chrome(options=option)
    try:
        browser.get(f'https://www.taobao.com/list/product/{quote(keyword)}.htm')
        # browser.minimize_window()
        browser.maximize_window()
        skip_captcha()
        # Scroll down the product page step by step until all items are loaded
        i = 1
        while True:
            browser.execute_script(f'window.scrollTo(0, {i * 500})')
            i += 1
            rand_sleep()
            page_str = str(browser.page_source)
            if 'taobao | 淘寶' in page_str:
                # Anti-bot / verification page: give up on this keyword
                return []
            if '已加载全部商品' in page_str:  # "all items loaded"
                break
            if '加载错误,请重试' in page_str:  # "loading error, please retry"
                break
        # Parse the fully loaded page with BeautifulSoup
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        return [link.get('href') for link in soup.find_all('a', class_='item')]
    finally:
        browser.quit()


def read_lines_to_array(file_path):
    create_dir('./' + os.path.dirname(file_path))
    lines_array = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            lines_array.append(line.strip())
    return lines_array


def save_to_file(data_list, file_path='output/items.jsonl'):
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for data in data_list:
            # Keep non-ASCII (Chinese keywords) readable in items.jsonl;
            # escape it everywhere else
            json.dump(data, jsonl_file,
                      ensure_ascii=(file_path != 'output/items.jsonl'))
            jsonl_file.write('\n')


def rm_duplicates_by_key(file_path='output/items.jsonl', key_to_check='id'):
    data_set = set()
    unique_data = []
    duplicates = set()
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            # Use the value of the given key as the deduplication identifier
            key_value = data.get(key_to_check)
            if key_value in data_set:
                # Identifier already seen: the record is a duplicate
                duplicates.add(key_value)
            else:
                data_set.add(key_value)
                unique_data.append(data)
    save_to_file(unique_data)
    save_to_file(duplicates, file_path='output/duplicates.txt')


if __name__ == '__main__':
    keywords = read_lines_to_array('input/keywords.txt')
    create_dir('./output')
    for key in keywords:
        for url in get_second_links(key):
            # hrefs look like //www.taobao.com/list/item/<id>.htm?spm=...;
            # extract the item id between the two markers
            append_dict_to_jsonl({
                'keyword': key,
                'id': url.split('.htm?spm=')[0]
                         .split('//www.taobao.com/list/item/')[1]
            })
    rm_duplicates_by_key()
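
# Hypothetical usage sketch: with input/keywords.txt containing one keyword
# per line, e.g.
#   手机壳
# a run appends one record per scraped link to output/items.jsonl, such as
#   {"keyword": "手机壳", "id": "1234567890"}
# (the id value here is a made-up placeholder), and rm_duplicates_by_key()
# then rewrites items.jsonl with unique ids only and lists the repeated ids
# in output/duplicates.txt.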