import json
import os
from urllib.parse import quote

from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

# Project helpers: skip_captcha, sleeps, create_dir, rm_duplicates_by_key.
from utils import *

# Maximum number of scroll steps per keyword page.
MAX_PAGE = 618


def append_dict_to_jsonl(dictionary, file_path='./output/items.jsonl'):
    """Append a single dict as one JSON line to the output file."""
    with open(file_path, 'a', encoding='utf-8') as jsonl_file:
        json.dump(dictionary, jsonl_file, ensure_ascii=False)
        jsonl_file.write('\n')


def extract_item_id(url):
    """Extract the numeric item id from a link of the form
    //www.taobao.com/list/item/<id>.htm?spm=..."""
    return url.split('.htm?spm=')[0].split('//www.taobao.com/list/item/')[1]


def get_second_links(keyword):
    # Configure Selenium to reduce the chance of automation detection.
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")
    # option.add_argument('--headless')
    browser = webdriver.Chrome(options=option)
    browser.get(f'https://www.taobao.com/list/product/{quote(keyword)}.htm')
    # browser.minimize_window()
    browser.maximize_window()
    skip_captcha()

    # Scroll through the product page step by step until all items are loaded.
    for i in tqdm(range(1, MAX_PAGE + 1)):
        browser.execute_script(f'window.scrollTo(0, {i * 500})')
        sleeps(0.5, 1.0)
        page_str = str(browser.page_source)
        # These Chinese literals match text on the live Taobao page,
        # so they must stay untranslated.
        if "taobao | 淘寶" in page_str:  # captcha page title
            print('Hit a captcha, aborting this keyword...')
            browser.quit()
            return []
        if "已加载全部商品" in page_str:  # "all items loaded" banner
            print('All items loaded!')
            break
        if "加载错误,请重试" in page_str:  # "loading error, please retry" banner
            print('Loading error, crawl interrupted')
            break

    html_content = browser.page_source
    browser.quit()

    # Collect the href of every item link on the (partially or fully) loaded page.
    soup = BeautifulSoup(html_content, 'html.parser')
    return [link.get('href')
            for link in soup.find_all('a', class_='item')
            if link.get('href')]


def read_lines_to_array(file_path):
    """Read a text file into a list of stripped lines."""
    create_dir('./' + os.path.dirname(file_path))
    lines_array = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            lines_array.append(line.strip())
    return lines_array


def product_to_items():
    keywords = read_lines_to_array('./input/keywords.txt')
    create_dir('./output')
    for key in keywords:
        urls = list(get_second_links(key))
        print(f'Saving urls into jsonl for keyword [{key}]')
        for url in tqdm(urls):
            tmp_dict = {
                'keyword': key,
                'id': extract_item_id(url),
            }
            append_dict_to_jsonl(tmp_dict)
    rm_duplicates_by_key()


if __name__ == "__main__":
    product_to_items()
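# ---------------------------------------------------------------------------
# Usage sketch (the keyword and URL below are hypothetical examples, not data
# produced by this script): given ./input/keywords.txt with one keyword per
# line, each scraped link is expected to look like the argument below, so
#
#     >>> extract_item_id('//www.taobao.com/list/item/123456789.htm?spm=x')
#     '123456789'
#
# and the corresponding line appended to ./output/items.jsonl would be
#
#     {"keyword": "<keyword>", "id": "123456789"}
#
# rm_duplicates_by_key() (imported from utils, not shown in this file) is
# assumed to deduplicate these JSONL records by their "id" field afterwards.
# ---------------------------------------------------------------------------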