from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time

# Global driver to use throughout the script
DRIVER = None


# Wrapper to close the driver if it's been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.quit()  # quit() ends the session and frees the chromedriver process
        DRIVER = None


# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER
    if force_restart:
        close_driver()
    # Setting up the driver
    service = Service()
    options = Options()
    options.add_argument('--headless')  # run in the background so no Chrome window opens
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    DRIVER = webdriver.Chrome(service=service, options=options)


### Function to extract product info from the necessary html and json tags
def get_shopee_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.

    Args:
        product_element: (WebElement) the product whose info needs to be extracted.
        extra_info: (bool) whether to also extract the sales and discount fields.

    Returns:
        info: (dict) a dictionary of info of the product. Every product should
            at least have four pieces of information: name, price, link to the
            product page, and link to the product image.
    """
    info = {'source': 'shopee', 'name': '', 'price': -1, 'product_url': '', 'image': ''}
    # print(product_element.get_attribute('outerHTML'))  # debug: dump the raw product HTML

    # name
    try:
        # Find the title element within the product card
        product_title_element = product_element.find_element(By.CLASS_NAME, "line-clamp-2")
        # Get the text content of the element
        info['name'] = product_title_element.text
        print(info['name'])
    except NoSuchElementException:
        info['name'] = ""

    # price
    try:
        # Find the price element within the product card
        # (use a relative './/' XPath so the search stays inside this element)
        price_element = product_element.find_element(
            By.XPATH, './/div[@class="truncate flex items-baseline"]/span[@class="text-base/5 truncate"]')
        # Get the text content of the element
        price_text = price_element.text
        # Extract the price value (e.g. "120.000" -> 120000)
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))
        print(info['price'])
    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        # Find the link element within the product card
        product_link_element = product_element.find_element(By.XPATH, './/a[@class="contents"]')
        # Get the href attribute of the element
        product_link = product_link_element.get_attribute("href")
        # Extract the URL from the href attribute
        info['product_url'] = product_link
    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # Find the thumbnail image element within the product card
        image_element = product_element.find_element(
            By.XPATH, './/img[@class="inset-y-0 w-full h-full pointer-events-none object-contain absolute"]')
        # Get the src attribute of the element
        info['image'] = image_element.get_attribute("src")
    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # Find the sold-count element within the product card
            sold_element = product_element.find_element(
                By.XPATH, './/div[@class="truncate text-shopee-black87 text-xs min-h-4 flex-shrink-1"]')
            # Get the text content of the element
            info['sales'] = sold_element.text
        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        # discount
        try:
            # Find the discount badge element within the product card
            discount_element = product_element.find_element(
                By.XPATH, './/div[@class="truncate bg-shopee-voucher-yellow text-white leading-4 text-sp10"]')
            # Get the text content of the element
            info['discount'] = discount_element.text
        except (NoSuchElementException, ValueError):
            info['discount'] = '0'

    return info


### Function to scrape all products from a page
def get_shopee_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Shopee website.

    Args:
        page_url: (string) url of the page to scrape
        extra_info: (bool) whether to also extract the sales and discount fields.

    Returns:
        data: (list) a list of dictionaries of product info. If no products
            are found, return an empty list.
    """
    global DRIVER
    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)

    # FIND ALL PRODUCT ITEMS
    # By.CLASS_NAME accepts only a single class name, so match both classes
    # on the product <li> with a CSS selector instead
    products = DRIVER.find_elements(By.CSS_SELECTOR, 'li.col-xs-2-4.shopee-search-item-result__item')
    print(f'Found {len(products)} products')
    if len(products) > 0:
        for product in products:
            product_dict = get_shopee_product_info_single(product, extra_info)
            data.append(product_dict)
    return data


### Function to get product info from a main category
def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products for a search keyword.
    Uses get_shopee_product_info_from_page().

    Args:
        search_product: (string) the keyword to search for
        max_page: (int) an integer denoting the maximum number of pages to
            scrape. Default value is 0 to scrape all pages.
        extra_info: (bool) whether to also extract the sales and discount fields.

    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []
    page_n = 0
    base_url = 'https://shopee.vn/search?keyword=' + search_product
    product_list = get_shopee_product_info_from_page(base_url, extra_info=extra_info)
    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1
        # stop_flag = False if max_page <= 0 else (page_n > max_page)
        stop_flag = max_page > 0 and page_n > max_page  # For stopping the scrape according to max_page
        if stop_flag:
            break
        # Rebuild the url from the base each time so '&page=' parameters don't accumulate
        cat_page_url = base_url + f'&page={page_n}'
        product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
    return products


def scrap_shopee(search_product, num_max_page, extra_info):
    # # test Shopee
    # start_driver()
    # URL = 'https://shopee.vn/search?keyword=megaduo&page=0&sortBy=relevancy'
    # DRIVER.get(URL)
    # time.sleep(3)
    # print(URL)
    # products = DRIVER.find_elements(By.CLASS_NAME, 'shopee-search-item-result')
    # # products = DRIVER.find_element("css selector", 'li[class="col-xs-2-4 shopee-search-item-result__item"]')
    # product = products[0]
    # # Wait for the element to be present on the page
    # info = get_shopee_product_info_single(product, True)
    # print(info)

    start_driver(force_restart=True)
    prod_data = []  # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
    prod_per_cat = get_shopee_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)
    close_driver()  # Close driver when we're done
    return prod_data
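

# A minimal usage sketch (assumes chromedriver is available on PATH, as
# required by webdriver.Chrome above). The keyword 'megaduo' is borrowed from
# the commented-out test code; any Shopee search term works. num_max_page
# limits pagination, and extra_info=True also collects the sales/discount fields.
if __name__ == '__main__':
    results = scrap_shopee('megaduo', num_max_page=2, extra_info=True)
    print(f'Scraped {len(results)} products in total')
    for item in results[:3]:  # preview the first few product dictionaries
        print(item)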