from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import time

# Global driver to use throughout the script
DRIVER = None

# Wrapper to shut down the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.quit()  # quit() ends the session and shuts down the chromedriver process
    DRIVER = None

# Function to (re)start driver
def start_driver(force_restart=False):
    global DRIVER

    if force_restart:
        close_driver()

    # Setting up the driver
    service = Service()
    options = Options()
    options.add_argument('--headless')  # run Chrome in the background without opening a browser window
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(service=service, options=options)

### Function to extract product info from the relevant HTML elements of a product card
def get_shopee_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.
    Args:
        product_item: (WebDriverElement) the product whose info needs to be
                        extracted.
    Returns:
        info: (dict) a dictionary of info of the product. Every product
                should at least have four pieces of information: name, price,
                link to the product page, and link to the product image.
    """
    info = {'source': 'shopee',
            'name':'',
            'price':-1,
            'product_url':'',
            'image':''}
    # print(product_element.get_attribute('outerHTML'))  # uncomment to inspect the raw product HTML when debugging

    # name
    try:
        # Find the title element within the product card
        product_title_element = product_element.find_element(By.CLASS_NAME, "line-clamp-2")

        # Get the text content of the title element
        info['name'] = product_title_element.text
        print(info['name'])

    except NoSuchElementException:
        info['name'] = ""


    # price
    try:
        # Find the price <span> within the product card; the leading "." makes the
        # XPath relative to product_element instead of searching the whole page
        price_element = product_element.find_element(By.XPATH, './/div[@class="truncate flex items-baseline"]/span[@class="text-base/5 truncate"]')

        # Get the text content of the <span> element
        price_text = price_element.text

        # Extract the price value
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))
        print(info['price'])

    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        # Find the product <a> element within the product card (relative XPath)
        product_link_element = product_element.find_element(By.XPATH, './/a[@class="contents"]')

        # Get the href attribute of the <a> element
        product_link = product_link_element.get_attribute("href")

        # Extract the URL from the href attribute
        info['product_url'] = product_link 

    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # Find the <img> element within the product card (relative XPath)
        image_element = product_element.find_element(By.XPATH, './/img[@class="inset-y-0 w-full h-full pointer-events-none object-contain absolute"]')

        # Get the src attribute of the <img> element
        info['image'] = image_element.get_attribute("src")

    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # Find the "sold" count <div> within the product card (relative XPath)
            sold_element = product_element.find_element(By.XPATH, './/div[@class="truncate text-shopee-black87 text-xs min-h-4 flex-shrink-1"]')

            # Get the text content of the element
            info['sales'] = sold_element.text

        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        try:
            # Find the discount badge <div> within the product card (relative XPath)
            discount_element = product_element.find_element(By.XPATH, './/div[@class="truncate bg-shopee-voucher-yellow text-white leading-4 text-sp10"]')

            # Get the text content of the element
            info['discount'] = discount_element.text

        except (NoSuchElementException, ValueError):
            info['discount'] = '0'

    return info

### Function to scrape all products from a page
def get_shopee_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Shopee website.
    Args:
        page_url: (string) url of the page to scrape
        extra_info: (bool) whether to also extract sales and discount info
    Returns:
        data: (list) a list of dictionaries of product info. If no products
                are found, return an empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url) # Use the driver to get info from the product page
    time.sleep(3) # crude wait for the results to render; an explicit WebDriverWait would be more robust

    # FIND ALL PRODUCT ITEMS
    # By.CLASS_NAME expects a single class name, so use a CSS selector for the compound class
    products = DRIVER.find_elements(By.CSS_SELECTOR, '.col-xs-2-4.shopee-search-item-result__item')
    print(f'Found {len(products)} products')
    # print(products)  # uncomment to inspect the raw WebElement list when debugging

    if len(products)>0:
        for i in products:
            product_dict = get_shopee_product_info_single(i, extra_info)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products for a search keyword.
    Uses get_shopee_product_info_from_page().

    Args:
        search_product: (string) the keyword to search for on Shopee
        max_page: (int) an integer denoting the maximum number of pages to scrape.
                  Default value is 0 to scrape all pages.
        extra_info: (bool) whether to also extract sales and discount info
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 0
    base_url = 'https://shopee.vn/search?keyword=' + search_product
    product_list = get_shopee_product_info_from_page(base_url, extra_info=extra_info)

    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1

        # Stop once max_page pages have been scraped (max_page <= 0 means scrape all pages)
        stop_flag = max_page > 0 and page_n >= max_page
        if stop_flag:
            break

        # Rebuild the URL from the base each iteration; appending to the previous URL
        # would accumulate multiple &page= parameters
        cat_page_url = base_url + f'&page={page_n}'
        product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)

    return products

def scrap_shopee(search_product, num_max_page, extra_info):
    '''
    Top-level entry point: start a fresh driver, scrape up to num_max_page pages of
    results for search_product, and close the driver when done.
    '''
    # # #test Shopee
    # start_driver()
    # URL = 'https://shopee.vn/search?keyword=megaduo&page=0&sortBy=relevancy'
    # DRIVER.get(URL)
    # time.sleep(3)
    # print(URL)
    # products = DRIVER.find_elements(By.CLASS_NAME, 'shopee-search-item-result')
    # # products = DRIVER.find_element("css selector", 'li[class="col-xs-2-4 shopee-search-item-result__item"]')
    # product = products[0]
    # # Wait for the element to be present on the page
    # info = get_shopee_product_info_single(product, True)
    # print(info)

    start_driver(force_restart=True)

    prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE

    prod_per_cat = get_shopee_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)

    close_driver() # Close driver when we're done

    return prod_data
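
# Example usage sketch (an assumption, not part of the original script): the keyword
# 'megaduo' mirrors the commented-out test above; adjust the keyword, page limit,
# and output handling as needed.
if __name__ == '__main__':
    results = scrap_shopee('megaduo', num_max_page=2, extra_info=True)
    print(f'Scraped {len(results)} products')
    for product in results[:5]:
        print(product['name'], product['price'], product['product_url'])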