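"""Lazada scraper for the price-comparison project.

Drives a headless Chrome browser via Selenium to search
https://www.lazada.vn and collect each result's name, price, product URL,
and thumbnail image, plus optional sales and discount fields.
"""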
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
# Global driver to use throughout the script
DRIVER = None
# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.quit()  # quit() ends the whole session, not just the current window
        DRIVER = None
# Function to (re)start driver
def start_driver(force_restart=False):
    global DRIVER
    if force_restart:
        close_driver()
    if DRIVER is None:  # avoid leaking a driver that is already running
        # Set up a headless driver so no Chrome window opens; it runs in the background
        service = Service()
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        DRIVER = webdriver.Chrome(service=service, options=options)
### Function to extract product info from the necessary html and json tags
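# NOTE: the XPaths below rely on Lazada's obfuscated CSS class names
# ('RfADt', 'aBrP0', 'ooOxS', ...), which can change whenever the site is
# redeployed; if the scraper starts returning empty fields, re-inspect the
# search results page and update these selectors.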
def get_lazada_product_info_single(product_element, extra_info):
"""
Extract info from a single product element from the driver.
Args:
product_item: (WebDriverElement) the product whose info needs to be
extracted.
Returns:
info: (dict) a dictionary of info of the product. Every product
should at least have four pieces of information: name, price,
link to the product page, and link to the product image.
"""
info = {'source': 'lazada',
'name':'',
'price':-1,
'product_url':'',
'image':''}
    # name
    try:
        # Find the <a> element within the <div class="RfADt">; the leading '.'
        # makes the XPath relative to this product element, not the whole page
        product_title_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")
        # Get the text content of the <a> element
        info['name'] = product_title_element.text
    except NoSuchElementException:
        pass
    # price
    try:
        # Find the <span class="ooOxS"> element within the <div class="aBrP0">
        price_element = product_element.find_element(By.XPATH, ".//div[@class='aBrP0']/span[@class='ooOxS']")
        price_text = price_element.text
        # Take the leading token and strip the '.' thousands separators,
        # e.g. "123.000 ₫" -> 123000
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))
    except (NoSuchElementException, ValueError):
        pass
    # link
    try:
        # The same <a> element under <div class="RfADt"> carries the product link
        product_link_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")
        # Get the href attribute of the <a> element
        product_link = product_link_element.get_attribute("href")
        # Drop the scheme prefix (everything up to and including "//")
        info['product_url'] = product_link.split("//")[1]
    except NoSuchElementException:
        pass
    # thumbnail
    try:
        # Find the <img> element within the <div class="_95X4G">
        image_element = product_element.find_element(By.XPATH, ".//div[@class='_95X4G']/a/div/img")
        # Get the src attribute of the <img> element
        info['image'] = image_element.get_attribute("src")
    except NoSuchElementException:
        pass
    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # Find the first <span> inside <span class="_1cEkb"> within <div class="_6uN7R">
            sold_element = product_element.find_element(By.XPATH, ".//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
            info['sales'] = sold_element.text
        except NoSuchElementException:
            info['sales'] = 0
        # discount
        try:
            # Find the <span class="IcOsH"> element within the <div class="WNoq3">
            discount_element = product_element.find_element(By.XPATH, ".//div[@class='WNoq3']/span[@class='IcOsH']")
            info['discount'] = discount_element.text
        except NoSuchElementException:
            info['discount'] = '0'
return info
### Function to scrape all products from a page
def get_lazada_product_info_from_page(page_url, extra_info=False):
"""
Extract info from all products of a specfic page_url on Tiki website
Args:
page_url: (string) url of the page to scrape
Returns:
data: (list) a list of dictionary of products info. If no products
found, return empty list.
"""
    global DRIVER
    if DRIVER is None:
        start_driver()  # make sure a driver exists before using it
    data = []
    DRIVER.get(page_url)  # Use the driver to load the product page
    time.sleep(3)  # crude wait for the page's JavaScript to render the results
    # Find all product items ('Bm3ON' is the class of each result card)
    products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
    print(f'Found {len(products)} products')
    for product in products:
        product_dict = get_lazada_product_info_single(product, extra_info)
        data.append(product_dict)
    return data
### Function to get product info from a main category
def get_lazada_product_info_from_category(search_product, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products for a search term.
    Uses get_lazada_product_info_from_page().
    Args:
        search_product: (string) the search term to query on Lazada
        max_page: (int) the maximum number of pages to scrape.
            Default value is 0 to scrape all pages.
        extra_info: (bool) whether to also extract sales and discount info
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []
    page_n = 1
    query = quote_plus(search_product)  # URL-encode the search term (spaces etc.)
    cat_url = 'https://www.lazada.vn/catalog/?q=' + query
    product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1
        # Stop according to max_page (max_page <= 0 means scrape all pages)
        if max_page > 0 and page_n > max_page:
            break
        # f-string converts page_n to str (plain '+' concatenation of an int would raise TypeError)
        cat_url = f'https://www.lazada.vn/catalog/?page={page_n}&q={query}'
        product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
    return products
def scrap_lazada(search_product, num_max_page, extra_info):
    start_driver(force_restart=True)
    prod_data = []  # store the product info dictionaries in here
    try:
        prod_per_cat = get_lazada_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
        prod_data.extend(prod_per_cat)
    finally:
        close_driver()  # close the driver even if the scrape raised an error
    return prod_data
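
# Example usage sketch: run this file directly to try the scraper. The query
# "iphone 15" and the two-page limit are illustrative values, not part of the
# module's API.
if __name__ == '__main__':
    results = scrap_lazada('iphone 15', num_max_page=2, extra_info=True)
    for product in results:
        print(product['name'], product['price'], product['product_url'])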