from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time

# Global driver to use throughout the script
DRIVER = None
# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.quit()  # quit() ends the session and closes all windows (close() would leave the driver process running)
        DRIVER = None
# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER
    if force_restart:
        close_driver()
    # Setting up the driver
    service = Service()
    options = Options()
    options.add_argument('--headless')  # run Chrome in the background without opening a browser window
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    DRIVER = webdriver.Chrome(service=service, options=options)
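# Usage sketch for the driver lifecycle (assumes a local Chrome install;
# Selenium 4.6+ can also resolve chromedriver automatically via Selenium Manager):
#   start_driver(force_restart=True)
#   ...  # scrape with DRIVER
#   close_driver()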
### Function to extract product info from the necessary html and json tags
def get_tiki_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.

    Args:
        product_element: (WebElement) the product whose info needs to be
            extracted.
        extra_info: (bool) whether to also extract sales and discount info.

    Returns:
        info: (dict) a dictionary of info of the product. Every product
            should at least have four pieces of information: name, price,
            link to the product page, and link to the product image.
    """
    info = {'source': 'tiki',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}
    # print(product_element.get_attribute('outerHTML'))
    # name
    try:
        # name = product_element.find_element(By.XPATH, ".//div[@class='name']/h3")
        name = product_element.find_element(By.CLASS_NAME, 'name').find_element(By.TAG_NAME, 'h3')
        info['name'] = name.get_attribute('innerHTML').strip()
    except NoSuchElementException:
        # Fall back to the styled-component class name of the <h3> element
        name = product_element.find_element(By.CLASS_NAME, 'style__NameStyled-sc-139nb47-8')
        # Get the text content of the element
        info['name'] = name.text
    # price
    try:
        price = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML')
        # price = product_element.find_element(By.XPATH, ".//div[@class='price-discount__price']").get_attribute('innerHTML')
        info['price'] = int(price.replace('<sup>₫</sup>', '').replace('.', ''))
        # info['price'] = int(re.sub(r'[\.\s₫]', '', price))  # With regex (requires `import re`)
        # info['price'] = int(''.join([c for c in price if c not in '.₫ ']))  # Without regex
    except (NoSuchElementException, ValueError):
        pass
    # link
    try:
        product_link = product_element.get_attribute('href')
        info['product_url'] = product_link
    except NoSuchElementException:
        pass
    # thumbnail
    try:
        # thumbnail = product_element.find_elements(By.XPATH, ".//div[@class='thumbnail']//child::img")[-1]
        # info['image'] = thumbnail.get_attribute('src')
        # Find the <img> element inside the <div> with class "image-wrapper"
        image_div = product_element.find_element(By.CLASS_NAME, 'image-wrapper')
        img_element = image_div.find_element(By.TAG_NAME, 'img')
        # The first entry of the "srcset" attribute holds the image URL
        srcset_value = img_element.get_attribute('srcset')
        image_link = srcset_value.split(',')[0].split(' ')[0]
        info['image'] = image_link
    except NoSuchElementException:
        pass
    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # sales_elem = product_element.find_element(By.XPATH, ".//div[@class='styles__StyledQtySold-sc-732h27-2']")
            # info['sales'] = int(re.sub(r'\D', '', sales_elem.get_attribute('innerHTML')))  # With regex (requires `import re`)
            # Find the <span> element with class "quantity" and keep its raw text
            quantity_span = product_element.find_element(By.CLASS_NAME, 'quantity')
            info['sales'] = quantity_span.text
        except (NoSuchElementException, ValueError):
            info['sales'] = 0
        # # rating
        # try:
        #     # rating = product_element.find_element(By.XPATH, ".//div[@class='average']").get_attribute('style')
        #     rating = product_element.find_element(By.CLASS_NAME, 'average').get_attribute('style')
        #     # info['rating'] = float(re.sub(r'\D', '', rating)) / 100 * 5  # With regex
        #     info['rating'] = float(''.join([c for c in rating if c.isdigit()])) / 100 * 5  # Without regex
        # except NoSuchElementException:
        #     info['rating'] = 0
        # discount
        try:
            # Try to get the discount using its class name
            discount = product_element.find_element(By.CLASS_NAME, 'price-discount__discount').get_attribute('innerHTML')
            info['discount'] = discount.replace('-', '')  # Remove any dashes
        except (NoSuchElementException, ValueError):
            try:
                # Fall back to the styled-component class name
                discount_div = product_element.find_element(By.CLASS_NAME, 'style__DiscountPercentStyled-sc-e9h7mj-1')
                info['discount'] = discount_div.text.replace('-', '')  # Remove any dashes
            except NoSuchElementException:
                # If both attempts fail, set discount to '0'
                info['discount'] = '0'
        # # tiki now
        # try:
        #     info['tiki_now'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-service').find_element(By.CLASS_NAME, 'item'))
        # except NoSuchElementException:
        #     info['tiki_now'] = False
        # # freeship, official seller, and/or trusted seller
        # try:
        #     info['freeship'] = False
        #     info['official'] = False
        #     info['trusted'] = False
        #     thumbnail_tag = product_element.find_element(By.CLASS_NAME, 'thumbnail')
        #     list_img = thumbnail_tag.find_elements(By.TAG_NAME, 'img')
        #     # list_img = product_element.find_elements(By.XPATH, ".//div[@class='thumbnail']/img")
        #     for img in list_img:
        #         if img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/dc/0d/49/3251737db2de83b74eba8a9ad6d03338.png':
        #             info['freeship'] = True
        #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/b9/1f/4b/557eac9c67a4466ccebfa74cde854215.png':
        #             info['official'] = True
        #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/e0/41/da/bb0fc684a838eff5e264ce0534a148f0.png':
        #             info['trusted'] = True
        # except NoSuchElementException:
        #     pass
        # # under price
        # try:
        #     # info['under_price'] = bool(product_element.find_element(By.XPATH, ".//div[@class='badge-under-price']/child::div[@class='item']"))
        #     info['under_price'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-under-price').find_element(By.CLASS_NAME, 'item'))
        # except NoSuchElementException:
        #     info['under_price'] = False
        # # installment
        # try:
        #     # info['installment'] = bool(product_element.find_element(By.XPATH, ".//div[@class='badge-benefits']//child::img[1]"))
        #     info['installment'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-benefits').find_element(By.TAG_NAME, 'img'))
        # except NoSuchElementException:
        #     info['installment'] = False
        # # gift
        # try:
        #     # info['gift'] = bool(product_element.find_element(By.XPATH, ".//div[@class='freegift-list']"))
        #     info['gift'] = bool(product_element.find_element(By.CLASS_NAME, 'freegift-list'))
        # except NoSuchElementException:
        #     info['gift'] = False
    return info
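# Hypothetical shape of the dict returned above (values are illustrative, not real data):
# {'source': 'tiki', 'name': 'Example Product', 'price': 239000,
#  'product_url': 'https://tiki.vn/...', 'image': 'https://salt.tikicdn.com/...',
#  'sales': 'Đã bán 1000+', 'discount': '25%'}  # last two keys only when extra_info=True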
### Function to scrape all products from a page
def get_tiki_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on Tiki website

    Args:
        page_url: (string) url of the page to scrape
        extra_info: (bool) whether to also extract sales and discount info

    Returns:
        data: (list) a list of dictionaries of product info. If no products
            found, return empty list.
    """
    global DRIVER
    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)  # Wait for the page to render before querying elements
    try:
        # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False
    # FIND ALL PRODUCT ITEMS
    # products = DRIVER.find_elements(By.XPATH, "//a[@class='product-item']")
    products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
    print(f'Found {len(products)} products')
    if (not no_product_found) and len(products) > 0:
        for product in products:
            product_dict = get_tiki_product_info_single(product, extra_info)
            data.append(product_dict)
    return data
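# Usage sketch (the search-URL format mirrors the one built in scrap_tiki below;
# the query string here is illustrative):
#   start_driver()
#   items = get_tiki_product_info_from_page('https://tiki.vn/search?sort=default&q="ssd"&page=1')
#   print(len(items))
#   close_driver()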
### Function to get product info from a main category
def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products of a category.
    Uses get_tiki_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
            Default value is 0 to scrape all pages.
        extra_info: (bool) whether to also extract sales and discount info

    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []
    page_n = 1
    cat_page_url = cat_url + f'&page={page_n}'
    print(cat_page_url)
    product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1
        # Stop the scrape according to max_page (0 means no page limit)
        stop_flag = max_page > 0 and page_n > max_page
        if stop_flag:
            break
        cat_page_url = cat_url + f'&page={page_n}'
        print(cat_page_url)
        product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
    return products
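# Pagination sketch: for cat_url = 'https://tiki.vn/search?sort=default&q="ssd"' (illustrative),
# this visits ...&page=1, ...&page=2, ... and stops at the first empty page,
# or once page_n exceeds max_page (when max_page > 0).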
def scrap_tiki(search_product, num_max_page, extra_info):
    """
    Scrape Tiki search results for a product query.

    Args:
        search_product: (string) the search term to query on Tiki
        num_max_page: (int) maximum number of result pages to scrape (0 for all)
        extra_info: (bool) whether to also extract sales and discount info

    Returns:
        prod_data: a list of dictionaries of product info
    """
    start_driver(force_restart=True)
    url = 'https://tiki.vn/search?sort=default&q="' + search_product + '"'
    prod_data = []  # Store the product info dictionaries in here
    prod_per_cat = get_tiki_product_info_from_category(url, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)
    close_driver()  # Close driver when we're done
    return prod_data
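# Minimal end-to-end sketch; the search term and page limit are illustrative:
if __name__ == '__main__':
    results = scrap_tiki('bluetooth speaker', num_max_page=2, extra_info=True)
    print(f'Scraped {len(results)} products')
    if results:
        print(results[0])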