Spaces:

LinhVuu
/

price-comparison

Sleeping

price-comparison / scraper_tiki.py

Linh Vuu

updated files

6d0cb99 3 months ago

No virus

11.9 kB

	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options
	from selenium.common.exceptions import NoSuchElementException
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.service import Service
	import time

	# Module-wide Selenium driver shared by every scraping function in this file.
	# It is None until start_driver() assigns a Chrome instance to it, and
	# close_driver() sets it back to None.
	DRIVER = None

	# Wrapper to shut the shared driver down if one has been created
	def close_driver():
	    """Shut down the shared Selenium driver, if one exists.

	    ``quit()`` is used rather than ``close()``: ``close()`` only closes the
	    current browser window, while ``quit()`` ends the whole WebDriver
	    session and stops the driver process, so no headless Chrome is left
	    running after a scrape.

	    The global is reset to ``None`` even if ``quit()`` raises, so a later
	    ``start_driver()`` never reuses a half-dead driver. Calling this when
	    no driver exists is a no-op.
	    """
	    global DRIVER
	    if DRIVER is None:
	        return
	    try:
	        DRIVER.quit()
	    finally:
	        # Always forget the old handle, whether or not shutdown succeeded.
	        DRIVER = None

	# Function to (re)start driver
	def start_driver(force_restart=False):
	    """Create the shared headless Chrome driver, or reuse the running one.

	    Args:
	        force_restart: (bool) when True, an existing driver is closed via
	            close_driver() and a fresh one is created. When False and a
	            driver already exists, it is kept as-is; launching a second
	            browser here would orphan the first one.
	    Returns:
	        None. The driver is stored in the module-level DRIVER.
	    """
	    global DRIVER

	    if DRIVER is not None:
	        if not force_restart:
	            return
	        close_driver()

	    # Setting up the driver
	    service = Service()
	    options = Options()
	    options.add_argument('-headless')  # run in the background, no visible browser window
	    options.add_argument('-no-sandbox')
	    options.add_argument('-disable-dev-shm-usage')

	    DRIVER = webdriver.Chrome(service=service, options=options)

	### Function to extract product info from the necessary html and json tags
	def _tiki_find(root, *steps):
	    """Follow a chain of (by, value) lookups from root; return None if any step is missing."""
	    node = root
	    try:
	        for by, value in steps:
	            node = node.find_element(by, value)
	    except NoSuchElementException:
	        return None
	    return node


	def _tiki_inner_html(element):
	    """Return the element's innerHTML, or '' when the attribute is absent."""
	    return element.get_attribute('innerHTML') or ''


	def _tiki_first_srcset_url(img_element):
	    """Return the first URL listed in the image's srcset, falling back to src, else ''."""
	    srcset = img_element.get_attribute('srcset') or ''
	    # A srcset looks like "url1 1x, url2 2x"; keep the URL of the first candidate.
	    tokens = srcset.split(',')[0].split()
	    if tokens:
	        return tokens[0]
	    return img_element.get_attribute('src') or ''


	def get_tiki_product_info_single(product_element, extra_info):
	    """
	    Extract info from a single product element from the driver.

	    Every lookup is best-effort: a missing tag or attribute leaves the
	    corresponding default in place instead of raising, so one odd product
	    card cannot abort the scrape of a whole page.

	    Args:
	        product_element: (WebElement) the product card whose info needs to
	            be extracted.
	        extra_info: (bool) when truthy, also collect 'sales' and 'discount'.
	    Returns:
	        info: (dict) always contains 'source' ('tiki'), 'name' (str, ''
	            if not found), 'price' (int, -1 if not found or unparsable),
	            'product_url' (str, '' if the element has no href) and 'image'
	            (str, '' if not found). With extra_info it also contains
	            'sales' (the raw text of the quantity tag, or the int 0 when
	            the tag is missing) and 'discount' (text with dashes removed,
	            or '0' when no discount tag is found).
	    """
	    info = {'source': 'tiki',
	            'name': '',
	            'price': -1,
	            'product_url': '',
	            'image': ''}

	    # name: older layout first (div.name > h3), then the styled-component layout
	    heading = _tiki_find(product_element, (By.CLASS_NAME, 'name'), (By.TAG_NAME, 'h3'))
	    if heading is not None:
	        info['name'] = _tiki_inner_html(heading).strip()
	    else:
	        styled_name = _tiki_find(product_element, (By.CLASS_NAME, 'style__NameStyled-sc-139nb47-8'))
	        if styled_name is not None:
	            info['name'] = styled_name.text

	    # price: innerHTML looks like "1.234.000<sup>₫</sup>"
	    price_element = _tiki_find(product_element, (By.CLASS_NAME, 'price-discount__price'))
	    if price_element is not None:
	        raw_price = _tiki_inner_html(price_element)
	        try:
	            info['price'] = int(raw_price.replace('<sup>₫</sup>', '').replace('.', ''))
	        except ValueError:
	            pass  # unexpected markup: keep the -1 sentinel

	    # link: get_attribute returns None (it does not raise) when href is absent
	    info['product_url'] = product_element.get_attribute('href') or ''

	    # thumbnail: <img> inside the "image-wrapper" div
	    img_element = _tiki_find(product_element, (By.CLASS_NAME, 'image-wrapper'), (By.TAG_NAME, 'img'))
	    if img_element is not None:
	        info['image'] = _tiki_first_srcset_url(img_element)

	    # If we decide to get extra information
	    if extra_info:
	        # sales: text of the "quantity" span
	        quantity_span = _tiki_find(product_element, (By.CLASS_NAME, 'quantity'))
	        info['sales'] = quantity_span.text if quantity_span is not None else 0

	        # discount: class-name layout first, then the styled-component layout
	        discount_element = _tiki_find(product_element, (By.CLASS_NAME, 'price-discount__discount'))
	        if discount_element is not None:
	            info['discount'] = _tiki_inner_html(discount_element).replace('-', '')
	        else:
	            styled_discount = _tiki_find(
	                product_element, (By.CLASS_NAME, 'style__DiscountPercentStyled-sc-e9h7mj-1'))
	            if styled_discount is not None:
	                info['discount'] = styled_discount.text.replace('-', '')
	            else:
	                info['discount'] = '0'

	    return info


	### Function to scrape all products from a page
	def get_tiki_product_info_from_page(page_url, extra_info=False, wait_seconds=3):
	    """
	    Extract info from all products of a specific page_url on Tiki website.

	    Args:
	        page_url: (string) url of the page to scrape
	        extra_info: (bool) forwarded to get_tiki_product_info_single() to
	            request the optional fields.
	        wait_seconds: (number) how long to sleep after loading the page
	            before reading it. Defaults to 3.
	    Returns:
	        data: (list) a list of dictionary of products info. If no products
	            found, return empty list.
	    Raises:
	        RuntimeError: if the shared driver has not been started.
	    """
	    if DRIVER is None:
	        raise RuntimeError('Tiki driver is not running; call start_driver() first')

	    DRIVER.get(page_url)  # Use the driver to load the product page
	    time.sleep(wait_seconds)

	    # If the "not found" view is on the page, report an empty page.
	    try:
	        DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0')
	    except NoSuchElementException:
	        pass
	    else:
	        print("EMPTY PAGE")
	        return []

	    # FIND ALL PRODUCT ITEMS
	    products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
	    print(f'Found {len(products)} products')

	    return [get_tiki_product_info_single(item, extra_info) for item in products]

	### Function to get product info from a main category
	def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
	    '''
	    Scrape for multiple pages of products of a category.
	    Uses get_tiki_product_info_from_page().

	    Pages are requested starting from page 1 until a page yields no
	    products or max_page pages have been scraped.

	    Args:
	        cat_url: (string) a url string of a category. The page number is
	            appended as a "page" query parameter, using "&" when the url
	            already has a query string and "?" otherwise.
	        max_page: (int) an integer denoting the maximum number of pages to scrape.
	            Default value is 0 to scrape all pages.
	        extra_info: (bool) forwarded to get_tiki_product_info_from_page().
	    Returns:
	        products: a list in which every element is a dictionary of one product's information
	    '''
	    # Appending "&page=" to a url without "?" would produce a malformed url.
	    separator = '&' if '?' in cat_url else '?'

	    products = []
	    page_n = 1
	    while True:
	        cat_page_url = f'{cat_url}{separator}page={page_n}'
	        print(cat_page_url)
	        product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
	        if not product_list:
	            break  # an empty page marks the end of the listing

	        products.extend(product_list)
	        page_n += 1

	        # max_page <= 0 means "no limit"
	        if max_page > 0 and page_n > max_page:
	            break

	    return products

	def scrap_tiki(search_product, num_max_page, extra_info):
	    """
	    Search Tiki for a product and scrape the result pages.

	    A fresh driver is started for every call and is always closed
	    afterwards, including when scraping raises.

	    Args:
	        search_product: (string) the search term. It is wrapped in double
	            quotes and percent-encoded, so characters such as '&', '#'
	            or spaces cannot corrupt the query string.
	        num_max_page: (int) maximum number of result pages to scrape;
	            0 scrapes every page.
	        extra_info: (bool) whether to collect the optional product fields.
	    Returns:
	        prod_data: (list) one dictionary of product info per product found.
	    """
	    # Imported here so this function does not depend on edits to the
	    # file-level import block.
	    from urllib.parse import quote

	    query = quote(f'"{search_product}"', safe='')
	    url = 'https://tiki.vn/search?sort=default&q=' + query

	    start_driver(force_restart=True)
	    try:
	        prod_data = list(get_tiki_product_info_from_category(url, num_max_page, extra_info=extra_info))
	    finally:
	        close_driver()  # Close driver when we're done, even on failure

	    return prod_data