import html
import re
import time
from urllib.parse import quote

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Global driver shared by every function in this script.
DRIVER = None

# Patterns used to reduce price markup to its digits.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_NON_DIGIT_RE = re.compile(r'\D')


def close_driver():
    """Shut down the global driver if one has been created."""
    global DRIVER
    if DRIVER is not None:
        try:
            # quit() ends the whole browser session; close() would only close
            # the current window and leave the driver process running.
            DRIVER.quit()
        finally:
            DRIVER = None


def start_driver(force_restart=False):
    """
    Create the global headless Chrome driver.

    Args:
        force_restart: (bool) when True, close any existing driver and create
            a new one. When False, an already running driver is kept as-is so
            it is not overwritten and leaked.
    """
    global DRIVER
    if force_restart:
        close_driver()
    elif DRIVER is not None:
        return

    # Setting up the driver
    service = Service()
    options = Options()
    options.add_argument('-headless')  # run in the background, without a browser window
    options.add_argument('-no-sandbox')
    options.add_argument('-disable-dev-shm-usage')
    DRIVER = webdriver.Chrome(service=service, options=options)


def _parse_price(raw_price):
    """Return the integer made of the digits in a price snippet.

    Markup tags, the thousands separators, whitespace and the currency symbol
    are all discarded. Raises ValueError when no digit is left.
    """
    text = html.unescape(_HTML_TAG_RE.sub('', raw_price or ''))
    return int(_NON_DIGIT_RE.sub('', text))


def _first_srcset_url(srcset):
    """Return the URL of the first candidate of a srcset value, or ''."""
    if not srcset:
        return ''
    first_candidate = srcset.split(',')[0].split()
    return first_candidate[0] if first_candidate else ''


### Function to extract product info from the necessary html and json tags
def get_tiki_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.

    Args:
        product_element: (WebElement) the product whose info needs to be
            extracted.
        extra_info: (bool) when truthy, also add the 'sales' and 'discount'
            keys to the result.

    Returns:
        info: (dict) a dictionary of info of the product. Every product has
            the keys 'source', 'name', 'price', 'product_url' and 'image'.
            A piece of information that cannot be found keeps its default
            ('' for strings, -1 for the price). With extra_info, 'sales' is
            the text of the quantity element (or the int 0 when missing) and
            'discount' is the discount text without dashes (or '0').
    """
    info = {'source': 'tiki', 'name': '', 'price': -1, 'product_url': '', 'image': ''}

    # name: try the "name" container first, then the styled-component class.
    try:
        name = product_element.find_element(By.CLASS_NAME, 'name').find_element(By.TAG_NAME, 'h3')
        info['name'] = (name.get_attribute('innerHTML') or '').strip()
    except NoSuchElementException:
        try:
            name = product_element.find_element(By.CLASS_NAME, 'style__NameStyled-sc-139nb47-8')
            info['name'] = name.text
        except NoSuchElementException:
            # Neither layout matched: keep the default empty name instead of
            # aborting the scrape of the whole page.
            pass

    # price
    try:
        price = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML')
        info['price'] = _parse_price(price)
    except (NoSuchElementException, ValueError):
        pass

    # link: get_attribute returns None (it does not raise) when href is absent.
    info['product_url'] = product_element.get_attribute('href') or ''

    # thumbnail: first candidate of the <img> srcset inside "image-wrapper",
    # falling back to the plain src attribute when there is no srcset.
    try:
        image_div = product_element.find_element(By.CLASS_NAME, 'image-wrapper')
        img_element = image_div.find_element(By.TAG_NAME, 'img')
        image_link = _first_srcset_url(img_element.get_attribute('srcset'))
        info['image'] = image_link or (img_element.get_attribute('src') or '')
    except NoSuchElementException:
        pass

    # If we decide to get extra information.
    # NOTE: rating, tiki-now, freeship/official/trusted seller, under-price,
    # installment and gift badges are not extracted.
    if extra_info:
        # sales
        # NOTE(review): the found value is the element's text (str) while the
        # fallback is the int 0; kept as-is so existing callers are unaffected.
        try:
            quantity_span = product_element.find_element(By.CLASS_NAME, 'quantity')
            info['sales'] = quantity_span.text
        except NoSuchElementException:
            info['sales'] = 0

        # discount: two known class names are tried in turn.
        try:
            discount = product_element.find_element(By.CLASS_NAME, 'price-discount__discount').get_attribute('innerHTML')
            info['discount'] = (discount or '').replace('-', '')  # Remove any dashes
        except NoSuchElementException:
            try:
                discount_div = product_element.find_element(By.CLASS_NAME, 'style__DiscountPercentStyled-sc-e9h7mj-1')
                info['discount'] = discount_div.text.replace('-', '')  # Remove any dashes
            except NoSuchElementException:
                # If both attempts fail, set discount to '0'
                info['discount'] = '0'

    return info


### Function to scrape all products from a page
def get_tiki_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on Tiki website.

    The global driver must already have been created with start_driver().

    Args:
        page_url: (string) url of the page to scrape
        extra_info: (bool) forwarded to get_tiki_product_info_single().

    Returns:
        data: (list) a list of dictionary of products info. If no products
            found, return empty list.
    """
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)  # fixed pause after loading the page, before querying it

    # find_elements returns an empty list instead of raising when nothing matches.
    if DRIVER.find_elements(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'):
        print("EMPTY PAGE")
        return []

    # FIND ALL PRODUCT ITEMS
    products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
    print(f'Found {len(products)} products')
    return [get_tiki_product_info_single(product, extra_info) for product in products]


### Function to get product info from a main category
def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape for multiple pages of products of a category.
    Uses get_tiki_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category. '&page=<n>' is appended
            to it, so it must already contain a query string.
        max_page: (int) an integer denoting the maximum number of pages to
            scrape. Default value is 0 to scrape all pages.
        extra_info: (bool) forwarded to get_tiki_product_info_from_page().

    Returns:
        products: a list in which every element is a dictionary of one
            product's information
    '''
    products = []
    page_n = 1
    while True:
        cat_page_url = cat_url + f'&page={page_n}'
        print(cat_page_url)
        product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
        if not product_list:
            # An empty page marks the end of the category.
            break
        products.extend(product_list)
        page_n += 1
        # For stopping the scrape according to max_page
        if max_page > 0 and page_n > max_page:
            break
    return products


def scrap_tiki(search_product, num_max_page, extra_info):
    """
    Search Tiki for a product and scrape the result pages.

    Args:
        search_product: (string) the search term. It is wrapped in double
            quotes and percent-encoded before being placed in the URL.
        num_max_page: (int) maximum number of result pages to scrape; 0 or
            less scrapes until an empty page is reached.
        extra_info: (bool) whether to collect the extra fields per product.

    Returns:
        prod_data: (list) one info dictionary per product found.
    """
    start_driver(force_restart=True)
    try:
        # Encode the quoted term so characters such as '&', '#' or '+' in the
        # search text cannot alter the structure of the query string.
        url = 'https://tiki.vn/search?sort=default&q=' + quote(f'"{search_product}"', safe='')
        prod_data = []  # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
        prod_data.extend(get_tiki_product_info_from_category(url, num_max_page, extra_info=extra_info))
        return prod_data
    finally:
        close_driver()  # Close driver when we're done, even if scraping failed