Spaces: Running
Linh Vuu committed • 39a482a
Parent(s): a088ba6
added files
Browse files:
- README copy.md +12 -0
- app.py +85 -0
- requirements.txt +4 -0
- scraper_lazada.py +208 -0
- scraper_shopee.py +229 -0
- scraper_tiki.py +283 -0
README copy.md
ADDED
@@ -0,0 +1,12 @@
---
title: PriceComparison
emoji: π
colorFrom: pink
colorTo: yellow
sdk: streamlit
sdk_version: 1.33.0
app_file: app.py
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,85 @@
from scraper_tiki import *
from scraper_lazada import *
from scraper_shopee import *
import pandas as pd
import streamlit as st

# # test Tiki
# start_driver()
# DRIVER.get('https://tiki.vn/search?sort=price%2Casc&q=megaduo')
# time.sleep(3)
# products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
# product = products[2]
# info = get_tiki_product_info_single(product, True)
# print(info)

# # test Lazada
# start_driver()
# DRIVER.get('https://www.lazada.vn/catalog/?page=1&q=megaduo&sort=priceasc')
# time.sleep(3)
# products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
# product = products[2]
# info = get_lazada_product_info_single(product, True)
# print(info)

def main():

    st.subheader("Price Comparison (So Sánh Giá)")

    with st.form(key="user_input_form"):
        search_product = st.text_input("What would you like to buy? (Bạn muốn mua gì?)")
        submit_button = st.form_submit_button(label="Search")

    if submit_button:
        print('Scraping', search_product)
        # search_product = "megaduo"
        # search_product = input("Search for what? ")
        num_max_page = 1
        extra_info = True
        n_products_to_view = 5  # Change this to check more products
        col_to_display = ['name', 'price', 'product_url', 'image']

        st.subheader("Shopee")
        shopee_data = scrap_shopee(search_product, num_max_page, extra_info)
        if shopee_data:
            df_shopee = pd.DataFrame(data=shopee_data, columns=shopee_data[0].keys())
            print(df_shopee.head())
            st.write(df_shopee[col_to_display].sort_values(by='price').head(n_products_to_view))
        else:
            df_shopee = pd.DataFrame(columns=col_to_display)
            st.write("Not found.")

        st.subheader("Lazada")
        lazada_data = scrap_lazada(search_product, num_max_page, extra_info)
        if lazada_data:
            df_lazada = pd.DataFrame(data=lazada_data, columns=lazada_data[0].keys())
            print(df_lazada.head())
            st.write(df_lazada[col_to_display].sort_values(by='price').head(n_products_to_view))
        else:
            df_lazada = pd.DataFrame(columns=col_to_display)
            st.write("Not found.")

        st.subheader("Tiki")
        tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
        if tiki_data:
            df_tiki = pd.DataFrame(data=tiki_data, columns=tiki_data[0].keys())
            print(df_tiki.head())
            st.write(df_tiki[col_to_display].sort_values(by='price').head(n_products_to_view))
        else:
            df_tiki = pd.DataFrame(columns=col_to_display)
            st.write("Not found.")

        # Merge the three dataframes
        merged_df = pd.concat([df_tiki, df_lazada, df_shopee])

        # Sort the merged dataframe by price
        sorted_merged_df = merged_df.sort_values(by='price')

        print(sorted_merged_df.head(n_products_to_view))
        st.subheader("All sites, sorted by price ascending (Sắp xếp theo giá tăng dần)")
        st.write(sorted_merged_df.head(n_products_to_view))

if __name__ == "__main__":
    main()
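
To make the final merge step concrete, here is a minimal, self-contained sketch of the same pd.concat + sort_values pattern main() uses; the rows are hypothetical stand-ins for real scraper output.

import pandas as pd

# Hypothetical rows standing in for scraper output
df_tiki = pd.DataFrame([{'source': 'tiki', 'name': 'A', 'price': 120000}])
df_lazada = pd.DataFrame([{'source': 'lazada', 'name': 'B', 'price': 95000}])
df_shopee = pd.DataFrame([{'source': 'shopee', 'name': 'C', 'price': 110000}])

# Stack the three frames, then sort by price ascending, as main() does
merged_df = pd.concat([df_tiki, df_lazada, df_shopee], ignore_index=True)
print(merged_df.sort_values(by='price').head(5))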
requirements.txt
ADDED
@@ -0,0 +1,4 @@
selenium
pandas
streamlit==1.13.0
altair==4.1.0
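
Note that this file pins streamlit==1.13.0 while the README declares sdk_version: 1.33.0, so one of the two will take precedence at build time. A quick runtime check (a sketch, assuming a standard Python environment) shows which versions actually resolved:

import pandas, selenium, streamlit

print('streamlit', streamlit.__version__)  # compare against the 1.13.0 pin above
print('selenium', selenium.__version__)
print('pandas', pandas.__version__)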
scraper_lazada.py
ADDED
@@ -0,0 +1,208 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time

# Global driver to use throughout the script
DRIVER = None

# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.close()
        DRIVER = None

# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER

    if force_restart:
        close_driver()

    # Set up the driver to run headless, so no Chrome window opens
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(options=options)

### Function to extract product info from the necessary html and json tags
def get_lazada_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.
    Args:
        product_element: (WebDriverElement) the product whose info needs to be
            extracted.
    Returns:
        info: (dict) a dictionary of info of the product. Every product
            should at least have four pieces of information: name, price,
            link to the product page, and link to the product image.
    """
    info = {'source': 'lazada',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}

    # name
    try:
        # Find the <a> element within the <div class="RfADt">. The leading "."
        # keeps the XPath relative to product_element; without it, Selenium
        # searches the whole document and every product returns the first match.
        product_title_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")

        # Get the text content of the <a> element
        info['name'] = product_title_element.text

    except NoSuchElementException:
        info['name'] = ""

    # price
    try:
        # Find the <span> element with class "ooOxS" within the <div class="aBrP0">
        price_element = product_element.find_element(By.XPATH, ".//div[@class='aBrP0']/span[@class='ooOxS']")

        # Get the text content of the <span> element
        price_text = price_element.text

        # Extract the price value
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))

    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        # Find the <a> element within the <div class="RfADt">
        product_link_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")

        # Get the href attribute of the <a> element
        product_link = product_link_element.get_attribute("href")

        # Strip the protocol from the href attribute
        info['product_url'] = product_link.split("//")[1]

    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # Find the <img> element within the <div class="_95X4G">
        image_element = product_element.find_element(By.XPATH, ".//div[@class='_95X4G']/a/div/img")

        # Get the src attribute of the <img> element
        info['image'] = image_element.get_attribute("src")

    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # Find the first <span> within the <div class="_6uN7R">
            sold_element = product_element.find_element(By.XPATH, ".//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")

            # Get the text content of the <span> element
            info['sales'] = sold_element.text

        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        # discount
        try:
            # Find the <span> element within the <div class="WNoq3">
            discount_element = product_element.find_element(By.XPATH, ".//div[@class='WNoq3']/span[@class='IcOsH']")

            # Get the text content of the <span> element
            info['discount'] = discount_element.text

        except (NoSuchElementException, ValueError):
            info['discount'] = '0'

    return info

### Function to scrape all products from a page
def get_lazada_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Lazada website
    Args:
        page_url: (string) url of the page to scrape
    Returns:
        data: (list) a list of dictionaries of product info. If no products
            are found, return an empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)

    try:
        # NOTE: this "no products found" class name was carried over from the
        # Tiki scraper and may not match Lazada's empty-results page.
        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False

    # Find all product items
    products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
    print(f'Found {len(products)} products')

    if (not no_product_found) and len(products) > 0:
        for i in products:
            product_dict = get_lazada_product_info_single(i, extra_info)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products of a category.
    Uses get_lazada_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
            Default value is 0 to scrape all pages.
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 1
    # Use '&' when the search url already carries a query string
    sep = '&' if '?' in cat_url else '?'
    cat_page_url = cat_url + f'{sep}page={page_n}'
    product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)

    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1

        stop_flag = max_page > 0 and page_n > max_page  # Stop the scrape according to max_page
        if stop_flag:
            break

        cat_page_url = cat_url + f'{sep}page={page_n}'
        product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)

    return products

def scrap_lazada(search_product, num_max_page, extra_info):

    start_driver(force_restart=True)

    url = 'https://www.lazada.vn/catalog/?q=' + search_product

    prod_data = []  # Store the product info dictionaries in here

    prod_per_cat = get_lazada_product_info_from_category(url, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)

    close_driver()  # Close driver when we're done

    return prod_data
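
A minimal usage sketch for the entry point above, assuming Chrome and a matching chromedriver are available on the machine:

from scraper_lazada import scrap_lazada

# One results page for a sample query; extra_info=True also collects sales/discount
products = scrap_lazada('megaduo', num_max_page=1, extra_info=True)
for p in products[:5]:
    print(p['source'], p['price'], p['name'])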
scraper_shopee.py
ADDED
@@ -0,0 +1,229 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Global driver to use throughout the script
DRIVER = None

# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.close()
        DRIVER = None

# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER

    if force_restart:
        close_driver()

    # Set up the driver to run headless, so no Chrome window opens
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(options=options)

### Function to extract product info from the necessary html and json tags
def get_shopee_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.
    Args:
        product_element: (WebDriverElement) the product whose info needs to be
            extracted.
    Returns:
        info: (dict) a dictionary of info of the product. Every product
            should at least have four pieces of information: name, price,
            link to the product page, and link to the product image.
    """
    info = {'source': 'shopee',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}
    print(product_element.get_attribute('outerHTML'))  # debug output

    # name
    try:
        # Find the title element by its class name
        product_title_element = product_element.find_element(By.CLASS_NAME, "line-clamp-2")

        # Get the text content of the element
        info['name'] = product_title_element.text
        print(info['name'])

    except NoSuchElementException:
        info['name'] = ""

    # price
    try:
        # Find the price <span>. The leading "." keeps the XPath relative to
        # product_element; without it, Selenium searches the whole document.
        price_element = product_element.find_element(By.XPATH, './/div[@class="truncate flex items-baseline"]/span[@class="text-base/5 truncate"]')

        # Get the text content of the <span> element
        price_text = price_element.text

        # Extract the price value
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))
        print(info['price'])

    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        # Find the <a> element wrapping the product card
        product_link_element = product_element.find_element(By.XPATH, './/a[@class="contents"]')

        # Get the href attribute of the <a> element
        info['product_url'] = product_link_element.get_attribute("href")

    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # Find the product <img> element
        image_element = product_element.find_element(By.XPATH, './/img[@class="inset-y-0 w-full h-full pointer-events-none object-contain absolute"]')

        # Get the src attribute of the <img> element
        info['image'] = image_element.get_attribute("src")

    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # Find the "sold" element and take its text
            sold_element = product_element.find_element(By.XPATH, './/div[@class="truncate text-shopee-black87 text-xs min-h-4 flex-shrink-1"]')
            info['sales'] = sold_element.text

        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        # discount
        try:
            # Find the discount badge and take its text
            discount_element = product_element.find_element(By.XPATH, './/div[@class="truncate bg-shopee-voucher-yellow text-white leading-4 text-sp10"]')
            info['discount'] = discount_element.text

        except (NoSuchElementException, ValueError):
            info['discount'] = '0'

    return info

### Function to scrape all products from a page
def get_shopee_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Shopee website
    Args:
        page_url: (string) url of the page to scrape
    Returns:
        data: (list) a list of dictionaries of product info. If no products
            are found, return an empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)

    try:
        # NOTE: this "no products found" class name was carried over from the
        # Tiki scraper and may not match Shopee's empty-results page.
        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False

    # Find all product items. By.CLASS_NAME cannot match a compound
    # (space-separated) class, so use a CSS selector instead.
    products = DRIVER.find_elements(By.CSS_SELECTOR, 'li.col-xs-2-4.shopee-search-item-result__item')
    print(f'Found {len(products)} products')

    if (not no_product_found) and len(products) > 0:
        for i in products:
            product_dict = get_shopee_product_info_single(i, extra_info)
            print(product_dict)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products of a category.
    Uses get_shopee_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
            Default value is 0 to scrape all pages.
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 1
    # Use '&' when the search url already carries a query string
    sep = '&' if '?' in cat_url else '?'
    cat_page_url = cat_url + f'{sep}page={page_n}'
    product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)

    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1

        stop_flag = max_page > 0 and page_n > max_page  # Stop the scrape according to max_page
        if stop_flag:
            break

        cat_page_url = cat_url + f'{sep}page={page_n}'
        product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)

    return products

def scrap_shopee(search_product, num_max_page, extra_info):

    # # test Shopee
    # start_driver()
    # URL = 'https://shopee.vn/search?keyword=megaduo&page=0&sortBy=relevancy'
    # DRIVER.get(URL)
    # time.sleep(3)
    # print(URL)
    # products = DRIVER.find_elements(By.CLASS_NAME, 'shopee-search-item-result')
    # product = products[0]
    # info = get_shopee_product_info_single(product, True)
    # print(info)

    start_driver(force_restart=True)

    url = 'https://shopee.vn/search?keyword=' + search_product

    prod_data = []  # Store the product info dictionaries in here

    prod_per_cat = get_shopee_product_info_from_category(url, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)

    close_driver()  # Close driver when we're done

    return prod_data
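
This file imports WebDriverWait and expected_conditions but still relies on a fixed time.sleep(3). A hedged sketch of an explicit wait that could replace the fixed sleep; the selector is the one already used above, and the helper name is hypothetical:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_results(driver, timeout=10):
    # Block until at least one search-result item is present, up to `timeout` seconds,
    # instead of always sleeping a fixed three seconds
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'li.col-xs-2-4.shopee-search-item-result__item')))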
scraper_tiki.py
ADDED
@@ -0,0 +1,283 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time

# Global driver to use throughout the script
DRIVER = None

# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.close()
        DRIVER = None

# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER

    if force_restart:
        close_driver()

    # Set up the driver to run headless, so no Chrome window opens
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(options=options)

### Function to extract product info from the necessary html and json tags
def get_tiki_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.
    Args:
        product_element: (WebDriverElement) the product whose info needs to be
            extracted.
    Returns:
        info: (dict) a dictionary of info of the product. Every product
            should at least have four pieces of information: name, price,
            link to the product page, and link to the product image.
    """
    info = {'source': 'tiki',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}

    # name
    try:
        name = product_element.find_element(By.CLASS_NAME, 'name').find_element(By.TAG_NAME, 'h3')
        info['name'] = name.get_attribute('innerHTML').strip()

    except NoSuchElementException:
        # Fall back to the styled-component class when the plain 'name' class is absent
        name = product_element.find_element(By.CLASS_NAME, 'style__NameStyled-sc-139nb47-8')
        info['name'] = name.text

    # price
    try:
        price = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML')

        # Strip the currency markup and thousands separators, e.g. '189.000<sup>₫</sup>' -> 189000
        info['price'] = int(price.replace('<sup>₫</sup>', '').replace('.', ''))
    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        info['product_url'] = product_element.get_attribute('href')
    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # Find the <div> element with class "image-wrapper"
        image_div = product_element.find_element(By.CLASS_NAME, 'image-wrapper')

        # Find the <img> element within the <div> element
        img_element = image_div.find_element(By.TAG_NAME, 'img')

        # Get the value of the "srcset" attribute
        srcset_value = img_element.get_attribute('srcset')

        # Extract the link of the first image from the srcset value
        info['image'] = srcset_value.split(',')[0].split(' ')[0]

    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # Find the <span> element with class "quantity" and take its text
            quantity_span = product_element.find_element(By.CLASS_NAME, 'quantity')
            info['sales'] = quantity_span.text

        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        # # rating
        # try:
        #     rating = product_element.find_element(By.CLASS_NAME, 'average').get_attribute('style')
        #     info['rating'] = float(''.join([c for c in rating if c.isdigit()]))/100*5
        # except NoSuchElementException:
        #     info['rating'] = 0

        # discount
        try:
            # Try to get the discount using its class name
            discount = product_element.find_element(By.CLASS_NAME, 'price-discount__discount').get_attribute('innerHTML')
            info['discount'] = discount.replace('-', '')  # Remove any dashes

        except (NoSuchElementException, ValueError):
            try:
                # Fall back to the styled-component discount class
                discount_div = product_element.find_element(By.CLASS_NAME, 'style__DiscountPercentStyled-sc-e9h7mj-1')
                info['discount'] = discount_div.text.replace('-', '')  # Remove any dashes

            except NoSuchElementException:
                # If both attempts fail, set discount to 0
                info['discount'] = '0'

        # # tiki now
        # try:
        #     info['tiki_now'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-service').find_element(By.CLASS_NAME, 'item'))
        # except NoSuchElementException:
        #     info['tiki_now'] = False

        # # freeship, official seller, and/or trusted seller
        # try:
        #     info['freeship'] = False
        #     info['official'] = False
        #     info['trusted'] = False
        #     thumbnail_tag = product_element.find_element(By.CLASS_NAME, 'thumbnail')
        #     list_img = thumbnail_tag.find_elements(By.TAG_NAME, 'img')
        #     for img in list_img:
        #         if img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/dc/0d/49/3251737db2de83b74eba8a9ad6d03338.png':
        #             info['freeship'] = True
        #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/b9/1f/4b/557eac9c67a4466ccebfa74cde854215.png':
        #             info['official'] = True
        #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/e0/41/da/bb0fc684a838eff5e264ce0534a148f0.png':
        #             info['trusted'] = True
        # except NoSuchElementException:
        #     pass

        # # under price
        # try:
        #     info['under_price'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-under-price').find_element(By.CLASS_NAME, 'item'))
        # except NoSuchElementException:
        #     info['under_price'] = False

        # # installment
        # try:
        #     info['installment'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-benefits').find_element(By.TAG_NAME, 'img'))
        # except NoSuchElementException:
        #     info['installment'] = False

        # # gift
        # try:
        #     info['gift'] = bool(product_element.find_element(By.CLASS_NAME, 'freegift-list'))
        # except NoSuchElementException:
        #     info['gift'] = False

    return info

### Function to scrape all products from a page
def get_tiki_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Tiki website
    Args:
        page_url: (string) url of the page to scrape
    Returns:
        data: (list) a list of dictionaries of product info. If no products
            are found, return an empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)

    try:
        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False

    # Find all product items
    products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
    print(f'Found {len(products)} products')

    if (not no_product_found) and len(products) > 0:
        for i in products:
            product_dict = get_tiki_product_info_single(i, extra_info)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products of a category.
    Uses get_tiki_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
            Default value is 0 to scrape all pages.
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 1
    # Use '&' when the search url already carries a query string
    sep = '&' if '?' in cat_url else '?'
    cat_page_url = cat_url + f'{sep}page={page_n}'
    product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)

    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1

        stop_flag = max_page > 0 and page_n > max_page  # Stop the scrape according to max_page
        if stop_flag:
            break

        cat_page_url = cat_url + f'{sep}page={page_n}'
        product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)

    return products

def scrap_tiki(search_product, num_max_page, extra_info):

    start_driver(force_restart=True)

    # Quote the query for an exact-phrase search
    url = 'https://tiki.vn/search?sort=default&q="' + search_product + '"'

    prod_data = []  # Store the product info dictionaries in here

    prod_per_cat = get_tiki_product_info_from_category(url, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)
    close_driver()  # Close driver when we're done

    return prod_data
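
For reference, the Tiki price parsing above turns markup like 189.000<sup>₫</sup> into an integer number of dong. A self-contained check of that logic, with a hypothetical sample string:

raw = '189.000<sup>₫</sup>'  # hypothetical Tiki price cell
price = int(raw.replace('<sup>₫</sup>', '').replace('.', ''))
assert price == 189000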