Spaces:
Sleeping
Sleeping
Linh Vuu
committed on
Commit
•
6d0cb99
1
Parent(s):
3f1beca
updated files
Browse files- __pycache__/scraper_lazada.cpython-39.pyc +0 -0
- __pycache__/scraper_shopee.cpython-39.pyc +0 -0
- __pycache__/scraper_tiki.cpython-39.pyc +0 -0
- app.py +2 -2
- requirements.txt +3 -4
- scraper_lazada.py +20 -30
- scraper_shopee.py +15 -37
- scraper_tiki.py +7 -3
__pycache__/scraper_lazada.cpython-39.pyc
ADDED
Binary file (4.45 kB). View file
|
|
__pycache__/scraper_shopee.cpython-39.pyc
ADDED
Binary file (4.85 kB). View file
|
|
__pycache__/scraper_tiki.cpython-39.pyc
ADDED
Binary file (4.62 kB). View file
|
|
app.py
CHANGED
@@ -60,7 +60,6 @@ def main():
|
|
60 |
df_lazada = pd.DataFrame(columns = col_to_display)
|
61 |
st.write("Not found.")
|
62 |
|
63 |
-
|
64 |
st.subheader("Tiki")
|
65 |
tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
|
66 |
if tiki_data:
|
@@ -72,7 +71,8 @@ def main():
|
|
72 |
st.write("Not found.")
|
73 |
|
74 |
# Merge the per-site dataframes
|
75 |
-
merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
|
|
|
76 |
|
77 |
# Sort the merged dataframe by price
|
78 |
sorted_merged_df = merged_df.sort_values(by='price')
|
|
|
60 |
df_lazada = pd.DataFrame(columns = col_to_display)
|
61 |
st.write("Not found.")
|
62 |
|
|
|
63 |
st.subheader("Tiki")
|
64 |
tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
|
65 |
if tiki_data:
|
|
|
71 |
st.write("Not found.")
|
72 |
|
73 |
# Merge the per-site dataframes
|
74 |
+
# merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
|
75 |
+
merged_df = pd.concat([df_lazada])
|
76 |
|
77 |
# Sort the merged dataframe by price
|
78 |
sorted_merged_df = merged_df.sort_values(by='price')
|
requirements.txt
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
selenium
|
2 |
-
pandas
|
3 |
streamlit==1.13.0
|
4 |
-
altair==4.2.0
|
5 |
-
webdriver-manager==3.7.1
|
|
|
1 |
+
selenium
|
2 |
+
pandas
|
3 |
streamlit==1.13.0
|
4 |
+
altair==4.2.0
|
|
scraper_lazada.py
CHANGED
@@ -2,6 +2,7 @@ from selenium import webdriver
|
|
2 |
from selenium.webdriver.chrome.options import Options
|
3 |
from selenium.common.exceptions import NoSuchElementException
|
4 |
from selenium.webdriver.common.by import By
|
|
|
5 |
import time
|
6 |
|
7 |
# Global driver to use throughout the script
|
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
|
|
22 |
close_driver()
|
23 |
|
24 |
# Setting up the driver
|
|
|
25 |
options = Options()
|
26 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
27 |
options.add_argument('-no-sandbox')
|
28 |
options.add_argument('-disable-dev-shm-usage')
|
29 |
|
30 |
-
DRIVER = webdriver.Chrome(options=options)
|
31 |
|
32 |
### Function to extract product info from the necessary html and json tags
|
33 |
def get_lazada_product_info_single(product_element, extra_info):
|
@@ -51,7 +53,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
51 |
try:
|
52 |
# Find the <a> element within the <div class="RfADt">
|
53 |
product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
54 |
-
|
55 |
# Get the text content of the <a> element
|
56 |
info['name'] = product_title_element.text
|
57 |
|
@@ -63,7 +65,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
63 |
try:
|
64 |
# Find the <span> element with class "ooOxS" within the <div class="aBrP0">
|
65 |
price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
|
66 |
-
|
67 |
# Get the text content of the <span> element
|
68 |
price_text = price_element.text
|
69 |
|
@@ -77,10 +79,10 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
77 |
try:
|
78 |
# Find the <a> element within the <div class="RfADt">
|
79 |
product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
80 |
-
|
81 |
# Get the href attribute of the <a> element
|
82 |
product_link = product_link_element.get_attribute("href")
|
83 |
-
|
84 |
# Extract the URL from the href attribute
|
85 |
info['product_url'] = product_link.split("//")[1]
|
86 |
|
@@ -91,7 +93,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
91 |
try:
|
92 |
# Find the <img> element within the <div class="_95X4G">
|
93 |
image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
|
94 |
-
|
95 |
# Get the src attribute of the <img> element
|
96 |
info['image'] = image_element.get_attribute("src")
|
97 |
|
@@ -104,7 +106,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
104 |
try:
|
105 |
# Find the <span> element within the <div class="_6uN7R">
|
106 |
sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
|
107 |
-
|
108 |
# Get the text content of the <span> element
|
109 |
info['sales'] = sold_element.text
|
110 |
|
@@ -114,7 +116,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
114 |
try:
|
115 |
# Find the <span> element within the <div class="WNoq3">
|
116 |
discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
|
117 |
-
|
118 |
# Get the text content of the <span> element
|
119 |
info['discount'] = discount_element.text
|
120 |
|
@@ -134,32 +136,24 @@ def get_lazada_product_info_from_page(page_url, extra_info=False):
|
|
134 |
found, return empty list.
|
135 |
"""
|
136 |
global DRIVER
|
137 |
-
|
138 |
data = []
|
139 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
140 |
time.sleep(3)
|
141 |
|
142 |
-
try:
|
143 |
-
# no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
|
144 |
-
no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
|
145 |
-
print("EMPTY PAGE")
|
146 |
-
return data
|
147 |
-
except NoSuchElementException:
|
148 |
-
no_product_found = False
|
149 |
-
|
150 |
-
|
151 |
# FIND ALL PRODUCT ITEMS
|
152 |
products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
|
|
|
153 |
print(f'Found {len(products)} products')
|
154 |
|
155 |
-
if
|
156 |
for i in products:
|
157 |
product_dict = get_lazada_product_info_single(i, extra_info)
|
158 |
data.append(product_dict)
|
159 |
return data
|
160 |
|
161 |
### Function to get product info from a main category
|
162 |
-
def get_lazada_product_info_from_category(
|
163 |
'''
|
164 |
Scrape for multiple pages of products of a category.
|
165 |
Uses get_product_info_from_page().
|
@@ -172,10 +166,10 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
172 |
products: a list in which every element is a dictionary of one product's information
|
173 |
'''
|
174 |
products = []
|
175 |
-
|
176 |
page_n = 1
|
177 |
-
|
178 |
-
|
|
|
179 |
|
180 |
while len(product_list)>0:
|
181 |
products.extend(product_list)
|
@@ -185,9 +179,8 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
185 |
stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
|
186 |
if stop_flag:
|
187 |
break
|
188 |
-
|
189 |
-
|
190 |
-
product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
|
191 |
|
192 |
return products
|
193 |
|
@@ -195,12 +188,9 @@ def scrap_lazada(search_product, num_max_page, extra_info):
|
|
195 |
|
196 |
start_driver(force_restart=True)
|
197 |
|
198 |
-
url = 'https://www.lazada.vn/catalog/?q=' + search_product
|
199 |
-
|
200 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
201 |
|
202 |
-
|
203 |
-
prod_per_cat = get_lazada_product_info_from_category(url, num_max_page, extra_info=extra_info)
|
204 |
prod_data.extend(prod_per_cat)
|
205 |
|
206 |
close_driver() # Close driver when we're done
|
|
|
2 |
from selenium.webdriver.chrome.options import Options
|
3 |
from selenium.common.exceptions import NoSuchElementException
|
4 |
from selenium.webdriver.common.by import By
|
5 |
+
from selenium.webdriver.chrome.service import Service
|
6 |
import time
|
7 |
|
8 |
# Global driver to use throughout the script
|
|
|
23 |
close_driver()
|
24 |
|
25 |
# Setting up the driver
|
26 |
+
service = Service()
|
27 |
options = Options()
|
28 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
29 |
options.add_argument('-no-sandbox')
|
30 |
options.add_argument('-disable-dev-shm-usage')
|
31 |
|
32 |
+
DRIVER = webdriver.Chrome(service=service, options=options)
|
33 |
|
34 |
### Function to extract product info from the necessary html and json tags
|
35 |
def get_lazada_product_info_single(product_element, extra_info):
|
|
|
53 |
try:
|
54 |
# Find the <a> element within the <div class="RfADt">
|
55 |
product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
56 |
+
|
57 |
# Get the text content of the <a> element
|
58 |
info['name'] = product_title_element.text
|
59 |
|
|
|
65 |
try:
|
66 |
# Find the <span> element with class "ooOxS" within the <div class="aBrP0">
|
67 |
price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
|
68 |
+
|
69 |
# Get the text content of the <span> element
|
70 |
price_text = price_element.text
|
71 |
|
|
|
79 |
try:
|
80 |
# Find the <a> element within the <div class="RfADt">
|
81 |
product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
82 |
+
|
83 |
# Get the href attribute of the <a> element
|
84 |
product_link = product_link_element.get_attribute("href")
|
85 |
+
|
86 |
# Extract the URL from the href attribute
|
87 |
info['product_url'] = product_link.split("//")[1]
|
88 |
|
|
|
93 |
try:
|
94 |
# Find the <img> element within the <div class="_95X4G">
|
95 |
image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
|
96 |
+
|
97 |
# Get the src attribute of the <img> element
|
98 |
info['image'] = image_element.get_attribute("src")
|
99 |
|
|
|
106 |
try:
|
107 |
# Find the <span> element within the <div class="_6uN7R">
|
108 |
sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
|
109 |
+
|
110 |
# Get the text content of the <span> element
|
111 |
info['sales'] = sold_element.text
|
112 |
|
|
|
116 |
try:
|
117 |
# Find the <span> element within the <div class="WNoq3">
|
118 |
discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
|
119 |
+
|
120 |
# Get the text content of the <span> element
|
121 |
info['discount'] = discount_element.text
|
122 |
|
|
|
136 |
found, return empty list.
|
137 |
"""
|
138 |
global DRIVER
|
139 |
+
|
140 |
data = []
|
141 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
142 |
time.sleep(3)
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
# FIND ALL PRODUCT ITEMS
|
145 |
products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
|
146 |
+
print(products)
|
147 |
print(f'Found {len(products)} products')
|
148 |
|
149 |
+
if len(products)>0:
|
150 |
for i in products:
|
151 |
product_dict = get_lazada_product_info_single(i, extra_info)
|
152 |
data.append(product_dict)
|
153 |
return data
|
154 |
|
155 |
### Function to get product info from a main category
|
156 |
+
def get_lazada_product_info_from_category(search_product, max_page=0, extra_info=False):
|
157 |
'''
|
158 |
Scrape for multiple pages of products of a category.
|
159 |
Uses get_product_info_from_page().
|
|
|
166 |
products: a list in which every element is a dictionary of one product's information
|
167 |
'''
|
168 |
products = []
|
|
|
169 |
page_n = 1
|
170 |
+
cat_url = 'https://www.lazada.vn/catalog/?q=' + search_product
|
171 |
+
|
172 |
+
product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
|
173 |
|
174 |
while len(product_list)>0:
|
175 |
products.extend(product_list)
|
|
|
179 |
stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
|
180 |
if stop_flag:
|
181 |
break
|
182 |
+
cat_url = 'https://www.lazada.vn/catalog/?page=' + page_n + '&q=' + search_product
|
183 |
+
product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
|
|
|
184 |
|
185 |
return products
|
186 |
|
|
|
188 |
|
189 |
start_driver(force_restart=True)
|
190 |
|
|
|
|
|
191 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
192 |
|
193 |
+
prod_per_cat = get_lazada_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
|
|
|
194 |
prod_data.extend(prod_per_cat)
|
195 |
|
196 |
close_driver() # Close driver when we're done
|
scraper_shopee.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
from selenium import webdriver
|
2 |
-
from
|
3 |
-
# from selenium.webdriver.chrome.options import Options
|
4 |
from selenium.common.exceptions import NoSuchElementException
|
5 |
from selenium.webdriver.common.by import By
|
6 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
7 |
import time
|
8 |
|
9 |
# Global driver to use throughout the script
|
@@ -24,19 +24,13 @@ def start_driver(force_restart=False):
|
|
24 |
close_driver()
|
25 |
|
26 |
# Setting up the driver
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
gChromeOptions = webdriver.ChromeOptions()
|
35 |
-
gChromeOptions.add_argument("window-size=1920x1480")
|
36 |
-
gChromeOptions.add_argument("disable-dev-shm-usage")
|
37 |
-
DRIVER = webdriver.Chrome(
|
38 |
-
chrome_options=gChromeOptions, executable_path=ChromeDriverManager().install()
|
39 |
-
)
|
40 |
|
41 |
### Function to extract product info from the necessary html and json tags
|
42 |
def get_shopee_product_info_single(product_element, extra_info):
|
@@ -149,32 +143,19 @@ def get_shopee_product_info_from_page(page_url, extra_info=False):
|
|
149 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
150 |
time.sleep(3)
|
151 |
|
152 |
-
|
153 |
-
try:
|
154 |
-
# no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
|
155 |
-
no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
|
156 |
-
print("EMPTY PAGE")
|
157 |
-
return data
|
158 |
-
except NoSuchElementException:
|
159 |
-
no_product_found = False
|
160 |
-
|
161 |
-
|
162 |
# FIND ALL PRODUCT ITEMS
|
163 |
-
# products = DRIVER.find_elements(By.XPATH, "//a[@class='product-item']")
|
164 |
products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
|
165 |
print(f'Found {len(products)} products')
|
166 |
print(products)
|
167 |
|
168 |
-
if
|
169 |
for i in products:
|
170 |
product_dict = get_shopee_product_info_single(i, extra_info)
|
171 |
-
print(i)
|
172 |
-
print(product_dict)
|
173 |
data.append(product_dict)
|
174 |
return data
|
175 |
|
176 |
### Function to get product info from a main category
|
177 |
-
def get_shopee_product_info_from_category(
|
178 |
'''
|
179 |
Scrape for multiple pages of products of a category.
|
180 |
Uses get_product_info_from_page().
|
@@ -188,8 +169,8 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
188 |
'''
|
189 |
products = []
|
190 |
|
191 |
-
page_n =
|
192 |
-
cat_page_url =
|
193 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
194 |
|
195 |
while len(product_list)>0:
|
@@ -201,7 +182,7 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
201 |
if stop_flag:
|
202 |
break
|
203 |
|
204 |
-
cat_page_url =
|
205 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
206 |
|
207 |
return products
|
@@ -221,14 +202,11 @@ def scrap_shopee(search_product, num_max_page, extra_info):
|
|
221 |
# info = get_shopee_product_info_single(product, True)
|
222 |
# print(info)
|
223 |
|
224 |
-
start_driver(force_restart=True)
|
225 |
-
|
226 |
-
url = 'https://shopee.vn/search?keyword=' + search_product
|
227 |
|
228 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
229 |
|
230 |
-
|
231 |
-
prod_per_cat = get_shopee_product_info_from_category(url, num_max_page, extra_info=extra_info)
|
232 |
prod_data.extend(prod_per_cat)
|
233 |
|
234 |
close_driver() # Close driver when we're done
|
|
|
1 |
from selenium import webdriver
|
2 |
+
from selenium.webdriver.chrome.options import Options
|
|
|
3 |
from selenium.common.exceptions import NoSuchElementException
|
4 |
from selenium.webdriver.common.by import By
|
5 |
from selenium.webdriver.support import expected_conditions as EC
|
6 |
+
from selenium.webdriver.chrome.service import Service
|
7 |
import time
|
8 |
|
9 |
# Global driver to use throughout the script
|
|
|
24 |
close_driver()
|
25 |
|
26 |
# Setting up the driver
|
27 |
+
service = Service()
|
28 |
+
options = Options()
|
29 |
+
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
30 |
+
options.add_argument('-no-sandbox')
|
31 |
+
options.add_argument('-disable-dev-shm-usage')
|
32 |
|
33 |
+
DRIVER = webdriver.Chrome(service=service, options=options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
### Function to extract product info from the necessary html and json tags
|
36 |
def get_shopee_product_info_single(product_element, extra_info):
|
|
|
143 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
144 |
time.sleep(3)
|
145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
# FIND ALL PRODUCT ITEMS
|
|
|
147 |
products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
|
148 |
print(f'Found {len(products)} products')
|
149 |
print(products)
|
150 |
|
151 |
+
if len(products)>0:
|
152 |
for i in products:
|
153 |
product_dict = get_shopee_product_info_single(i, extra_info)
|
|
|
|
|
154 |
data.append(product_dict)
|
155 |
return data
|
156 |
|
157 |
### Function to get product info from a main category
|
158 |
+
def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
|
159 |
'''
|
160 |
Scrape for multiple pages of products of a category.
|
161 |
Uses get_product_info_from_page().
|
|
|
169 |
'''
|
170 |
products = []
|
171 |
|
172 |
+
page_n = 0
|
173 |
+
cat_page_url = 'https://shopee.vn/search?keyword=' + search_product
|
174 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
175 |
|
176 |
while len(product_list)>0:
|
|
|
182 |
if stop_flag:
|
183 |
break
|
184 |
|
185 |
+
cat_page_url = cat_page_url + f'&page={page_n}'
|
186 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
187 |
|
188 |
return products
|
|
|
202 |
# info = get_shopee_product_info_single(product, True)
|
203 |
# print(info)
|
204 |
|
205 |
+
start_driver(force_restart=True)
|
|
|
|
|
206 |
|
207 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
208 |
|
209 |
+
prod_per_cat = get_shopee_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
|
|
|
210 |
prod_data.extend(prod_per_cat)
|
211 |
|
212 |
close_driver() # Close driver when we're done
|
scraper_tiki.py
CHANGED
@@ -2,6 +2,7 @@ from selenium import webdriver
|
|
2 |
from selenium.webdriver.chrome.options import Options
|
3 |
from selenium.common.exceptions import NoSuchElementException
|
4 |
from selenium.webdriver.common.by import By
|
|
|
5 |
import time
|
6 |
|
7 |
# Global driver to use throughout the script
|
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
|
|
22 |
close_driver()
|
23 |
|
24 |
# Setting up the driver
|
|
|
25 |
options = Options()
|
26 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
27 |
options.add_argument('-no-sandbox')
|
28 |
options.add_argument('-disable-dev-shm-usage')
|
29 |
|
30 |
-
DRIVER = webdriver.Chrome(options=options)
|
31 |
|
32 |
### Function to extract product info from the necessary html and json tags
|
33 |
def get_tiki_product_info_single(product_element, extra_info):
|
@@ -250,7 +252,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
|
|
250 |
products = []
|
251 |
|
252 |
page_n = 1
|
253 |
-
cat_page_url = cat_url + f'
|
|
|
254 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
255 |
|
256 |
while len(product_list)>0:
|
@@ -262,7 +265,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
|
|
262 |
if stop_flag:
|
263 |
break
|
264 |
|
265 |
-
cat_page_url = cat_url + f'
|
|
|
266 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
267 |
|
268 |
return products
|
|
|
2 |
from selenium.webdriver.chrome.options import Options
|
3 |
from selenium.common.exceptions import NoSuchElementException
|
4 |
from selenium.webdriver.common.by import By
|
5 |
+
from selenium.webdriver.chrome.service import Service
|
6 |
import time
|
7 |
|
8 |
# Global driver to use throughout the script
|
|
|
23 |
close_driver()
|
24 |
|
25 |
# Setting up the driver
|
26 |
+
service = Service()
|
27 |
options = Options()
|
28 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
29 |
options.add_argument('-no-sandbox')
|
30 |
options.add_argument('-disable-dev-shm-usage')
|
31 |
|
32 |
+
DRIVER = webdriver.Chrome(service=service, options=options)
|
33 |
|
34 |
### Function to extract product info from the necessary html and json tags
|
35 |
def get_tiki_product_info_single(product_element, extra_info):
|
|
|
252 |
products = []
|
253 |
|
254 |
page_n = 1
|
255 |
+
cat_page_url = cat_url + f'&page={page_n}'
|
256 |
+
print(cat_page_url)
|
257 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
258 |
|
259 |
while len(product_list)>0:
|
|
|
265 |
if stop_flag:
|
266 |
break
|
267 |
|
268 |
+
cat_page_url = cat_url + f'&page={page_n}'
|
269 |
+
print(cat_page_url)
|
270 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
271 |
|
272 |
return products
|