Linh Vuu committed on
Commit
6d0cb99
1 Parent(s): 3f1beca

updated files

Browse files
__pycache__/scraper_lazada.cpython-39.pyc ADDED
Binary file (4.45 kB). View file
 
__pycache__/scraper_shopee.cpython-39.pyc ADDED
Binary file (4.85 kB). View file
 
__pycache__/scraper_tiki.cpython-39.pyc ADDED
Binary file (4.62 kB). View file
 
app.py CHANGED
@@ -60,7 +60,6 @@ def main():
60
  df_lazada = pd.DataFrame(columns = col_to_display)
61
  st.write("Not found.")
62
 
63
-
64
  st.subheader("Tiki")
65
  tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
66
  if tiki_data:
@@ -72,7 +71,8 @@ def main():
72
  st.write("Not found.")
73
 
74
  # Merge the two dataframes
75
- merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
 
76
 
77
  # Sort the merged dataframe by price
78
  sorted_merged_df = merged_df.sort_values(by='price')
 
60
  df_lazada = pd.DataFrame(columns = col_to_display)
61
  st.write("Not found.")
62
 
 
63
  st.subheader("Tiki")
64
  tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
65
  if tiki_data:
 
71
  st.write("Not found.")
72
 
73
  # Merge the two dataframes
74
+ # merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
75
+ merged_df = pd.concat([df_lazada])
76
 
77
  # Sort the merged dataframe by price
78
  sorted_merged_df = merged_df.sort_values(by='price')
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
- selenium==4.3.0
2
- pandas==1.2
3
  streamlit==1.13.0
4
- altair==4.2.0
5
- webdriver-manager==3.7.1
 
1
+ selenium
2
+ pandas
3
  streamlit==1.13.0
4
+ altair==4.2.0
 
scraper_lazada.py CHANGED
@@ -2,6 +2,7 @@ from selenium import webdriver
2
  from selenium.webdriver.chrome.options import Options
3
  from selenium.common.exceptions import NoSuchElementException
4
  from selenium.webdriver.common.by import By
 
5
  import time
6
 
7
  # Global driver to use throughout the script
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
22
  close_driver()
23
 
24
  # Setting up the driver
 
25
  options = Options()
26
  options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
27
  options.add_argument('-no-sandbox')
28
  options.add_argument('-disable-dev-shm-usage')
29
 
30
- DRIVER = webdriver.Chrome(options=options)
31
 
32
  ### Function to extract product info from the necessary html and json tags
33
  def get_lazada_product_info_single(product_element, extra_info):
@@ -51,7 +53,7 @@ def get_lazada_product_info_single(product_element, extra_info):
51
  try:
52
  # Find the <a> element within the <div class="RfADt">
53
  product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
54
-
55
  # Get the text content of the <a> element
56
  info['name'] = product_title_element.text
57
 
@@ -63,7 +65,7 @@ def get_lazada_product_info_single(product_element, extra_info):
63
  try:
64
  # Find the <span> element with class "ooOxS" within the <div class="aBrP0">
65
  price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
66
-
67
  # Get the text content of the <span> element
68
  price_text = price_element.text
69
 
@@ -77,10 +79,10 @@ def get_lazada_product_info_single(product_element, extra_info):
77
  try:
78
  # Find the <a> element within the <div class="RfADt">
79
  product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
80
-
81
  # Get the href attribute of the <a> element
82
  product_link = product_link_element.get_attribute("href")
83
-
84
  # Extract the URL from the href attribute
85
  info['product_url'] = product_link.split("//")[1]
86
 
@@ -91,7 +93,7 @@ def get_lazada_product_info_single(product_element, extra_info):
91
  try:
92
  # Find the <img> element within the <div class="_95X4G">
93
  image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
94
-
95
  # Get the src attribute of the <img> element
96
  info['image'] = image_element.get_attribute("src")
97
 
@@ -104,7 +106,7 @@ def get_lazada_product_info_single(product_element, extra_info):
104
  try:
105
  # Find the <span> element within the <div class="_6uN7R">
106
  sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
107
-
108
  # Get the text content of the <span> element
109
  info['sales'] = sold_element.text
110
 
@@ -114,7 +116,7 @@ def get_lazada_product_info_single(product_element, extra_info):
114
  try:
115
  # Find the <span> element within the <div class="WNoq3">
116
  discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
117
-
118
  # Get the text content of the <span> element
119
  info['discount'] = discount_element.text
120
 
@@ -134,32 +136,24 @@ def get_lazada_product_info_from_page(page_url, extra_info=False):
134
  found, return empty list.
135
  """
136
  global DRIVER
137
-
138
  data = []
139
  DRIVER.get(page_url) # Use the driver to get info from the product page
140
  time.sleep(3)
141
 
142
- try:
143
- # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
144
- no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
145
- print("EMPTY PAGE")
146
- return data
147
- except NoSuchElementException:
148
- no_product_found = False
149
-
150
-
151
  # FIND ALL PRODUCT ITEMS
152
  products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
 
153
  print(f'Found {len(products)} products')
154
 
155
- if (not no_product_found) and len(products)>0:
156
  for i in products:
157
  product_dict = get_lazada_product_info_single(i, extra_info)
158
  data.append(product_dict)
159
  return data
160
 
161
  ### Function to get product info from a main category
162
- def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False):
163
  '''
164
  Scrape for multiple pages of products of a category.
165
  Uses get_product_info_from_page().
@@ -172,10 +166,10 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
172
  products: a list in which every element is a dictionary of one product's information
173
  '''
174
  products = []
175
-
176
  page_n = 1
177
- cat_page_url = cat_url + f'?page={page_n}'
178
- product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
 
179
 
180
  while len(product_list)>0:
181
  products.extend(product_list)
@@ -185,9 +179,8 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
185
  stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
186
  if stop_flag:
187
  break
188
-
189
- cat_page_url = cat_url + f'?page={page_n}'
190
- product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
191
 
192
  return products
193
 
@@ -195,12 +188,9 @@ def scrap_lazada(search_product, num_max_page, extra_info):
195
 
196
  start_driver(force_restart=True)
197
 
198
- url = 'https://www.lazada.vn/catalog/?q=' + search_product
199
-
200
  prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
201
 
202
- # prod_per_cat = get_product_info_from_category(main_cat['URL'], num_max_page, extra_info=extra_info)
203
- prod_per_cat = get_lazada_product_info_from_category(url, num_max_page, extra_info=extra_info)
204
  prod_data.extend(prod_per_cat)
205
 
206
  close_driver() # Close driver when we're done
 
2
  from selenium.webdriver.chrome.options import Options
3
  from selenium.common.exceptions import NoSuchElementException
4
  from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.chrome.service import Service
6
  import time
7
 
8
  # Global driver to use throughout the script
 
23
  close_driver()
24
 
25
  # Setting up the driver
26
+ service = Service()
27
  options = Options()
28
  options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
29
  options.add_argument('-no-sandbox')
30
  options.add_argument('-disable-dev-shm-usage')
31
 
32
+ DRIVER = webdriver.Chrome(service=service, options=options)
33
 
34
  ### Function to extract product info from the necessary html and json tags
35
  def get_lazada_product_info_single(product_element, extra_info):
 
53
  try:
54
  # Find the <a> element within the <div class="RfADt">
55
  product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
56
+
57
  # Get the text content of the <a> element
58
  info['name'] = product_title_element.text
59
 
 
65
  try:
66
  # Find the <span> element with class "ooOxS" within the <div class="aBrP0">
67
  price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
68
+
69
  # Get the text content of the <span> element
70
  price_text = price_element.text
71
 
 
79
  try:
80
  # Find the <a> element within the <div class="RfADt">
81
  product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
82
+
83
  # Get the href attribute of the <a> element
84
  product_link = product_link_element.get_attribute("href")
85
+
86
  # Extract the URL from the href attribute
87
  info['product_url'] = product_link.split("//")[1]
88
 
 
93
  try:
94
  # Find the <img> element within the <div class="_95X4G">
95
  image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
96
+
97
  # Get the src attribute of the <img> element
98
  info['image'] = image_element.get_attribute("src")
99
 
 
106
  try:
107
  # Find the <span> element within the <div class="_6uN7R">
108
  sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
109
+
110
  # Get the text content of the <span> element
111
  info['sales'] = sold_element.text
112
 
 
116
  try:
117
  # Find the <span> element within the <div class="WNoq3">
118
  discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
119
+
120
  # Get the text content of the <span> element
121
  info['discount'] = discount_element.text
122
 
 
136
  found, return empty list.
137
  """
138
  global DRIVER
139
+
140
  data = []
141
  DRIVER.get(page_url) # Use the driver to get info from the product page
142
  time.sleep(3)
143
 
 
 
 
 
 
 
 
 
 
144
  # FIND ALL PRODUCT ITEMS
145
  products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
146
+ print(products)
147
  print(f'Found {len(products)} products')
148
 
149
+ if len(products)>0:
150
  for i in products:
151
  product_dict = get_lazada_product_info_single(i, extra_info)
152
  data.append(product_dict)
153
  return data
154
 
155
  ### Function to get product info from a main category
156
+ def get_lazada_product_info_from_category(search_product, max_page=0, extra_info=False):
157
  '''
158
  Scrape for multiple pages of products of a category.
159
  Uses get_product_info_from_page().
 
166
  products: a list in which every element is a dictionary of one product's information
167
  '''
168
  products = []
 
169
  page_n = 1
170
+ cat_url = 'https://www.lazada.vn/catalog/?q=' + search_product
171
+
172
+ product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
173
 
174
  while len(product_list)>0:
175
  products.extend(product_list)
 
179
  stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
180
  if stop_flag:
181
  break
182
+ cat_url = 'https://www.lazada.vn/catalog/?page=' + page_n + '&q=' + search_product
183
+ product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
 
184
 
185
  return products
186
 
 
188
 
189
  start_driver(force_restart=True)
190
 
 
 
191
  prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
192
 
193
+ prod_per_cat = get_lazada_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
 
194
  prod_data.extend(prod_per_cat)
195
 
196
  close_driver() # Close driver when we're done
scraper_shopee.py CHANGED
@@ -1,9 +1,9 @@
1
  from selenium import webdriver
2
- from webdriver_manager.chrome import ChromeDriverManager
3
- # from selenium.webdriver.chrome.options import Options
4
  from selenium.common.exceptions import NoSuchElementException
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.support import expected_conditions as EC
 
7
  import time
8
 
9
  # Global driver to use throughout the script
@@ -24,19 +24,13 @@ def start_driver(force_restart=False):
24
  close_driver()
25
 
26
  # Setting up the driver
27
- # options = Options()
28
- # options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
29
- # options.add_argument('-no-sandbox')
30
- # options.add_argument('-disable-dev-shm-usage')
 
31
 
32
- # DRIVER = webdriver.Chrome(options=options)
33
-
34
- gChromeOptions = webdriver.ChromeOptions()
35
- gChromeOptions.add_argument("window-size=1920x1480")
36
- gChromeOptions.add_argument("disable-dev-shm-usage")
37
- DRIVER = webdriver.Chrome(
38
- chrome_options=gChromeOptions, executable_path=ChromeDriverManager().install()
39
- )
40
 
41
  ### Function to extract product info from the necessary html and json tags
42
  def get_shopee_product_info_single(product_element, extra_info):
@@ -149,32 +143,19 @@ def get_shopee_product_info_from_page(page_url, extra_info=False):
149
  DRIVER.get(page_url) # Use the driver to get info from the product page
150
  time.sleep(3)
151
 
152
-
153
- try:
154
- # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
155
- no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
156
- print("EMPTY PAGE")
157
- return data
158
- except NoSuchElementException:
159
- no_product_found = False
160
-
161
-
162
  # FIND ALL PRODUCT ITEMS
163
- # products = DRIVER.find_elements(By.XPATH, "//a[@class='product-item']")
164
  products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
165
  print(f'Found {len(products)} products')
166
  print(products)
167
 
168
- if (not no_product_found) and len(products)>0:
169
  for i in products:
170
  product_dict = get_shopee_product_info_single(i, extra_info)
171
- print(i)
172
- print(product_dict)
173
  data.append(product_dict)
174
  return data
175
 
176
  ### Function to get product info from a main category
177
- def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False):
178
  '''
179
  Scrape for multiple pages of products of a category.
180
  Uses get_product_info_from_page().
@@ -188,8 +169,8 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
188
  '''
189
  products = []
190
 
191
- page_n = 1
192
- cat_page_url = cat_url + f'?page={page_n}'
193
  product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
194
 
195
  while len(product_list)>0:
@@ -201,7 +182,7 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
201
  if stop_flag:
202
  break
203
 
204
- cat_page_url = cat_url + f'?page={page_n}'
205
  product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
206
 
207
  return products
@@ -221,14 +202,11 @@ def scrap_shopee(search_product, num_max_page, extra_info):
221
  # info = get_shopee_product_info_single(product, True)
222
  # print(info)
223
 
224
- start_driver(force_restart=True)
225
-
226
- url = 'https://shopee.vn/search?keyword=' + search_product
227
 
228
  prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
229
 
230
- # prod_per_cat = get_product_info_from_category(main_cat['URL'], num_max_page, extra_info=extra_info)
231
- prod_per_cat = get_shopee_product_info_from_category(url, num_max_page, extra_info=extra_info)
232
  prod_data.extend(prod_per_cat)
233
 
234
  close_driver() # Close driver when we're done
 
1
  from selenium import webdriver
2
+ from selenium.webdriver.chrome.options import Options
 
3
  from selenium.common.exceptions import NoSuchElementException
4
  from selenium.webdriver.common.by import By
5
  from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.webdriver.chrome.service import Service
7
  import time
8
 
9
  # Global driver to use throughout the script
 
24
  close_driver()
25
 
26
  # Setting up the driver
27
+ service = Service()
28
+ options = Options()
29
+ options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
30
+ options.add_argument('-no-sandbox')
31
+ options.add_argument('-disable-dev-shm-usage')
32
 
33
+ DRIVER = webdriver.Chrome(service=service, options=options)
 
 
 
 
 
 
 
34
 
35
  ### Function to extract product info from the necessary html and json tags
36
  def get_shopee_product_info_single(product_element, extra_info):
 
143
  DRIVER.get(page_url) # Use the driver to get info from the product page
144
  time.sleep(3)
145
 
 
 
 
 
 
 
 
 
 
 
146
  # FIND ALL PRODUCT ITEMS
 
147
  products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
148
  print(f'Found {len(products)} products')
149
  print(products)
150
 
151
+ if len(products)>0:
152
  for i in products:
153
  product_dict = get_shopee_product_info_single(i, extra_info)
 
 
154
  data.append(product_dict)
155
  return data
156
 
157
  ### Function to get product info from a main category
158
+ def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
159
  '''
160
  Scrape for multiple pages of products of a category.
161
  Uses get_product_info_from_page().
 
169
  '''
170
  products = []
171
 
172
+ page_n = 0
173
+ cat_page_url = 'https://shopee.vn/search?keyword=' + search_product
174
  product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
175
 
176
  while len(product_list)>0:
 
182
  if stop_flag:
183
  break
184
 
185
+ cat_page_url = cat_page_url + f'&page={page_n}'
186
  product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
187
 
188
  return products
 
202
  # info = get_shopee_product_info_single(product, True)
203
  # print(info)
204
 
205
+ start_driver(force_restart=True)
 
 
206
 
207
  prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
208
 
209
+ prod_per_cat = get_shopee_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
 
210
  prod_data.extend(prod_per_cat)
211
 
212
  close_driver() # Close driver when we're done
scraper_tiki.py CHANGED
@@ -2,6 +2,7 @@ from selenium import webdriver
2
  from selenium.webdriver.chrome.options import Options
3
  from selenium.common.exceptions import NoSuchElementException
4
  from selenium.webdriver.common.by import By
 
5
  import time
6
 
7
  # Global driver to use throughout the script
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
22
  close_driver()
23
 
24
  # Setting up the driver
 
25
  options = Options()
26
  options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
27
  options.add_argument('-no-sandbox')
28
  options.add_argument('-disable-dev-shm-usage')
29
 
30
- DRIVER = webdriver.Chrome(options=options)
31
 
32
  ### Function to extract product info from the necessary html and json tags
33
  def get_tiki_product_info_single(product_element, extra_info):
@@ -250,7 +252,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
250
  products = []
251
 
252
  page_n = 1
253
- cat_page_url = cat_url + f'?page={page_n}'
 
254
  product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
255
 
256
  while len(product_list)>0:
@@ -262,7 +265,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
262
  if stop_flag:
263
  break
264
 
265
- cat_page_url = cat_url + f'?page={page_n}'
 
266
  product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
267
 
268
  return products
 
2
  from selenium.webdriver.chrome.options import Options
3
  from selenium.common.exceptions import NoSuchElementException
4
  from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.chrome.service import Service
6
  import time
7
 
8
  # Global driver to use throughout the script
 
23
  close_driver()
24
 
25
  # Setting up the driver
26
+ service = Service()
27
  options = Options()
28
  options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
29
  options.add_argument('-no-sandbox')
30
  options.add_argument('-disable-dev-shm-usage')
31
 
32
+ DRIVER = webdriver.Chrome(service=service, options=options)
33
 
34
  ### Function to extract product info from the necessary html and json tags
35
  def get_tiki_product_info_single(product_element, extra_info):
 
252
  products = []
253
 
254
  page_n = 1
255
+ cat_page_url = cat_url + f'&page={page_n}'
256
+ print(cat_page_url)
257
  product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
258
 
259
  while len(product_list)>0:
 
265
  if stop_flag:
266
  break
267
 
268
+ cat_page_url = cat_url + f'&page={page_n}'
269
+ print(cat_page_url)
270
  product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
271
 
272
  return products