Linh Vuu committed
Commit 39a482a • 1 Parent(s): a088ba6

added files

Files changed (6):
  1. README copy.md +12 -0
  2. app.py +85 -0
  3. requirements.txt +4 -0
  4. scraper_lazada.py +208 -0
  5. scraper_shopee.py +229 -0
  6. scraper_tiki.py +283 -0
README copy.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: PriceComparison
+ emoji: 👀
+ colorFrom: pink
+ colorTo: yellow
+ sdk: streamlit
+ sdk_version: 1.33.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,85 @@
+ from scraper_tiki import *
+ from scraper_lazada import *
+ from scraper_shopee import *
+ import pandas as pd
+ import streamlit as st
+
+ # # Test Tiki
+ # start_driver()
+ # DRIVER.get('https://tiki.vn/search?sort=price%2Casc&q=megaduo')
+ # time.sleep(3)
+ # products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
+ # product = products[2]
+ # info = get_tiki_product_info_single(product, True)
+ # print(info)
+
+ # # Test Lazada
+ # start_driver()
+ # DRIVER.get('https://www.lazada.vn/catalog/?page=1&q=megaduo&sort=priceasc')
+ # time.sleep(3)
+ # products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
+ # product = products[2]
+ # info = get_lazada_product_info_single(product, True)
+ # print(info)
+
+ def main():
+
+     st.subheader("Price Comparison (So Sánh Giá)")
+
+     with st.form(key="user_input_form"):
+         search_product = st.text_input("What would you like to buy? (Bạn muốn mua gì?)")
+         submit_button = st.form_submit_button(label="Search")
+
+     if submit_button:
+         print('Scraping', search_product)
+         # search_product = "megaduo"
+         # search_product = input("Search for what? ")
+         num_max_page = 1
+         extra_info = True
+         n_products_to_view = 5  # Change this as you like to check more products
+         col_to_display = ['name', 'price', 'product_url', 'image']
+
+         st.subheader("Shopee")
+         shopee_data = scrap_shopee(search_product, num_max_page, extra_info)
+         if shopee_data:
+             df_shopee = pd.DataFrame(data=shopee_data, columns=shopee_data[0].keys())
+             print(df_shopee.head())
+             st.write(df_shopee[col_to_display].sort_values(by='price').head(n_products_to_view))
+         else:
+             df_shopee = pd.DataFrame(columns=col_to_display)
+             st.write("Not found.")
+
+         st.subheader("Lazada")
+         lazada_data = scrap_lazada(search_product, num_max_page, extra_info)
+         if lazada_data:
+             df_lazada = pd.DataFrame(data=lazada_data, columns=lazada_data[0].keys())
+             print(df_lazada.head())
+             st.write(df_lazada[col_to_display].sort_values(by='price').head(n_products_to_view))
+         else:
+             df_lazada = pd.DataFrame(columns=col_to_display)
+             st.write("Not found.")
+
+         st.subheader("Tiki")
+         tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
+         if tiki_data:
+             df_tiki = pd.DataFrame(data=tiki_data, columns=tiki_data[0].keys())
+             print(df_tiki.head())
+             st.write(df_tiki[col_to_display].sort_values(by='price').head(n_products_to_view))
+         else:
+             df_tiki = pd.DataFrame(columns=col_to_display)
+             st.write("Not found.")
+
+         # Merge the three dataframes
+         merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
+
+         # Sort the merged dataframe by price
+         sorted_merged_df = merged_df.sort_values(by='price')
+
+         print(sorted_merged_df.head(n_products_to_view))
+         st.subheader("All sites, sorted by price ascending (Sắp xếp theo giá tăng dần)")
+         st.write(sorted_merged_df.head(n_products_to_view))
+
+ if __name__ == "__main__":
+     main()
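
For a quick smoke test outside Streamlit, each scraper can be driven on its own. A minimal sketch, assuming ChromeDriver is installed and the selectors in scraper_tiki.py still match tiki.vn's current markup ("megaduo" is just an example query):

```python
# Smoke test for one scraper, run from the repo root.
import pandas as pd
from scraper_tiki import scrap_tiki

products = scrap_tiki("megaduo", num_max_page=1, extra_info=False)
df = pd.DataFrame(products)
if not df.empty:
    print(df[['name', 'price', 'product_url']].sort_values(by='price').head())
else:
    print("Not found.")
```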
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ selenium
+ pandas
+ streamlit==1.33.0
+ altair==4.1.0
scraper_lazada.py ADDED
@@ -0,0 +1,208 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.common.exceptions import NoSuchElementException
+ from selenium.webdriver.common.by import By
+ import time
+
+ # Global driver to use throughout the script
+ DRIVER = None
+
+ # Wrapper to close the driver if it has been created
+ def close_driver():
+     global DRIVER
+     if DRIVER is not None:
+         DRIVER.close()
+         DRIVER = None
+
+ # Function to (re)start the driver
+ def start_driver(force_restart=False):
+     global DRIVER
+
+     if force_restart:
+         close_driver()
+
+     # Set up the driver: headless, so no Chrome window opens
+     options = Options()
+     options.add_argument('--headless')
+     options.add_argument('--no-sandbox')
+     options.add_argument('--disable-dev-shm-usage')
+
+     DRIVER = webdriver.Chrome(options=options)
+
+ ### Function to extract product info from the necessary html and json tags
+ def get_lazada_product_info_single(product_element, extra_info):
+     """
+     Extract info from a single product element from the driver.
+     Args:
+         product_element: (WebDriverElement) the product whose info needs to be
+             extracted.
+     Returns:
+         info: (dict) a dictionary of info of the product. Every product
+             should at least have four pieces of information: name, price,
+             link to the product page, and link to the product image.
+     """
+     info = {'source': 'lazada',
+             'name': '',
+             'price': -1,
+             'product_url': '',
+             'image': ''}
+
+     # print(product_element.get_attribute('outerHTML'))
+
+     # name: the <a> element within <div class="RfADt">
+     # (the leading '.' keeps the XPath relative to this product card)
+     try:
+         product_title_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")
+         info['name'] = product_title_element.text
+     except NoSuchElementException:
+         info['name'] = ""
+
+     # price: the <span class="ooOxS"> within <div class="aBrP0">
+     try:
+         price_element = product_element.find_element(By.XPATH, ".//div[@class='aBrP0']/span[@class='ooOxS']")
+         price_text = price_element.text
+         # Extract the numeric price value
+         info['price'] = int(price_text.split(" ")[0].replace('.', ''))
+     except (NoSuchElementException, ValueError):
+         pass
+
+     # link: the href of the <a> element within <div class="RfADt">
+     try:
+         product_link_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")
+         product_link = product_link_element.get_attribute("href")
+         # Strip the scheme from the href
+         info['product_url'] = product_link.split("//")[1]
+     except NoSuchElementException:
+         pass
+
+     # thumbnail: the <img> element within <div class="_95X4G">
+     try:
+         image_element = product_element.find_element(By.XPATH, ".//div[@class='_95X4G']/a/div/img")
+         info['image'] = image_element.get_attribute("src")
+     except NoSuchElementException:
+         pass
+
+     # If we decide to get extra information
+     if extra_info:
+         # sales: the first <span> within <div class="_6uN7R">
+         try:
+             sold_element = product_element.find_element(By.XPATH, ".//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
+             info['sales'] = sold_element.text
+         except (NoSuchElementException, ValueError):
+             info['sales'] = 0
+
+         # discount: the <span class="IcOsH"> within <div class="WNoq3">
+         try:
+             discount_element = product_element.find_element(By.XPATH, ".//div[@class='WNoq3']/span[@class='IcOsH']")
+             info['discount'] = discount_element.text
+         except (NoSuchElementException, ValueError):
+             info['discount'] = '0'
+
+     return info
+
+ ### Function to scrape all products from a page
+ def get_lazada_product_info_from_page(page_url, extra_info=False):
+     """
+     Extract info from all products of a specific page_url on the Lazada website.
+     Args:
+         page_url: (string) url of the page to scrape
+     Returns:
+         data: (list) a list of dictionaries of product info. If no products
+             are found, return an empty list.
+     """
+     global DRIVER
+
+     data = []
+     DRIVER.get(page_url)  # Use the driver to get info from the product page
+     time.sleep(3)
+
+     try:
+         no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
+         print("EMPTY PAGE")
+         return data
+     except NoSuchElementException:
+         no_product_found = False
+
+     # Find all product items
+     products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
+     print(f'Found {len(products)} products')
+
+     if (not no_product_found) and len(products) > 0:
+         for i in products:
+             product_dict = get_lazada_product_info_single(i, extra_info)
+             data.append(product_dict)
+     return data
+
+ ### Function to get product info from a main category
+ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False):
+     '''
+     Scrape multiple pages of products of a category.
+     Uses get_lazada_product_info_from_page().
+
+     Args:
+         cat_url: (string) a url string of a category
+         max_page: (int) the maximum number of pages to scrape.
+             Default value is 0 to scrape all pages.
+     Returns:
+         products: a list in which every element is a dictionary of one product's information
+     '''
+     products = []
+
+     page_n = 1
+     # cat_url already contains a query string, so append the page with '&'
+     cat_page_url = cat_url + f'&page={page_n}'
+     product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
+
+     while len(product_list) > 0:
+         products.extend(product_list)
+         page_n += 1
+
+         # Stop the scrape once max_page is exceeded (0 means no limit)
+         stop_flag = max_page > 0 and page_n > max_page
+         if stop_flag:
+             break
+
+         cat_page_url = cat_url + f'&page={page_n}'
+         product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
+
+     return products
+
+ def scrap_lazada(search_product, num_max_page, extra_info):
+
+     start_driver(force_restart=True)
+
+     url = 'https://www.lazada.vn/catalog/?q=' + search_product
+
+     prod_data = []  # Store the product info dictionaries in here
+
+     prod_per_cat = get_lazada_product_info_from_category(url, num_max_page, extra_info=extra_info)
+     prod_data.extend(prod_per_cat)
+
+     close_driver()  # Close the driver when we're done
+
+     return prod_data
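
The per-card lookups in get_lazada_product_info_single work only because each XPath starts with `.//`, which scopes the search to the card element; a bare `//` searches the whole page and returns the first match for every card. A small illustration of the difference, using hypothetical card elements and the RfADt title class from above:

```python
from selenium.webdriver.common.by import By

def card_title(card):
    # Relative XPath: searches inside this product card only.
    return card.find_element(By.XPATH, ".//div[@class='RfADt']/a").text

def buggy_card_title(card):
    # Document-global XPath: ignores `card`, so every card would
    # report the name of the first product on the page.
    return card.find_element(By.XPATH, "//div[@class='RfADt']/a").text
```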
scraper_shopee.py ADDED
@@ -0,0 +1,229 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.common.exceptions import NoSuchElementException
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ import time
+
+ # Global driver to use throughout the script
+ DRIVER = None
+
+ # Wrapper to close the driver if it has been created
+ def close_driver():
+     global DRIVER
+     if DRIVER is not None:
+         DRIVER.close()
+         DRIVER = None
+
+ # Function to (re)start the driver
+ def start_driver(force_restart=False):
+     global DRIVER
+
+     if force_restart:
+         close_driver()
+
+     # Set up the driver: headless, so no Chrome window opens
+     options = Options()
+     options.add_argument('--headless')
+     options.add_argument('--no-sandbox')
+     options.add_argument('--disable-dev-shm-usage')
+
+     DRIVER = webdriver.Chrome(options=options)
+
+ ### Function to extract product info from the necessary html and json tags
+ def get_shopee_product_info_single(product_element, extra_info):
+     """
+     Extract info from a single product element from the driver.
+     Args:
+         product_element: (WebDriverElement) the product whose info needs to be
+             extracted.
+     Returns:
+         info: (dict) a dictionary of info of the product. Every product
+             should at least have four pieces of information: name, price,
+             link to the product page, and link to the product image.
+     """
+     info = {'source': 'shopee',
+             'name': '',
+             'price': -1,
+             'product_url': '',
+             'image': ''}
+
+     # print(product_element.get_attribute('outerHTML'))
+
+     # name: the element with class "line-clamp-2"
+     try:
+         product_title_element = product_element.find_element(By.CLASS_NAME, "line-clamp-2")
+         info['name'] = product_title_element.text
+     except NoSuchElementException:
+         info['name'] = ""
+
+     # price: the <span> inside the price <div>
+     # (the leading '.' keeps the XPath relative to this product card)
+     try:
+         price_element = product_element.find_element(By.XPATH, './/div[@class="truncate flex items-baseline"]/span[@class="text-base/5 truncate"]')
+         price_text = price_element.text
+         # Extract the numeric price value
+         info['price'] = int(price_text.split(" ")[0].replace('.', ''))
+     except (NoSuchElementException, ValueError):
+         pass
+
+     # link: the href of the <a class="contents"> element
+     try:
+         product_link_element = product_element.find_element(By.XPATH, './/a[@class="contents"]')
+         info['product_url'] = product_link_element.get_attribute("href")
+     except NoSuchElementException:
+         pass
+
+     # thumbnail: the product <img> element
+     try:
+         image_element = product_element.find_element(By.XPATH, './/img[@class="inset-y-0 w-full h-full pointer-events-none object-contain absolute"]')
+         info['image'] = image_element.get_attribute("src")
+     except NoSuchElementException:
+         pass
+
+     # If we decide to get extra information
+     if extra_info:
+         # sales
+         try:
+             sold_element = product_element.find_element(By.XPATH, './/div[@class="truncate text-shopee-black87 text-xs min-h-4 flex-shrink-1"]')
+             info['sales'] = sold_element.text
+         except (NoSuchElementException, ValueError):
+             info['sales'] = 0
+
+         # discount
+         try:
+             discount_element = product_element.find_element(By.XPATH, './/div[@class="truncate bg-shopee-voucher-yellow text-white leading-4 text-sp10"]')
+             info['discount'] = discount_element.text
+         except (NoSuchElementException, ValueError):
+             info['discount'] = '0'
+
+     return info
+
+ ### Function to scrape all products from a page
+ def get_shopee_product_info_from_page(page_url, extra_info=False):
+     """
+     Extract info from all products of a specific page_url on the Shopee website.
+     Args:
+         page_url: (string) url of the page to scrape
+     Returns:
+         data: (list) a list of dictionaries of product info. If no products
+             are found, return an empty list.
+     """
+     global DRIVER
+
+     data = []
+     DRIVER.get(page_url)  # Use the driver to get info from the product page
+     time.sleep(3)
+
+     try:
+         no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
+         print("EMPTY PAGE")
+         return data
+     except NoSuchElementException:
+         no_product_found = False
+
+     # Find all product items. By.CLASS_NAME cannot take a compound class
+     # name, so use a CSS selector for the two-class product cards.
+     products = DRIVER.find_elements(By.CSS_SELECTOR, 'li.col-xs-2-4.shopee-search-item-result__item')
+     print(f'Found {len(products)} products')
+
+     if (not no_product_found) and len(products) > 0:
+         for i in products:
+             product_dict = get_shopee_product_info_single(i, extra_info)
+             data.append(product_dict)
+     return data
+
+ ### Function to get product info from a main category
+ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False):
+     '''
+     Scrape multiple pages of products of a category.
+     Uses get_shopee_product_info_from_page().
+
+     Args:
+         cat_url: (string) a url string of a category
+         max_page: (int) the maximum number of pages to scrape.
+             Default value is 0 to scrape all pages.
+     Returns:
+         products: a list in which every element is a dictionary of one product's information
+     '''
+     products = []
+
+     page_n = 1
+     # cat_url already contains a query string, so append the page with '&'
+     cat_page_url = cat_url + f'&page={page_n}'
+     product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
+
+     while len(product_list) > 0:
+         products.extend(product_list)
+         page_n += 1
+
+         # Stop the scrape once max_page is exceeded (0 means no limit)
+         stop_flag = max_page > 0 and page_n > max_page
+         if stop_flag:
+             break
+
+         cat_page_url = cat_url + f'&page={page_n}'
+         product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
+
+     return products
+
+ def scrap_shopee(search_product, num_max_page, extra_info):
+
+     # # Test Shopee
+     # start_driver()
+     # URL = 'https://shopee.vn/search?keyword=megaduo&page=0&sortBy=relevancy'
+     # DRIVER.get(URL)
+     # time.sleep(3)
+     # print(URL)
+     # products = DRIVER.find_elements(By.CLASS_NAME, 'shopee-search-item-result')
+     # product = products[0]
+     # info = get_shopee_product_info_single(product, True)
+     # print(info)
+
+     start_driver(force_restart=True)
+
+     url = 'https://shopee.vn/search?keyword=' + search_product
+
+     prod_data = []  # Store the product info dictionaries in here
+
+     prod_per_cat = get_shopee_product_info_from_category(url, num_max_page, extra_info=extra_info)
+     prod_data.extend(prod_per_cat)
+
+     close_driver()  # Close the driver when we're done
+
+     return prod_data
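
scraper_shopee.py imports WebDriverWait and expected_conditions but paces page loads with fixed time.sleep(3) calls. An explicit wait is the more robust alternative; a sketch, assuming the same product-card selector and a hypothetical 10-second timeout:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_products(driver, timeout=10):
    # Returns the product cards as soon as they appear, or raises
    # TimeoutException after `timeout` seconds, instead of always sleeping.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'li.col-xs-2-4.shopee-search-item-result__item')
        )
    )
```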
scraper_tiki.py ADDED
@@ -0,0 +1,283 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.common.exceptions import NoSuchElementException
+ from selenium.webdriver.common.by import By
+ import time
+
+ # Global driver to use throughout the script
+ DRIVER = None
+
+ # Wrapper to close the driver if it has been created
+ def close_driver():
+     global DRIVER
+     if DRIVER is not None:
+         DRIVER.close()
+         DRIVER = None
+
+ # Function to (re)start the driver
+ def start_driver(force_restart=False):
+     global DRIVER
+
+     if force_restart:
+         close_driver()
+
+     # Set up the driver: headless, so no Chrome window opens
+     options = Options()
+     options.add_argument('--headless')
+     options.add_argument('--no-sandbox')
+     options.add_argument('--disable-dev-shm-usage')
+
+     DRIVER = webdriver.Chrome(options=options)
+
+ ### Function to extract product info from the necessary html and json tags
+ def get_tiki_product_info_single(product_element, extra_info):
+     """
+     Extract info from a single product element from the driver.
+     Args:
+         product_element: (WebDriverElement) the product whose info needs to be
+             extracted.
+     Returns:
+         info: (dict) a dictionary of info of the product. Every product
+             should at least have four pieces of information: name, price,
+             link to the product page, and link to the product image.
+     """
+     info = {'source': 'tiki',
+             'name': '',
+             'price': -1,
+             'product_url': '',
+             'image': ''}
+
+     # print(product_element.get_attribute('outerHTML'))
+
+     # name
+     try:
+         name = product_element.find_element(By.CLASS_NAME, 'name').find_element(By.TAG_NAME, 'h3')
+         info['name'] = name.get_attribute('innerHTML').strip()
+     except NoSuchElementException:
+         try:
+             # Fall back to the styled <h3> element
+             name = product_element.find_element(By.CLASS_NAME, 'style__NameStyled-sc-139nb47-8')
+             info['name'] = name.text
+         except NoSuchElementException:
+             pass
+
+     # price, delivered as innerHTML such as '1.234.567<sup>₫</sup>'
+     try:
+         price = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML')
+         info['price'] = int(price.replace('<sup>₫</sup>', '').replace('.', ''))
+         # info['price'] = int(re.sub(r'[\.\s₫]', '', price))  # With regex
+     except (NoSuchElementException, ValueError):
+         pass
+
+     # link
+     try:
+         product_link = product_element.get_attribute('href')
+         info['product_url'] = product_link
+     except NoSuchElementException:
+         pass
+
+     # thumbnail
+     try:
+         # Find the <div> element with class "image-wrapper"
+         image_div = product_element.find_element(By.CLASS_NAME, 'image-wrapper')
+
+         # Find the <img> element within the <div> element
+         img_element = image_div.find_element(By.TAG_NAME, 'img')
+
+         # Get the value of the "srcset" attribute
+         srcset_value = img_element.get_attribute('srcset')
+
+         # Extract the link of the first image from the srcset value
+         info['image'] = srcset_value.split(',')[0].split(' ')[0]
+     except NoSuchElementException:
+         pass
+
+     # If we decide to get extra information
+     if extra_info:
+         # sales: the <span> element with class "quantity"
+         try:
+             quantity_span = product_element.find_element(By.CLASS_NAME, 'quantity')
+             info['sales'] = quantity_span.text
+         except (NoSuchElementException, ValueError):
+             info['sales'] = 0
+
+         # # rating
+         # try:
+         #     rating = product_element.find_element(By.CLASS_NAME, 'average').get_attribute('style')
+         #     info['rating'] = float(''.join([c for c in rating if c.isdigit()])) / 100 * 5  # Without regex
+         # except NoSuchElementException:
+         #     info['rating'] = 0
+
+         # discount
+         try:
+             # Try to get the discount using its class name
+             discount = product_element.find_element(By.CLASS_NAME, 'price-discount__discount').get_attribute('innerHTML')
+             info['discount'] = discount.replace('-', '')  # Remove any dashes
+         except (NoSuchElementException, ValueError):
+             try:
+                 # Fall back to the styled discount <div>
+                 discount_div = product_element.find_element(By.CLASS_NAME, 'style__DiscountPercentStyled-sc-e9h7mj-1')
+                 info['discount'] = discount_div.text.replace('-', '')  # Remove any dashes
+             except NoSuchElementException:
+                 # If both attempts fail, set the discount to 0
+                 info['discount'] = '0'
+
+         # # tiki now
+         # try:
+         #     info['tiki_now'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-service').find_element(By.CLASS_NAME, 'item'))
+         # except NoSuchElementException:
+         #     info['tiki_now'] = False
+
+         # # freeship, official seller, and/or trusted seller
+         # try:
+         #     info['freeship'] = False
+         #     info['official'] = False
+         #     info['trusted'] = False
+         #     thumbnail_tag = product_element.find_element(By.CLASS_NAME, 'thumbnail')
+         #     list_img = thumbnail_tag.find_elements(By.TAG_NAME, 'img')
+         #     for img in list_img:
+         #         if img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/dc/0d/49/3251737db2de83b74eba8a9ad6d03338.png':
+         #             info['freeship'] = True
+         #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/b9/1f/4b/557eac9c67a4466ccebfa74cde854215.png':
+         #             info['official'] = True
+         #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/e0/41/da/bb0fc684a838eff5e264ce0534a148f0.png':
+         #             info['trusted'] = True
+         # except NoSuchElementException:
+         #     pass
+
+         # # under price
+         # try:
+         #     info['under_price'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-under-price').find_element(By.CLASS_NAME, 'item'))
+         # except NoSuchElementException:
+         #     info['under_price'] = False
+
+         # # installment
+         # try:
+         #     info['installment'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-benefits').find_element(By.TAG_NAME, 'img'))
+         # except NoSuchElementException:
+         #     info['installment'] = False
+
+         # # gift
+         # try:
+         #     info['gift'] = bool(product_element.find_element(By.CLASS_NAME, 'freegift-list'))
+         # except NoSuchElementException:
+         #     info['gift'] = False
+
+     return info
+
+ ### Function to scrape all products from a page
+ def get_tiki_product_info_from_page(page_url, extra_info=False):
+     """
+     Extract info from all products of a specific page_url on the Tiki website.
+     Args:
+         page_url: (string) url of the page to scrape
+     Returns:
+         data: (list) a list of dictionaries of product info. If no products
+             are found, return an empty list.
+     """
+     global DRIVER
+
+     data = []
+     DRIVER.get(page_url)  # Use the driver to get info from the product page
+     time.sleep(3)
+
+     try:
+         no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
+         print("EMPTY PAGE")
+         return data
+     except NoSuchElementException:
+         no_product_found = False
+
+     # Find all product items
+     products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
+     print(f'Found {len(products)} products')
+
+     if (not no_product_found) and len(products) > 0:
+         for i in products:
+             product_dict = get_tiki_product_info_single(i, extra_info)
+             data.append(product_dict)
+     return data
+
+ ### Function to get product info from a main category
+ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
+     '''
+     Scrape multiple pages of products of a category.
+     Uses get_tiki_product_info_from_page().
+
+     Args:
+         cat_url: (string) a url string of a category
+         max_page: (int) the maximum number of pages to scrape.
+             Default value is 0 to scrape all pages.
+     Returns:
+         products: a list in which every element is a dictionary of one product's information
+     '''
+     products = []
+
+     page_n = 1
+     # cat_url already contains a query string, so append the page with '&'
+     cat_page_url = cat_url + f'&page={page_n}'
+     product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
+
+     while len(product_list) > 0:
+         products.extend(product_list)
+         page_n += 1
+
+         # Stop the scrape once max_page is exceeded (0 means no limit)
+         stop_flag = max_page > 0 and page_n > max_page
+         if stop_flag:
+             break
+
+         cat_page_url = cat_url + f'&page={page_n}'
+         product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
+
+     return products
+
+ def scrap_tiki(search_product, num_max_page, extra_info):
+
+     start_driver(force_restart=True)
+
+     # Quote the query for an exact-phrase search
+     url = 'https://tiki.vn/search?sort=default&q="' + search_product + '"'
+
+     prod_data = []  # Store the product info dictionaries in here
+
+     prod_per_cat = get_tiki_product_info_from_category(url, num_max_page, extra_info=extra_info)
+     prod_data.extend(prod_per_cat)
+     close_driver()  # Close the driver when we're done
+
+     return prod_data
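
Tiki's price cell arrives as innerHTML such as '1.234.567<sup>₫</sup>', so the parser above strips the superscript dong sign and the '.' thousands separators before converting to an integer. A standalone version of that parsing step, with an illustrative sample string:

```python
def parse_tiki_price(raw: str) -> int:
    # Strip the '<sup>₫</sup>' currency markup and '.' thousands separators.
    return int(raw.replace('<sup>₫</sup>', '').replace('.', ''))

# Example with a sample of the 'price-discount__price' innerHTML format:
assert parse_tiki_price('1.234.567<sup>₫</sup>') == 1234567
```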