yrarjun commited on
Commit
4eda287
·
1 Parent(s): b3e2afc
main.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from scrapers.scraper_daraz import Daraz

app = FastAPI()


# CORS settings.
# NOTE(review): allow_credentials=True combined with a wildcard origin is
# rejected by browsers per the CORS spec — confirm whether credentialed
# requests are actually required here.
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE"],
    allow_headers=["*"],
)
18
+
19
@app.get("/")
async def read_root():
    """Landing endpoint confirming the API is up."""
    return {"message": "We gonna scrap a lot of datas"}
22
+
23
@app.get("/products/{url:path}/")
async def all_products(url: str):
    """Scrape every product listed on a Daraz category page.

    Args:
        url: Daraz category URL, possibly percent-encoded.

    Returns:
        List of product summary dicts from Daraz.scrape_products().
    """
    from urllib.parse import unquote

    # FIX: unquote() decodes every percent-escape (including lowercase
    # "%2f"); the old code only replaced the literal "%2F".
    decoded_url = unquote(url)
    daraz = Daraz(decoded_url)
    products = await daraz.scrape_products()
    return products
29
+
30
@app.get("/product-details/{url:path}/")
async def get_product_details(url: str):
    """Scrape the detail page of a single Daraz product.

    Args:
        url: Daraz product URL, possibly percent-encoded.

    Returns:
        Detail dict from Daraz.product_details().
    """
    from urllib.parse import unquote

    # FIX: unquote() decodes every percent-escape (including lowercase
    # "%2f"); the old code only replaced the literal "%2F".
    decoded_url = unquote(url)
    daraz = Daraz(decoded_url)
    product_details = await daraz.product_details(decoded_url)
    return product_details
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.6.0
2
+ anyio==4.3.0
3
+ beautifulsoup4==4.11.2
4
+ certifi==2024.2.2
5
+ charset-normalizer==3.3.2
6
+ click==8.1.7
7
+ colorama==0.4.6
8
+ exceptiongroup==1.2.0
9
+ fastapi==0.110.0
10
+ greenlet==2.0.1
11
+ h11==0.14.0
12
+ idna==3.6
13
+ lxml==4.9.2
14
+ playwright==1.32.0
15
+ pydantic==2.6.4
16
+ pydantic_core==2.16.3
17
+ pyee==9.0.4
18
+ PyYAML==6.0
19
+ requests==2.31.0
20
+ sniffio==1.3.1
21
+ soupsieve==2.5
22
+ starlette==0.36.3
23
+ typing_extensions==4.10.0
24
+ urllib3==2.2.1
25
+ uvicorn==0.29.0
scrapers/__pycache__/daraz_scraper.cpython-310.pyc ADDED
Binary file (8.42 kB). View file
 
scrapers/__pycache__/scraper_daraz.cpython-310.pyc ADDED
Binary file (4.84 kB). View file
 
scrapers/daraz_scraper.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools.functionalities import userAgents, TryExcept, yamlMe, check_domain, random_interval, verifyDarazURL
2
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
3
+ from bs4 import BeautifulSoup
4
+ import pymongo as mong
5
+ import pandas as pd
6
+ import requests
7
+ import re
8
+ from tools.functionalities import *;
9
+ from time import sleep
10
+
11
class Daraz:
    """Scraper for Daraz category and product pages (Playwright + BeautifulSoup)."""

    def __init__(self, base_url):
        # URL of the category (or product) this instance works on.
        self.base_url = base_url
        # Random desktop user agent for the plain-requests fetches.
        self.headers = {"User-Agent": userAgents()}
        # Helper turning missing-element errors into "N/A" values.
        self.catchClause = TryExcept()
        # CSS selectors loaded from scrapers/selectors.yaml.
        self.yaml_me = yamlMe('selectors')
17
+
18
+ async def category_name(self):
19
+ req = requests.get(self.base_url, headers=self.headers)
20
+ soup = BeautifulSoup(req.content, 'lxml')
21
+ category = [cate.text.strip() for cate in soup.find('ul', class_='breadcrumb').find_all('li', class_='breadcrumb_item')][-1]
22
+ name = [nam.strip() for nam in re.split(r'[,/]', category)]
23
+ return ' '.join(name)
24
+
25
+ async def product_details(self, product_url):
26
+
27
+ async with async_playwright() as p:
28
+ browser = await p.firefox.launch(headless = True)
29
+ context = await browser.new_context(user_agent = userAgents())
30
+ page = await context.new_page()
31
+ await page.goto(product_url)
32
+
33
+ # wait for few seconds
34
+ await page.wait_for_timeout(timeout=random_interval(10)*1000)
35
+ # sleep()
36
+
37
+
38
+ datas = {
39
+ "Name": await self.catchClause.text(page.query_selector(self.yaml_me['product_name'])),
40
+ "Discount price": await self.catchClause.text(page.query_selector(self.yaml_me['product_dc_price'])),
41
+ "Original price": await self.catchClause.text(page.query_selector(self.yaml_me['product_og_price'])),
42
+ "Sold by": await self.catchClause.text(page.query_selector(self.yaml_me['store'])),
43
+ "Store link": f"""https:{await self.catchClause.attributes(page.query_selector(self.yaml_me['store']), 'href')}""",
44
+ "Hyperlink": product_url,
45
+ "Image": await self.catchClause.attributes(page.query_selector(self.yaml_me['image_link']), 'src'),
46
+ }
47
+
48
+ # select product_details_description
49
+ product_details = await self.catchClause.text_all(await page.query_selector_all(self.yaml_me['product_details']))
50
+
51
+ if product_details:
52
+ datas['product_details'] = product_details
53
+
54
+ # product_specifications
55
+ elements = await page.query_selector_all(self.yaml_me['product_specifications'])
56
+
57
+ if elements:
58
+ for element in elements:
59
+ key, value = await self.catchClause.extract_key_value(element)
60
+ datas[key] = value
61
+ else:
62
+ print("No product specifications found.")
63
+
64
+ # await browser.close()
65
+ # await browser.close()
66
+ return datas
67
+
68
+ async def scrape_datas(self):
69
+ daraz_dicts = []
70
+
71
+ async with async_playwright() as p:
72
+ browser = await p.chromium.launch(headless=True)
73
+ context = await browser.new_context(user_agent = userAgents())
74
+
75
+ page = await context.new_page()
76
+
77
+ await page.goto(self.base_url)
78
+
79
+ # Determine the country from the URL.
80
+ country = await check_domain(self.base_url)
81
+
82
+ print(f"""Initiating the automation | Powered by Playwright.\n
83
+ Daraz {country}
84
+ """)
85
+
86
+ # Get the name of the category being scraped.
87
+ category = await self.category_name()
88
+
89
+ # Get the total number of pages in the category.
90
+ page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
91
+
92
+ self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
93
+
94
+ print(f"Category: {category} | Number of pages: {self.last_page_number}")
95
+
96
+ # Get the "next page" button.
97
+ next_page = await page.query_selector(self.yaml_me['next_page_button'])
98
+
99
+ # Loop through the page using the "next page button".
100
+ for count in range(1, self.last_page_number+1):
101
+ main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
102
+
103
+ # Print a message indicating the current page being scraped.
104
+ print(f"\nScraping page | {count}")
105
+
106
+ # Wait for a short time before scraping the next page.
107
+ await page.wait_for_timeout(timeout=random_interval(5)*1000)
108
+
109
+ # Loop through the products on the current page and extract their data.
110
+ for content in main_contents:
111
+ product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
112
+
113
+ try:
114
+ dc_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_discount_price'])).inner_text()).strip())
115
+ except Exception as e:
116
+ dc_price = "N/A"
117
+
118
+ # if oc_price is not oc is dc_price
119
+ try:
120
+ og_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_og_price'])).inner_text()).strip())
121
+ except Exception as e:
122
+ og_price = dc_price
123
+ dc_price = 'N/A'
124
+ try:
125
+ # find dc% by calcuation
126
+ # dc_rate = float(re.sub(r'[-%]', '', await (await content.query_selector(self.yaml_me['category_discount_rate'])).inner_text()).strip())
127
+ dc_rate = ((og_price - dc_price) / og_price) * 100
128
+ dc_rate = round(dc_rate,2)
129
+ except Exception as e:
130
+ dc_rate = "N/A"
131
+
132
+ await page.wait_for_timeout(timeout=0.03*1000)
133
+
134
+ datas = {
135
+ "Name": product_name,
136
+ "Original price": og_price,
137
+ "Discount price": dc_price,
138
+ "Discount rate": dc_rate,
139
+ "Hyperlink": f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['brand_product_link']), 'href')}""",
140
+ "Image": await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_image']), 'src') ,
141
+ }
142
+
143
+ daraz_dicts.append(datas)
144
+
145
+ # Click the "next page" button to go to the next page.
146
+ try:
147
+ await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
148
+ await next_page.click()
149
+ except PlaywrightTimeoutError:
150
+ # If the "next page" button cannot be found, there are no more pages to scrape.
151
+ # Print a message indicating the error and break out of the loop.
152
+ print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
153
+ break
154
+ # Close the browser.
155
+ await browser.close()
156
+ return daraz_dicts
157
+
158
+ async def scrape_details(self):
159
+ daraz_dicts = []
160
+
161
+ async with async_playwright() as p:
162
+ browser = await p.chromium.launch(headless=True)
163
+ context = await browser.new_context(user_agent = userAgents())
164
+ page = await context.new_page()
165
+ await page.goto(self.base_url)
166
+
167
+ # Determine the country from the URL.
168
+ country = await check_domain(self.base_url)
169
+ print(f"""Initiating the automation | Powered by Playwright.\n
170
+ Daraz {country}
171
+ """)
172
+
173
+ # Get the name of the category being scraped.
174
+ category = await self.category_name()
175
+
176
+ # Get the total number of pages in the category.
177
+ page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
178
+
179
+ self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
180
+
181
+ print(f"Category: {category} | Number of pages: {self.last_page_number}")
182
+
183
+ # Get the "next page" button.
184
+ next_page = await page.query_selector(self.yaml_me['next_page_button'])
185
+
186
+ # Loop through the page using the "next page button".
187
+ for count in range(1, self.last_page_number+1):
188
+ main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
189
+
190
+ # Print a message indicating the current page being scraped.
191
+ print(f"\nScraping page | {count}")
192
+
193
+ # Wait for a short time before scraping the next page.
194
+ await page.wait_for_timeout(timeout=random_interval(5)*1000)
195
+
196
+ # Loop through the products on the current page and extract their data.
197
+ # for content in main_contents:
198
+
199
+
200
+ for content in main_contents[:5]:
201
+ product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
202
+ product_link = f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['brand_product_link']), 'href')}"""
203
+
204
+
205
+ # here waht i want to got through product details and scrae and only go to next product
206
+ datas = await self.product_details(product_url=product_link)
207
+ daraz_dicts.append(datas)
208
+
209
+ # Click the "next page" button to go to the next page.
210
+ try:
211
+ await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
212
+ await next_page.click()
213
+ except PlaywrightTimeoutError:
214
+ # If the "next page" button cannot be found, there are no more pages to scrape.
215
+ # Print a message indicating the error and break out of the loop.
216
+ print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
217
+ break
218
+ # Close the browser.
219
+ await browser.close()
220
+ return daraz_dicts
221
+
222
+ async def export_to_mongo(self):
223
+ """
224
+ Asynchronously exports scraped data to a MongoDB database.
225
+
226
+ Steps:
227
+ 1. Obtains the collection name by calling the `category_name()` method asynchronously.
228
+ 2. Establishes a connection to the local MongoDB server on port 27017.
229
+ 3. Selects the 'daraz' database from the client.
230
+ 4. Fetches data by calling the `scrape_datas()` method asynchronously.
231
+ 5. Inserts the fetched data into the specified collection in the database using `insert_many()`.
232
+ 6. Closes the MongoDB client.
233
+
234
+ Returns:
235
+ pymongo.results.InsertManyResult: The result object containing information about the insertion operation.
236
+ """
237
+ collection_name = await self.category_name()
238
+ client = mong.MongoClient('mongodb://localhost:27017/')
239
+ db = client['daraz']
240
+ collection = db[collection_name]
241
+ print(f"Collecting {collection_name} to Mongo database.")
242
+ datas = await self.scrape_datas()
243
+ result = collection.insert_many(datas)
244
+ client.close()
245
+ return result
246
+
247
+ async def export_to_sheet2(self):
248
+ # Obtain the file name
249
+ file_name = await self.category_name()
250
+ print(f"Exporting {file_name} to Google Sheets.")
251
+
252
+ # Authenticate with Google Sheets
253
+ client = authenticate_with_google_sheets()
254
+
255
+ # Spreadsheet ID
256
+ spreadsheet_id = "1yjQNaTMieHH__AbnR7VgRcc5VcRcvv7BsyVHUCXB1N0"
257
+
258
+ # Create a new sheet with the obtained file name
259
+ create_new_sheet(client, spreadsheet_id, file_name)
260
+
261
+ # Fetch data using the scrape_datas() method
262
+ datas = await self.scrape_datas()
263
+
264
+ # Convert data to DataFrame
265
+ df = pd.DataFrame(datas)
266
+
267
+ # Remove duplicates from DataFrame
268
+ df = df.drop_duplicates()
269
+
270
+ # Open the sheet
271
+ sheet = client.open_by_key(spreadsheet_id).worksheet(file_name)
272
+
273
+ # Write header row and make it bold
274
+ header_row = df.columns.tolist()
275
+ sheet.insert_row(header_row, 1)
276
+ sheet.format('A1:Z1', {'textFormat': {'bold': True}})
277
+ sheet.freeze(rows=1) # Freeze the header row
278
+
279
+ # Insert data into the sheet
280
+ for i, data in enumerate(df.values, start=2): # Start from row 2 after the header row
281
+ sheet.insert_row(list(data), i)
282
+
283
+ print(f"Data exported to Google Sheets sheet '{file_name}'.")
284
+
285
+ # async def export_to_sheet(self):
286
+ # """
287
+ # Asynchronously exports scraped data to an Excel sheet.
288
+
289
+ # Steps:
290
+ # 1. Obtains the file name by calling the `category_name()` method asynchronously.
291
+ # 2. Creates a 'Daraz database' directory if it doesn't exist.
292
+ # 3. Fetches data by calling the `scrape_datas()` method asynchronously.
293
+ # 4. Converts the data into a Pandas DataFrame.
294
+ # 5. Writes the DataFrame to an Excel file located at 'Daraz database/{file_name}.xlsx'.
295
+
296
+ # Note:
297
+ # The function assumes that the `scrape_datas()` method returns a list of dictionaries, each representing a row of data.
298
+
299
+ # Returns:
300
+ # None
301
+ # """
302
+ # file_name = await self.category_name()
303
+ # print(f"Exporting {file_name} to Excel database.")
304
+ # create_path('Daraz database')
305
+ # datas = await self.scrape_datas()
306
+ # df = pd.DataFrame(datas)
307
+ # df.to_excel(f"Daraz database//{file_name}.xlsx", index = False)
308
+
309
+ async def export_to_sheet(self):
310
+ file_name = await self.category_name()
311
+ print(f"Exporting {file_name} to Excel database.")
312
+ create_path('Daraz Details')
313
+ datas = await self.scrape_details()
314
+ df = pd.DataFrame(datas)
315
+ print(df)
316
+ df.to_excel(f"Daraz Details//{file_name[:9]} Details .xlsx", index = False)
scrapers/scrape_details_all.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
async def scrape_details(self):
    """Crawl every category page and scrape full product details, fetching the
    detail pages of one listing page concurrently via asyncio.gather.

    NOTE(review): this module has no imports of its own — asyncio,
    async_playwright, PlaywrightTimeoutError, userAgents and check_domain must
    already be in scope wherever this function is pasted; confirm before using
    it stand-alone.

    Returns:
        list[dict]: one product_details() dict per product.
    """
    daraz_dicts = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=userAgents())
        page = await context.new_page()
        await page.goto(self.base_url)

        country = await check_domain(self.base_url)

        category = await self.category_name()
        page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
        self.last_page_number = int(await page_number_elements[-2].get_attribute('title'))
        print(f"Category: {category} | Number of pages: {self.last_page_number}")

        for count in range(1, self.last_page_number + 1):
            main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
            print(f"\nScraping page | {count}")

            # One product_details() coroutine per listing on this page.
            coroutines = []
            for content in main_contents:
                product_link = f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_link']), 'href')}"""
                print(f"Product Link: {product_link}")
                coroutines.append(self.product_details(product_link))

            # Execute all coroutines concurrently and wait for them to finish.
            product_details_results = await asyncio.gather(*coroutines)
            print(product_details_results)

            # FIX: extend (not append) so the result is a flat list of dicts
            # rather than one nested list per page — matching the original
            # "Extend daraz_dicts" comment's stated intent.
            daraz_dicts.extend(product_details_results)

            # FIX: re-query the "next page" button every iteration — the
            # handle grabbed once before the loop goes stale after navigation.
            try:
                await page.wait_for_selector(self.yaml_me['next_page_button'], timeout=10000)
                next_page = await page.query_selector(self.yaml_me['next_page_button'])
                await next_page.click()
            except PlaywrightTimeoutError:
                print(f"Content loading error at page number {count}. There are no results found beyond this page. Scraper is exiting......")
                break

        await browser.close()
    return daraz_dicts
scrapers/scraper_daraz.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools.functionalities import userAgents, TryExcept, yamlMe, check_domain, random_interval, create_path
2
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
3
+ from bs4 import BeautifulSoup
4
+ import requests
5
+ import re
6
+
7
+
8
class Daraz:
    """Daraz category/product scraper used by the FastAPI endpoints."""

    def __init__(self, base_url):
        # Category (or product) URL this scraper instance targets.
        self.base_url = base_url
        # Random desktop user agent for the plain-requests fetch.
        self.headers = {"User-Agent": userAgents()}
        # Wraps element access so missing elements become "N/A".
        self.catchClause = TryExcept()
        # CSS selectors from scrapers/selectors.yaml.
        self.yaml_me = yamlMe('selectors')
14
+
15
+ async def category_name(self):
16
+ req = requests.get(self.base_url, headers=self.headers)
17
+ soup = BeautifulSoup(req.content, 'lxml')
18
+ category = [cate.text.strip() for cate in soup.find('ul', class_='breadcrumb').find_all('li', class_='breadcrumb_item')][-1]
19
+ name = [nam.strip() for nam in re.split(r'[,/]', category)]
20
+ return ' '.join(name)
21
+
22
+ async def product_details(self, product_url):
23
+
24
+ async with async_playwright() as p:
25
+ browser = await p.chromium.launch(headless = True)
26
+ context = await browser.new_context(user_agent = userAgents())
27
+ page = await context.new_page()
28
+ await page.goto(product_url)
29
+
30
+ # wait for few seconds
31
+ # await page.wait_for_timeout(timeout=random_interval(10)*1000)
32
+ # sleep()
33
+
34
+ datas = {
35
+ "Name": await self.catchClause.text(page.query_selector(self.yaml_me['product_name'])),
36
+ "Discount price": await self.catchClause.text(page.query_selector(self.yaml_me['product_dc_price'])),
37
+ "Original price": await self.catchClause.text(page.query_selector(self.yaml_me['product_og_price'])),
38
+ "Sold by": await self.catchClause.text(page.query_selector(self.yaml_me['store'])),
39
+ "Store link": f"""https:{await self.catchClause.attributes(page.query_selector(self.yaml_me['store']), 'href')}""",
40
+ "Hyperlink": product_url,
41
+ "Image": await self.catchClause.attributes(page.query_selector(self.yaml_me['image_link']), 'src'),
42
+ }
43
+
44
+ # select product_details_description
45
+ product_details = await self.catchClause.text_all(await page.query_selector_all(self.yaml_me['product_details']))
46
+
47
+ if product_details:
48
+ datas['product_details'] = product_details
49
+
50
+ # product_specifications
51
+ elements = await page.query_selector_all(self.yaml_me['product_specifications'])
52
+
53
+ if elements:
54
+ for element in elements:
55
+ key, value = await self.catchClause.extract_key_value(element)
56
+ datas[key] = value
57
+ else:
58
+ print("No product specifications found.")
59
+
60
+ # await browser.new_page()
61
+ await browser.close()
62
+ return datas
63
+
64
+ async def scrape_products(self):
65
+ daraz_dicts = []
66
+
67
+ async with async_playwright() as p:
68
+ browser = await p.chromium.launch(headless=True)
69
+ context = await browser.new_context(user_agent = userAgents())
70
+
71
+ page = await context.new_page()
72
+
73
+ await page.goto(self.base_url)
74
+
75
+ # Determine the country from the URL.
76
+ country = await check_domain(self.base_url)
77
+
78
+ print(f"""Initiating the automation | Powered by Playwright.\n
79
+ Daraz {country}
80
+ """)
81
+
82
+ # Get the name of the category being scraped.
83
+ category = await self.category_name()
84
+
85
+ # Get the total number of pages in the category.
86
+ page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
87
+
88
+ self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
89
+
90
+ print(f"Category: {category} | Number of pages: {self.last_page_number}")
91
+
92
+ # Get the "next page" button.
93
+ next_page = await page.query_selector(self.yaml_me['next_page_button'])
94
+
95
+ # Loop through the page using the "next page button".
96
+ for count in range(1, self.last_page_number+1):
97
+ main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
98
+
99
+ # Print a message indicating the current page being scraped.
100
+ print(f"\nScraping page | {count}")
101
+
102
+ # Wait for a short time before scraping the next page.
103
+ await page.wait_for_timeout(timeout=random_interval(5)*1000)
104
+
105
+ # Loop through the products on the current page and extract their data.
106
+ for content in main_contents:
107
+ product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
108
+
109
+ print(f"Scrapping Product: {product_name}")
110
+
111
+ try:
112
+ dc_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_discount_price'])).inner_text()).strip())
113
+ except Exception as e:
114
+ dc_price = "N/A"
115
+
116
+ # if oc_price is not oc is dc_price
117
+ try:
118
+ og_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_og_price'])).inner_text()).strip())
119
+ except Exception as e:
120
+ og_price = dc_price
121
+ dc_price = 'N/A'
122
+ try:
123
+ # find dc% by calcuation
124
+ # dc_rate = float(re.sub(r'[-%]', '', await (await content.query_selector(self.yaml_me['category_discount_rate'])).inner_text()).strip())
125
+ dc_rate = ((og_price - dc_price) / og_price) * 100
126
+ dc_rate = round(dc_rate,2)
127
+ except Exception as e:
128
+ dc_rate = "N/A"
129
+
130
+ await page.wait_for_timeout(timeout=0.03*1000)
131
+
132
+ datas = {
133
+ "Name": product_name,
134
+ "Original price": og_price,
135
+ "Discount price": dc_price,
136
+ "Discount rate": dc_rate,
137
+ "Hyperlink": f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_link']), 'href')}""",
138
+ "Image": await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_image']), 'src') ,
139
+ }
140
+
141
+ daraz_dicts.append(datas)
142
+
143
+ # Click the "next page" button to go to the next page.
144
+ try:
145
+ await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
146
+ await next_page.click()
147
+ except PlaywrightTimeoutError:
148
+ # If the "next page" button cannot be found, there are no more pages to scrape.
149
+ # Print a message indicating the error and break out of the loop.
150
+ print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
151
+ break
152
+ # Close the browser.
153
+ await browser.close()
154
+ return daraz_dicts
scrapers/selectors.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CSS selectors for product category elements:
2
+ # category_main_contents: "div[data-qa-locator='product-item']"
3
+ category_main_contents: "div.gridItem--Yd0sa"
4
+ # category_main_contents: "div[data-qa-locator='product-item']:not(:has(*))"
5
+
6
+
7
+ category_product_names: "div.title-wrapper--IaQ0m"
8
+
9
+ # category_product_links: "a.product-card--vHfY9"
10
+ category_product_link: "a.product-card--vHfY9"
11
+
12
+ # category_product_image: "div.image-wrapper--ydch1 img"
13
+ category_product_image: "img#id-img"
14
+ category_discount_price: "div.current-price--Jklkc span.currency--GVKjl"
15
+ category_discount_rate: "span.discount--HADrg"
16
+ category_og_price: "div.original-price--lHYOH del.currency--GVKjl"
17
+
18
+ last_page_number: "li[tabindex='0']"
19
+ next_page_button: "li[title='Next Page']"
20
+
21
+
22
+ # CSS selectors for individual product link:
23
+ product_name: "span.pdp-mod-product-badge-title"
24
+ ratings: "div.review-info-rate span.score"
25
+ product_dc_price: "div.pdp-mod-product-price span.pdp-price.pdp-price_type_normal.pdp-price_color_orange.pdp-price_size_xl"
26
+ product_og_price: "div.pdp-mod-product-price div.origin-block span:first-child"
27
+ image_link: "img.gallery-preview-panel__image"
28
+ store: "div.seller-name__detail a:first-child"
29
+
30
+ product_details: "div > div.pdp-product-desc > div.html-content.pdp-product-highlights > ul > li"
31
+ product_specifications: "div > div.pdp-product-desc > div.pdp-mod-specification > div.pdp-general-features > ul > li"
32
+
33
+
34
+ # specifications
35
+ brand: "div.pdp-general-features ul li.key-li:nth-child(1) div.key-value"
36
+ sku: "div.pdp-general-features ul li.key-li:nth-child(2) div.key-value"
37
+ battery_capacity: "div.pdp-general-features ul li.key-li:nth-child(3) div.key-value"
38
+ ppi: "div.pdp-general-features ul li.key-li:nth-child(4) div.key-value"
39
+ charger_type: "div.pdp-general-features ul li.key-li:nth-child(5) div.key-value"
40
+ flash: "div.pdp-general-features ul li.key-li:nth-child(6) div.key-value"
41
+ screen_size_inches: "div.pdp-general-features ul li.key-li:nth-child(7) div.key-value"
42
+ bluetooth_support: "div.pdp-general-features ul li.key-li:nth-child(8) div.key-value"
43
+ build_type: "div.pdp-general-features ul li.key-li:nth-child(9) div.key-value"
44
+ wifi: "div.pdp-general-features ul li.key-li:nth-child(10) div.key-value"
45
+ sim_type: "div.pdp-general-features ul li.key-li:nth-child(11) div.key-value"
46
+ removable_battery: "div.pdp-general-features ul li.key-li:nth-child(12) div.key-value"
47
+ gps: "div.pdp-general-features ul li.key-li:nth-child(13) div.key-value"
48
+ camera_front_megapixels: "div.pdp-general-features ul li.key-li:nth-child(14) div.key-value"
49
+ fm_radio: "div.pdp-general-features ul li.key-li:nth-child(15) div.key-value"
50
+ nfc: "div.pdp-general-features ul li.key-li:nth-child(16) div.key-value"
51
+ fingerprint_sensor: "div.pdp-general-features ul li.key-li:nth-child(17) div.key-value"
52
+ expandable_storage: "div.pdp-general-features ul li.key-li:nth-child(18) div.key-value"
53
+ memory_card_slot_type: "div.pdp-general-features ul li.key-li:nth-child(19) div.key-value"
54
+ camera_back_megapixels: "div.pdp-general-features ul li.key-li:nth-child(20) div.key-value"
55
+ number_of_cameras: "div.pdp-general-features ul li.key-li:nth-child(21) div.key-value"
56
+ display_protection: "div.pdp-general-features ul li.key-li:nth-child(22) div.key-value"
57
+ model_year: "div.pdp-general-features ul li.key-li:nth-child(23) div.key-value"
58
+ video_resolution: "div.pdp-general-features ul li.key-li:nth-child(24) div.key-value"
59
+ network_connections: "div.pdp-general-features ul li.key-li:nth-child(25) div.key-value"
60
+ operating_system: "div.pdp-general-features ul li.key-li:nth-child(26) div.key-value"
61
+ notch_display: "div.pdp-general-features ul li.key-li:nth-child(27) div.key-value"
62
+ fast_charging: "div.pdp-general-features ul li.key-li:nth-child(28) div.key-value"
63
+ model: "div.pdp-general-features ul li.key-li:nth-child(29) div.key-value"
64
+ headphone_jack: "div.pdp-general-features ul li.key-li:nth-child(30) div.key-value"
65
+ wireless_charging: "div.pdp-general-features ul li.key-li:nth-child(31) div.key-value"
66
+ display_type: "div.pdp-general-features ul li.key-li:nth-child(32) div.key-value"
67
+ processor_type: "div.pdp-general-features ul li.key-li:nth-child(33) div.key-value"
68
+ refresh_rate: "div.pdp-general-features ul li.key-li:nth-child(34) div.key-value"
69
+ number_of_sim_slots: "div.pdp-general-features ul li.key-li:nth-child(35) div.key-value"
70
+ resolution: "div.pdp-general-features ul li.key-li:nth-child(36) div.key-value"
71
+
72
+
tools/__pycache__/functionalities.cpython-310.pyc ADDED
Binary file (5.04 kB). View file
 
tools/functionalities.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import random
3
+ import yaml
4
+ import os
5
+ import re
6
+ import os
7
+
8
class TryExcept:
    """Helpers that turn missing-element failures into "N/A" results.

    Methods expect Playwright element handles (or awaitables resolving to
    them); a None/absent element yields "N/A" instead of raising.
    """

    async def text(self, element):
        """Stripped inner text of *element*, or "N/A" when it is absent."""
        try:
            return (await (await element).inner_text()).strip()
        except AttributeError:
            return "N/A"

    async def attributes(self, element, attr):
        """Value of attribute *attr* on *element*, or "N/A" when absent."""
        try:
            return await (await element).get_attribute(attr)
        except AttributeError:
            return "N/A"

    async def text_all(self, elements):
        """Stripped inner text for every element; "N/A" entries for failures."""
        collected = []
        for item in elements:
            try:
                collected.append((await item.inner_text()).strip())
            except AttributeError:
                collected.append("N/A")
        return collected

    async def extract_key_value(self, element):
        """Return the (.key-title, .key-value) texts of a spec row.

        Falls back to ("N/A", "N/A") when either node is missing.
        """
        try:
            key_node = await element.query_selector('.key-title')
            value_node = await element.query_selector('.key-value')
            if key_node and value_node:
                key = (await key_node.inner_text()).strip()
                value = (await value_node.inner_text()).strip()
                return key, value
            return "N/A", "N/A"
        except AttributeError:
            return "N/A", "N/A"
52
+
53
+
54
def create_path(dir_name):
    """Create *dir_name* inside the current working directory if missing.

    Args:
        dir_name: Name of the directory to create.

    Returns:
        None
    """
    # makedirs(..., exist_ok=True) is race-free, unlike the old
    # os.path.exists() check followed by os.mkdir().
    os.makedirs(os.path.join(os.getcwd(), dir_name), exist_ok=True)
69
+
70
+
71
def verifyDarazURL(url):
    """Check if the URL belongs to a Daraz website.

    Args:
        url: The URL to check.

    Returns:
        bool: True if the URL is NOT a Daraz website, False otherwise.
    """
    # FIX: dots are escaped so strings like "wwwXdaraz" no longer match.
    daraz_pattern = re.search(r"^https://www\.daraz\.(com\.np|lk|pk|com\.bd)/+", url)
    return daraz_pattern is None
87
+
88
+
89
def random_interval(value):
    """Return a random pause length between 2 and value + 1 (inclusive).

    Args:
        value: Upper limit of the range; the result never exceeds value + 1.

    Returns:
        int: random integer in [2, value + 1].

    Raises:
        ValueError: when value < 1 (empty range), same as the original.
    """
    # Equivalent to the old version, which built a throwaway list just to
    # take its length: len(range(value + 1)) == value + 1.
    return random.randint(2, value + 1)
104
+
105
+
106
async def check_domain(url):
    """Check the domain of a URL and return the country it belongs to.

    Args:
        url: The URL to check.

    Returns:
        str | None: Country name (Nepal, Sri Lanka, Bangladesh, Pakistan or
        Myanmar), or None when no known TLD is found. (The old code set
        country = None on a failed match and then crashed with
        KeyError: None on the dict lookup.)
    """
    domain_lists = {
        'np': 'Nepal',
        'lk': 'Sri Lanka',
        'bd': 'Bangladesh',
        'pk': 'Pakistan',
        'mm': 'Myanmar',
    }
    # FIX: escape the dot so e.g. "xnp" no longer counts as a ".np" match.
    pattern = re.search(r"\.(np|bd|lk|pk|mm)", url)
    if pattern is None:
        return None
    return domain_lists[pattern.group(1)]
131
+
132
+
133
def flat(d_lists):
    """Flatten one level of nesting from *d_lists*.

    Args:
        d_lists: A list of lists.

    Returns:
        list: A one-level-flattened copy of the input.
    """
    return list(itertools.chain.from_iterable(d_lists))
145
+
146
+
147
def yamlMe(selectors):
    """Load scrapers/<selectors>.yaml and return its contents as a dict.

    Args:
        selectors: Base name of the YAML file to load (without extension).

    Returns:
        dict: CSS selectors keyed by their data-field names.
    """
    # FIX: os.path.join instead of a hard-coded "\\" separator, so the
    # loader also works on non-Windows hosts.
    with open(os.path.join("scrapers", f"{selectors}.yaml")) as file:
        return yaml.load(file, Loader=yaml.SafeLoader)
161
+
162
+
163
def userAgents():
    """Return one randomly chosen user-agent string.

    Reads tools/user-agents.txt (one agent per line) relative to the current
    working directory.

    Returns:
        str: A randomly chosen user agent string.
    """
    # FIX: portable path join instead of hard-coded Windows "\\" separators.
    path = os.path.join(os.getcwd(), "tools", "user-agents.txt")
    with open(path) as f:
        agents = f.read().split("\n")
    return random.choice(agents)
173
+
tools/user-agents.txt ADDED
The diff for this file is too large to render. See raw diff