Files changed:
- main.py +35 -0
- requirements.txt +25 -0
- scrapers/__pycache__/daraz_scraper.cpython-310.pyc +0 -0
- scrapers/__pycache__/scraper_daraz.cpython-310.pyc +0 -0
- scrapers/daraz_scraper.py +316 -0
- scrapers/scrape_details_all.py +53 -0
- scrapers/scraper_daraz.py +154 -0
- scrapers/selectors.yaml +72 -0
- tools/__pycache__/functionalities.cpython-310.pyc +0 -0
- tools/functionalities.py +173 -0
- tools/user-agents.txt +0 -0
main.py
ADDED
@@ -0,0 +1,35 @@
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from scrapers.scraper_daraz import Daraz
+
+app = FastAPI()
+
+
+# CORS settings
+origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["GET", "POST", "PUT", "DELETE"],
+    allow_headers=["*"],
+)
+
+@app.get("/")
+async def read_root():
+    return {"message": "We gonna scrap a lot of datas"}
+
+@app.get("/products/{url:path}/")
+async def all_products(url: str):
+    decoded_url = url.replace("%2F", "/")
+    daraz = Daraz(decoded_url)
+    products = await daraz.scrape_products()
+    return products
+
+@app.get("/product-details/{url:path}/")
+async def get_product_details(url: str):
+    decoded_url = url.replace("%2F", "/")
+    daraz = Daraz(decoded_url)
+    product_details = await daraz.product_details(decoded_url)
+    return product_details
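Note: a minimal local smoke test for these endpoints might look like the sketch below (not part of the commit). It assumes the app is served with `uvicorn main:app` on the default http://127.0.0.1:8000 and uses an example category URL; since the handlers only translate "%2F" back into "/", the client encodes just the slashes.

# Hypothetical smoke test; the base URL and category URL are assumptions.
import requests

BASE = "http://127.0.0.1:8000"
category_url = "https://www.daraz.com.np/smartphones/"   # example category page

encoded = category_url.replace("/", "%2F")   # the endpoint reverses exactly this
resp = requests.get(f"{BASE}/products/{encoded}/")
print(resp.status_code)
print(resp.json()[:2])   # first couple of scraped product dicts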
requirements.txt
ADDED
@@ -0,0 +1,25 @@
+annotated-types==0.6.0
+anyio==4.3.0
+beautifulsoup4==4.11.2
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+exceptiongroup==1.2.0
+fastapi==0.110.0
+greenlet==2.0.1
+h11==0.14.0
+idna==3.6
+lxml==4.9.2
+playwright==1.32.0
+pydantic==2.6.4
+pydantic_core==2.16.3
+pyee==9.0.4
+PyYAML==6.0
+requests==2.31.0
+sniffio==1.3.1
+soupsieve==2.5
+starlette==0.36.3
+typing_extensions==4.10.0
+urllib3==2.2.1
+uvicorn==0.29.0
scrapers/__pycache__/daraz_scraper.cpython-310.pyc
ADDED
Binary file (8.42 kB).

scrapers/__pycache__/scraper_daraz.cpython-310.pyc
ADDED
Binary file (4.84 kB).
scrapers/daraz_scraper.py
ADDED
@@ -0,0 +1,316 @@
+from tools.functionalities import userAgents, TryExcept, yamlMe, check_domain, random_interval, verifyDarazURL
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+from bs4 import BeautifulSoup
+import pymongo as mong
+import pandas as pd
+import requests
+import re
+from tools.functionalities import *
+from time import sleep
+
+class Daraz:
+    def __init__(self, base_url):
+        self.base_url = base_url
+        self.headers = {"User-Agent": userAgents()}
+        self.catchClause = TryExcept()
+        self.yaml_me = yamlMe('selectors')
+
+    async def category_name(self):
+        req = requests.get(self.base_url, headers=self.headers)
+        soup = BeautifulSoup(req.content, 'lxml')
+        category = [cate.text.strip() for cate in soup.find('ul', class_='breadcrumb').find_all('li', class_='breadcrumb_item')][-1]
+        name = [nam.strip() for nam in re.split(r'[,/]', category)]
+        return ' '.join(name)
+
+    async def product_details(self, product_url):
+
+        async with async_playwright() as p:
+            browser = await p.firefox.launch(headless = True)
+            context = await browser.new_context(user_agent = userAgents())
+            page = await context.new_page()
+            await page.goto(product_url)
+
+            # wait for a few seconds
+            await page.wait_for_timeout(timeout=random_interval(10)*1000)
+            # sleep()
+
+
+            datas = {
+                "Name": await self.catchClause.text(page.query_selector(self.yaml_me['product_name'])),
+                "Discount price": await self.catchClause.text(page.query_selector(self.yaml_me['product_dc_price'])),
+                "Original price": await self.catchClause.text(page.query_selector(self.yaml_me['product_og_price'])),
+                "Sold by": await self.catchClause.text(page.query_selector(self.yaml_me['store'])),
+                "Store link": f"""https:{await self.catchClause.attributes(page.query_selector(self.yaml_me['store']), 'href')}""",
+                "Hyperlink": product_url,
+                "Image": await self.catchClause.attributes(page.query_selector(self.yaml_me['image_link']), 'src'),
+            }
+
+            # select product_details_description
+            product_details = await self.catchClause.text_all(await page.query_selector_all(self.yaml_me['product_details']))
+
+            if product_details:
+                datas['product_details'] = product_details
+
+            # product_specifications
+            elements = await page.query_selector_all(self.yaml_me['product_specifications'])
+
+            if elements:
+                for element in elements:
+                    key, value = await self.catchClause.extract_key_value(element)
+                    datas[key] = value
+            else:
+                print("No product specifications found.")
+
+            # await browser.close()
+            # await browser.close()
+            return datas
+
+    async def scrape_datas(self):
+        daraz_dicts = []
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(user_agent = userAgents())
+
+            page = await context.new_page()
+
+            await page.goto(self.base_url)
+
+            # Determine the country from the URL.
+            country = await check_domain(self.base_url)
+
+            print(f"""Initiating the automation | Powered by Playwright.\n
+                  Daraz {country}
+                  """)
+
+            # Get the name of the category being scraped.
+            category = await self.category_name()
+
+            # Get the total number of pages in the category.
+            page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
+
+            self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
+
+            print(f"Category: {category} | Number of pages: {self.last_page_number}")
+
+            # Get the "next page" button.
+            next_page = await page.query_selector(self.yaml_me['next_page_button'])
+
+            # Loop through the pages using the "next page" button.
+            for count in range(1, self.last_page_number+1):
+                main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
+
+                # Print a message indicating the current page being scraped.
+                print(f"\nScraping page | {count}")
+
+                # Wait for a short time before scraping the next page.
+                await page.wait_for_timeout(timeout=random_interval(5)*1000)
+
+                # Loop through the products on the current page and extract their data.
+                for content in main_contents:
+                    product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
+
+                    try:
+                        dc_price = float(re.sub(r'[Rs.,]', '', await (await content.query_selector(self.yaml_me['category_discount_price'])).inner_text()).strip())
+                    except Exception as e:
+                        dc_price = "N/A"
+
+                    # If there is no separate original price, treat the discount price as the original price.
+                    try:
+                        og_price = float(re.sub(r'[Rs.,]', '', await (await content.query_selector(self.yaml_me['category_og_price'])).inner_text()).strip())
+                    except Exception as e:
+                        og_price = dc_price
+                        dc_price = 'N/A'
+                    try:
+                        # Compute the discount percentage from the two prices.
+                        # dc_rate = float(re.sub(r'[-%]', '', await (await content.query_selector(self.yaml_me['category_discount_rate'])).inner_text()).strip())
+                        dc_rate = ((og_price - dc_price) / og_price) * 100
+                        dc_rate = round(dc_rate, 2)
+                    except Exception as e:
+                        dc_rate = "N/A"
+
+                    await page.wait_for_timeout(timeout=0.03*1000)
+
+                    datas = {
+                        "Name": product_name,
+                        "Original price": og_price,
+                        "Discount price": dc_price,
+                        "Discount rate": dc_rate,
+                        "Hyperlink": f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['brand_product_link']), 'href')}""",
+                        "Image": await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_image']), 'src'),
+                    }
+
+                    daraz_dicts.append(datas)
+
+                # Click the "next page" button to go to the next page.
+                try:
+                    await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
+                    await next_page.click()
+                except PlaywrightTimeoutError:
+                    # If the "next page" button cannot be found, there are no more pages to scrape.
+                    # Print a message indicating the error and break out of the loop.
+                    print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
+                    break
+            # Close the browser.
+            await browser.close()
+        return daraz_dicts
+
+    async def scrape_details(self):
+        daraz_dicts = []
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(user_agent = userAgents())
+            page = await context.new_page()
+            await page.goto(self.base_url)
+
+            # Determine the country from the URL.
+            country = await check_domain(self.base_url)
+            print(f"""Initiating the automation | Powered by Playwright.\n
+                  Daraz {country}
+                  """)
+
+            # Get the name of the category being scraped.
+            category = await self.category_name()
+
+            # Get the total number of pages in the category.
+            page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
+
+            self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
+
+            print(f"Category: {category} | Number of pages: {self.last_page_number}")
+
+            # Get the "next page" button.
+            next_page = await page.query_selector(self.yaml_me['next_page_button'])
+
+            # Loop through the pages using the "next page" button.
+            for count in range(1, self.last_page_number+1):
+                main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
+
+                # Print a message indicating the current page being scraped.
+                print(f"\nScraping page | {count}")
+
+                # Wait for a short time before scraping the next page.
+                await page.wait_for_timeout(timeout=random_interval(5)*1000)
+
+                # Loop through the products on the current page and extract their data.
+                # for content in main_contents:
+
+
+                for content in main_contents[:5]:
+                    product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
+                    product_link = f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['brand_product_link']), 'href')}"""
+
+
+                    # Go through this product's detail page, scrape it, and only then move on to the next product.
+                    datas = await self.product_details(product_url=product_link)
+                    daraz_dicts.append(datas)
+
+                # Click the "next page" button to go to the next page.
+                try:
+                    await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
+                    await next_page.click()
+                except PlaywrightTimeoutError:
+                    # If the "next page" button cannot be found, there are no more pages to scrape.
+                    # Print a message indicating the error and break out of the loop.
+                    print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
+                    break
+            # Close the browser.
+            await browser.close()
+        return daraz_dicts
+
+    async def export_to_mongo(self):
+        """
+        Asynchronously exports scraped data to a MongoDB database.
+
+        Steps:
+        1. Obtains the collection name by calling the `category_name()` method asynchronously.
+        2. Establishes a connection to the local MongoDB server on port 27017.
+        3. Selects the 'daraz' database from the client.
+        4. Fetches data by calling the `scrape_datas()` method asynchronously.
+        5. Inserts the fetched data into the specified collection in the database using `insert_many()`.
+        6. Closes the MongoDB client.
+
+        Returns:
+        pymongo.results.InsertManyResult: The result object containing information about the insertion operation.
+        """
+        collection_name = await self.category_name()
+        client = mong.MongoClient('mongodb://localhost:27017/')
+        db = client['daraz']
+        collection = db[collection_name]
+        print(f"Collecting {collection_name} to Mongo database.")
+        datas = await self.scrape_datas()
+        result = collection.insert_many(datas)
+        client.close()
+        return result
+
+    async def export_to_sheet2(self):
+        # Obtain the file name
+        file_name = await self.category_name()
+        print(f"Exporting {file_name} to Google Sheets.")
+
+        # Authenticate with Google Sheets
+        client = authenticate_with_google_sheets()
+
+        # Spreadsheet ID
+        spreadsheet_id = "1yjQNaTMieHH__AbnR7VgRcc5VcRcvv7BsyVHUCXB1N0"
+
+        # Create a new sheet with the obtained file name
+        create_new_sheet(client, spreadsheet_id, file_name)
+
+        # Fetch data using the scrape_datas() method
+        datas = await self.scrape_datas()
+
+        # Convert data to DataFrame
+        df = pd.DataFrame(datas)
+
+        # Remove duplicates from DataFrame
+        df = df.drop_duplicates()
+
+        # Open the sheet
+        sheet = client.open_by_key(spreadsheet_id).worksheet(file_name)
+
+        # Write header row and make it bold
+        header_row = df.columns.tolist()
+        sheet.insert_row(header_row, 1)
+        sheet.format('A1:Z1', {'textFormat': {'bold': True}})
+        sheet.freeze(rows=1)  # Freeze the header row
+
+        # Insert data into the sheet
+        for i, data in enumerate(df.values, start=2):  # Start from row 2 after the header row
+            sheet.insert_row(list(data), i)
+
+        print(f"Data exported to Google Sheets sheet '{file_name}'.")
+
+    # async def export_to_sheet(self):
+    #     """
+    #     Asynchronously exports scraped data to an Excel sheet.
+
+    #     Steps:
+    #     1. Obtains the file name by calling the `category_name()` method asynchronously.
+    #     2. Creates a 'Daraz database' directory if it doesn't exist.
+    #     3. Fetches data by calling the `scrape_datas()` method asynchronously.
+    #     4. Converts the data into a Pandas DataFrame.
+    #     5. Writes the DataFrame to an Excel file located at 'Daraz database/{file_name}.xlsx'.
+
+    #     Note:
+    #     The function assumes that the `scrape_datas()` method returns a list of dictionaries, each representing a row of data.
+
+    #     Returns:
+    #     None
+    #     """
+    #     file_name = await self.category_name()
+    #     print(f"Exporting {file_name} to Excel database.")
+    #     create_path('Daraz database')
+    #     datas = await self.scrape_datas()
+    #     df = pd.DataFrame(datas)
+    #     df.to_excel(f"Daraz database//{file_name}.xlsx", index = False)
+
+    async def export_to_sheet(self):
+        file_name = await self.category_name()
+        print(f"Exporting {file_name} to Excel database.")
+        create_path('Daraz Details')
+        datas = await self.scrape_details()
+        df = pd.DataFrame(datas)
+        print(df)
+        df.to_excel(f"Daraz Details//{file_name[:9]} Details .xlsx", index = False)
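Note: besides the FastAPI routes in main.py, the class above can be driven directly. A rough sketch (not part of the commit) is shown below; the category URL is an example, export_to_mongo() expects a MongoDB instance on localhost:27017, and export_to_sheet2() calls authenticate_with_google_sheets() and create_new_sheet(), which are not defined in tools/functionalities.py as committed and are presumably expected from elsewhere.

# Hypothetical driver script; the URL and the commented-out calls are assumptions.
import asyncio
from scrapers.daraz_scraper import Daraz

async def main():
    daraz = Daraz("https://www.daraz.com.np/smartphones/")   # example category URL
    rows = await daraz.scrape_datas()            # list of product dicts
    print(f"Scraped {len(rows)} products")
    # await daraz.export_to_mongo()              # needs MongoDB on localhost:27017
    # await daraz.export_to_sheet()              # writes "Daraz Details/<name> Details .xlsx"

if __name__ == "__main__":
    asyncio.run(main())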
scrapers/scrape_details_all.py
ADDED
@@ -0,0 +1,53 @@
+async def scrape_details(self):
+    daraz_dicts = []
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(user_agent=userAgents())
+        page = await context.new_page()
+        await page.goto(self.base_url)
+
+        country = await check_domain(self.base_url)
+
+        category = await self.category_name()
+        page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
+        self.last_page_number = int(await (page_number_elements[len(page_number_elements) - 2]).get_attribute('title'))
+        print(f"Category: {category} | Number of pages: {self.last_page_number}")
+
+        next_page = await page.query_selector(self.yaml_me['next_page_button'])
+
+        for count in range(1, self.last_page_number + 1):
+            main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
+            print(f"\nScraping page | {count}")
+
+            # await page.wait_for_timeout(timeout=random_interval(5) * 1000)
+
+            # List to store coroutines for scraping product details
+            coroutines = []
+
+            for content in main_contents:
+                product_link = f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_link']), 'href')}"""
+
+                print(f"Product Link: {product_link}")
+
+                # Append the coroutine to the list
+                coroutines.append(self.product_details(product_link))
+
+            # Execute all coroutines concurrently and wait for them to finish
+            product_details_results = await asyncio.gather(*coroutines)
+            print(product_details_results)
+
+            # print(coroutines)
+
+            # Append the batch of results (a list) to daraz_dicts
+            daraz_dicts.append(product_details_results)
+
+            try:
+                await page.wait_for_selector(self.yaml_me['next_page_button'], timeout=10000)
+                await next_page.click()
+            except PlaywrightTimeoutError:
+                print(f"Content loading error at page number {count}. There are no results found beyond this page. Scraper is exiting......")
+                break
+
+        await browser.close()
+    return daraz_dicts
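Note: this file contains only the method body, with no imports and no enclosing class. To run it, one would presumably paste it into the Daraz class (it references self.base_url, self.yaml_me and self.catchClause) and add at least the imports sketched below; asyncio in particular is required for asyncio.gather, and nothing in the file currently imports it.

# Imports this snippet appears to depend on (assumption: it is meant to live
# inside the Daraz class from scrapers/daraz_scraper.py).
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from tools.functionalities import userAgents, TryExcept, yamlMe, check_domain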
scrapers/scraper_daraz.py
ADDED
@@ -0,0 +1,154 @@
+from tools.functionalities import userAgents, TryExcept, yamlMe, check_domain, random_interval, create_path
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
+from bs4 import BeautifulSoup
+import requests
+import re
+
+
+class Daraz:
+    def __init__(self, base_url):
+        self.base_url = base_url
+        self.headers = {"User-Agent": userAgents()}
+        self.catchClause = TryExcept()
+        self.yaml_me = yamlMe('selectors')
+
+    async def category_name(self):
+        req = requests.get(self.base_url, headers=self.headers)
+        soup = BeautifulSoup(req.content, 'lxml')
+        category = [cate.text.strip() for cate in soup.find('ul', class_='breadcrumb').find_all('li', class_='breadcrumb_item')][-1]
+        name = [nam.strip() for nam in re.split(r'[,/]', category)]
+        return ' '.join(name)
+
+    async def product_details(self, product_url):
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless = True)
+            context = await browser.new_context(user_agent = userAgents())
+            page = await context.new_page()
+            await page.goto(product_url)
+
+            # wait for a few seconds
+            # await page.wait_for_timeout(timeout=random_interval(10)*1000)
+            # sleep()
+
+            datas = {
+                "Name": await self.catchClause.text(page.query_selector(self.yaml_me['product_name'])),
+                "Discount price": await self.catchClause.text(page.query_selector(self.yaml_me['product_dc_price'])),
+                "Original price": await self.catchClause.text(page.query_selector(self.yaml_me['product_og_price'])),
+                "Sold by": await self.catchClause.text(page.query_selector(self.yaml_me['store'])),
+                "Store link": f"""https:{await self.catchClause.attributes(page.query_selector(self.yaml_me['store']), 'href')}""",
+                "Hyperlink": product_url,
+                "Image": await self.catchClause.attributes(page.query_selector(self.yaml_me['image_link']), 'src'),
+            }
+
+            # select product_details_description
+            product_details = await self.catchClause.text_all(await page.query_selector_all(self.yaml_me['product_details']))
+
+            if product_details:
+                datas['product_details'] = product_details
+
+            # product_specifications
+            elements = await page.query_selector_all(self.yaml_me['product_specifications'])
+
+            if elements:
+                for element in elements:
+                    key, value = await self.catchClause.extract_key_value(element)
+                    datas[key] = value
+            else:
+                print("No product specifications found.")
+
+            # await browser.new_page()
+            await browser.close()
+            return datas
+
+    async def scrape_products(self):
+        daraz_dicts = []
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(user_agent = userAgents())
+
+            page = await context.new_page()
+
+            await page.goto(self.base_url)
+
+            # Determine the country from the URL.
+            country = await check_domain(self.base_url)
+
+            print(f"""Initiating the automation | Powered by Playwright.\n
+                  Daraz {country}
+                  """)
+
+            # Get the name of the category being scraped.
+            category = await self.category_name()
+
+            # Get the total number of pages in the category.
+            page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
+
+            self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
+
+            print(f"Category: {category} | Number of pages: {self.last_page_number}")
+
+            # Get the "next page" button.
+            next_page = await page.query_selector(self.yaml_me['next_page_button'])
+
+            # Loop through the pages using the "next page" button.
+            for count in range(1, self.last_page_number+1):
+                main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
+
+                # Print a message indicating the current page being scraped.
+                print(f"\nScraping page | {count}")
+
+                # Wait for a short time before scraping the next page.
+                await page.wait_for_timeout(timeout=random_interval(5)*1000)
+
+                # Loop through the products on the current page and extract their data.
+                for content in main_contents:
+                    product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
+
+                    print(f"Scrapping Product: {product_name}")
+
+                    try:
+                        dc_price = float(re.sub(r'[Rs.,]', '', await (await content.query_selector(self.yaml_me['category_discount_price'])).inner_text()).strip())
+                    except Exception as e:
+                        dc_price = "N/A"
+
+                    # If there is no separate original price, treat the discount price as the original price.
+                    try:
+                        og_price = float(re.sub(r'[Rs.,]', '', await (await content.query_selector(self.yaml_me['category_og_price'])).inner_text()).strip())
+                    except Exception as e:
+                        og_price = dc_price
+                        dc_price = 'N/A'
+                    try:
+                        # Compute the discount percentage from the two prices.
+                        # dc_rate = float(re.sub(r'[-%]', '', await (await content.query_selector(self.yaml_me['category_discount_rate'])).inner_text()).strip())
+                        dc_rate = ((og_price - dc_price) / og_price) * 100
+                        dc_rate = round(dc_rate, 2)
+                    except Exception as e:
+                        dc_rate = "N/A"
+
+                    await page.wait_for_timeout(timeout=0.03*1000)
+
+                    datas = {
+                        "Name": product_name,
+                        "Original price": og_price,
+                        "Discount price": dc_price,
+                        "Discount rate": dc_rate,
+                        "Hyperlink": f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_link']), 'href')}""",
+                        "Image": await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_image']), 'src'),
+                    }
+
+                    daraz_dicts.append(datas)
+
+                # Click the "next page" button to go to the next page.
+                try:
+                    await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
+                    await next_page.click()
+                except PlaywrightTimeoutError:
+                    # If the "next page" button cannot be found, there are no more pages to scrape.
+                    # Print a message indicating the error and break out of the loop.
+                    print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
+                    break
+            # Close the browser.
+            await browser.close()
+        return daraz_dicts
scrapers/selectors.yaml
ADDED
@@ -0,0 +1,72 @@
+# CSS selectors for product category elements:
+# category_main_contents: "div[data-qa-locator='product-item']"
+category_main_contents: "div.gridItem--Yd0sa"
+# category_main_contents: "div[data-qa-locator='product-item']:not(:has(*))"
+
+
+category_product_names: "div.title-wrapper--IaQ0m"
+
+# category_product_links: "a.product-card--vHfY9"
+category_product_link: "a.product-card--vHfY9"
+
+# category_product_image: "div.image-wrapper--ydch1 img"
+category_product_image: "img#id-img"
+category_discount_price: "div.current-price--Jklkc span.currency--GVKjl"
+category_discount_rate: "span.discount--HADrg"
+category_og_price: "div.original-price--lHYOH del.currency--GVKjl"
+
+last_page_number: "li[tabindex='0']"
+next_page_button: "li[title='Next Page']"
+
+
+# CSS selectors for individual product link:
+product_name: "span.pdp-mod-product-badge-title"
+ratings: "div.review-info-rate span.score"
+product_dc_price: "div.pdp-mod-product-price span.pdp-price.pdp-price_type_normal.pdp-price_color_orange.pdp-price_size_xl"
+product_og_price: "div.pdp-mod-product-price div.origin-block span:first-child"
+image_link: "img.gallery-preview-panel__image"
+store: "div.seller-name__detail a:first-child"
+
+product_details: "div > div.pdp-product-desc > div.html-content.pdp-product-highlights > ul > li"
+product_specifications: "div > div.pdp-product-desc > div.pdp-mod-specification > div.pdp-general-features > ul > li"
+
+
+# specifications
+brand: "div.pdp-general-features ul li.key-li:nth-child(1) div.key-value"
+sku: "div.pdp-general-features ul li.key-li:nth-child(2) div.key-value"
+battery_capacity: "div.pdp-general-features ul li.key-li:nth-child(3) div.key-value"
+ppi: "div.pdp-general-features ul li.key-li:nth-child(4) div.key-value"
+charger_type: "div.pdp-general-features ul li.key-li:nth-child(5) div.key-value"
+flash: "div.pdp-general-features ul li.key-li:nth-child(6) div.key-value"
+screen_size_inches: "div.pdp-general-features ul li.key-li:nth-child(7) div.key-value"
+bluetooth_support: "div.pdp-general-features ul li.key-li:nth-child(8) div.key-value"
+build_type: "div.pdp-general-features ul li.key-li:nth-child(9) div.key-value"
+wifi: "div.pdp-general-features ul li.key-li:nth-child(10) div.key-value"
+sim_type: "div.pdp-general-features ul li.key-li:nth-child(11) div.key-value"
+removable_battery: "div.pdp-general-features ul li.key-li:nth-child(12) div.key-value"
+gps: "div.pdp-general-features ul li.key-li:nth-child(13) div.key-value"
+camera_front_megapixels: "div.pdp-general-features ul li.key-li:nth-child(14) div.key-value"
+fm_radio: "div.pdp-general-features ul li.key-li:nth-child(15) div.key-value"
+nfc: "div.pdp-general-features ul li.key-li:nth-child(16) div.key-value"
+fingerprint_sensor: "div.pdp-general-features ul li.key-li:nth-child(17) div.key-value"
+expandable_storage: "div.pdp-general-features ul li.key-li:nth-child(18) div.key-value"
+memory_card_slot_type: "div.pdp-general-features ul li.key-li:nth-child(19) div.key-value"
+camera_back_megapixels: "div.pdp-general-features ul li.key-li:nth-child(20) div.key-value"
+number_of_cameras: "div.pdp-general-features ul li.key-li:nth-child(21) div.key-value"
+display_protection: "div.pdp-general-features ul li.key-li:nth-child(22) div.key-value"
+model_year: "div.pdp-general-features ul li.key-li:nth-child(23) div.key-value"
+video_resolution: "div.pdp-general-features ul li.key-li:nth-child(24) div.key-value"
+network_connections: "div.pdp-general-features ul li.key-li:nth-child(25) div.key-value"
+operating_system: "div.pdp-general-features ul li.key-li:nth-child(26) div.key-value"
+notch_display: "div.pdp-general-features ul li.key-li:nth-child(27) div.key-value"
+fast_charging: "div.pdp-general-features ul li.key-li:nth-child(28) div.key-value"
+model: "div.pdp-general-features ul li.key-li:nth-child(29) div.key-value"
+headphone_jack: "div.pdp-general-features ul li.key-li:nth-child(30) div.key-value"
+wireless_charging: "div.pdp-general-features ul li.key-li:nth-child(31) div.key-value"
+display_type: "div.pdp-general-features ul li.key-li:nth-child(32) div.key-value"
+processor_type: "div.pdp-general-features ul li.key-li:nth-child(33) div.key-value"
+refresh_rate: "div.pdp-general-features ul li.key-li:nth-child(34) div.key-value"
+number_of_sim_slots: "div.pdp-general-features ul li.key-li:nth-child(35) div.key-value"
+resolution: "div.pdp-general-features ul li.key-li:nth-child(36) div.key-value"
+
+
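Note: these keys are read through yamlMe('selectors') in the scrapers and then handed to Playwright as CSS selectors. A rough sketch of that lookup (assuming the process is started from the project root, as the scrapers do):

# Loads the selector map and resolves two keys from the file above.
import yaml

with open("scrapers/selectors.yaml") as fh:
    selectors = yaml.safe_load(fh)

print(selectors["product_name"])        # "span.pdp-mod-product-badge-title"
print(selectors["next_page_button"])    # "li[title='Next Page']"
# In the scrapers these strings go to page.query_selector(...) / query_selector_all(...).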
tools/__pycache__/functionalities.cpython-310.pyc
ADDED
Binary file (5.04 kB).
tools/functionalities.py
ADDED
@@ -0,0 +1,173 @@
+import itertools
+import random
+import yaml
+import os
+import re
+import os
+
+class TryExcept:
+    """
+    A class that provides try-except functionality for retrieving element attributes and inner text.
+
+    Methods:
+    - text(element): Retrieves the inner text of an element or returns "N/A" if the element has no inner text.
+    - attributes(element, attr): Retrieves the value of the given attribute, or "N/A" if it cannot be read.
+    """
+    async def text(self, element):
+        try:
+            elements = (await (await element).inner_text()).strip()
+        except AttributeError:
+            elements = "N/A"
+        return elements
+
+    async def attributes(self, element, attr):
+        try:
+            elements = await (await element).get_attribute(attr)
+        except AttributeError:
+            elements = "N/A"
+        return elements
+
+    async def text_all(self, elements):
+        texts = []
+        for element in elements:
+            try:
+                text = (await element.inner_text()).strip()
+            except AttributeError:
+                text = "N/A"
+            texts.append(text)
+        return texts
+
+    async def extract_key_value(self, element):
+        try:
+            key_element = await element.query_selector('.key-title')
+            value_element = await element.query_selector('.key-value')
+            if key_element and value_element:
+                key = (await key_element.inner_text()).strip()
+                value = (await value_element.inner_text()).strip()
+                return key, value
+            else:
+                return "N/A", "N/A"
+        except AttributeError:
+            return "N/A", "N/A"
+
+
+def create_path(dir_name):
+    # """
+    # Creates a directory with the specified name if it doesn't already exist.
+
+    # Args:
+    # - dir_name: A string representing the name of the directory to create.
+
+    # Return:
+    # - None
+    # """
+    path_dir = os.path.join(os.getcwd(), dir_name)
+    if os.path.exists(path_dir):
+        pass
+    else:
+        os.mkdir(path_dir)
+
+
+def verifyDarazURL(url):
+    """
+    Check if the URL belongs to a Daraz website.
+
+    Args:
+    - url (str): The URL to check.
+
+    Returns:
+    - bool: True if the URL is not a Daraz website, False otherwise.
+    """
+
+    daraz_pattern = re.search("""^https://www.daraz.(com.np|lk|pk|com.bd)/+""", url)
+    if daraz_pattern == None:
+        return True
+    else:
+        return False
+
+
+def random_interval(value):
+    """
+    This function takes an integer value as an argument and returns a random integer between 2 and the length
+    of a list containing values from 0 up to the given value.
+
+    Args:
+    - value (int): an integer value representing the upper limit of the range
+
+    Returns:
+    - rand_time (int): a random integer between 2 and value + 1 (the length of the range list)
+    """
+
+    ranges = [time for time in range(value+1)]
+    rand_time = random.randint(2, len(ranges))
+    return rand_time
+
+
+async def check_domain(url):
+    """
+    Check the domain of a URL and return the country it belongs to.
+
+    Args:
+    - url (str): The URL to check.
+
+    Returns:
+    - str: The name of the country the domain belongs to (Nepal, Sri Lanka, Bangladesh, Pakistan, or Myanmar).
+
+    """
+
+    pattern = re.search(r"(.np|.bd|.lk|.pk|.mm)", url)
+    domain_lists = {
+        'np': 'Nepal',
+        'lk': 'Sri Lanka',
+        'bd': 'Bangladesh',
+        'pk': 'Pakistan',
+        'mm': 'Myanmar',
+    }
+    try:
+        country = pattern.group(1).replace(".", '')
+    except AttributeError:
+        country = None
+    return domain_lists[country]
+
+
+def flat(d_lists):
+    """
+    Flatten a multi-dimensional list.
+
+    Args:
+    - d_lists (list): A multi-dimensional list.
+
+    Returns:
+    - list: A flattened version of the input list.
+    """
+
+    return list(itertools.chain(*d_lists))
+
+
+def yamlMe(selectors):
+    """
+    Loads a YAML file containing CSS selectors and their corresponding data fields, and returns the loaded data as a dictionary.
+
+    Args:
+    - selectors (str): The name of the YAML file to load.
+
+    Returns:
+    - dict: A dictionary containing CSS selectors and their corresponding data fields.
+    """
+
+    with open(f"scrapers\\{selectors}.yaml") as file:
+        sel = yaml.load(file, Loader = yaml.SafeLoader)
+    return sel
+
+
+def userAgents():
+    """
+    Loads a text file containing a list of user agent strings, and returns a random choice from the list.
+
+    Returns:
+    - str: A randomly chosen user agent string.
+    """
+    with open(f"{os.getcwd()}\\tools\\user-agents.txt") as f:
+        agents = f.read().split("\n")
+        return random.choice(agents)
+
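Note: yamlMe() opens "scrapers\{selectors}.yaml" and userAgents() opens "{cwd}\tools\user-agents.txt" with backslash separators, which only resolve on Windows; in a Linux container (such as a Hugging Face Space) those open() calls will fail. A cross-platform variant is sketched below (an assumption, not the committed code):

# Sketch of the same helpers using os.path.join instead of hard-coded backslashes.
import os
import random
import yaml

def yamlMe(selectors):
    with open(os.path.join("scrapers", f"{selectors}.yaml")) as file:
        return yaml.load(file, Loader=yaml.SafeLoader)

def userAgents():
    with open(os.path.join(os.getcwd(), "tools", "user-agents.txt")) as f:
        agents = f.read().split("\n")
    return random.choice(agents)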
tools/user-agents.txt
ADDED
The diff for this file is too large to render.