yrarjun commited on
Commit
4eda287
·
1 Parent(s): b3e2afc
main.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from scrapers.scraper_daraz import Daraz

app = FastAPI()


# CORS settings.
# NOTE(review): allow_credentials=True combined with a wildcard origin is
# rejected by browsers per the CORS spec — confirm whether credentialed
# requests are actually required here.
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE"],
    allow_headers=["*"],
)
18
+
19
@app.get("/")
async def read_root():
    """Landing endpoint confirming the API is up."""
    return {"message": "We gonna scrap a lot of datas"}
22
+
23
@app.get("/products/{url:path}/")
async def all_products(url: str):
    """Scrape every product listed on a Daraz category page.

    Args:
        url: Daraz category URL, possibly percent-encoded.

    Returns:
        List of product summary dicts from Daraz.scrape_products().
    """
    from urllib.parse import unquote

    # FIX: unquote() decodes every percent-escape (including lowercase
    # "%2f"); the old code only replaced the literal "%2F".
    decoded_url = unquote(url)
    daraz = Daraz(decoded_url)
    products = await daraz.scrape_products()
    return products
29
+
30
@app.get("/product-details/{url:path}/")
async def get_product_details(url: str):
    """Scrape the detail page of a single Daraz product.

    Args:
        url: Daraz product URL, possibly percent-encoded.

    Returns:
        Detail dict from Daraz.product_details().
    """
    from urllib.parse import unquote

    # FIX: unquote() decodes every percent-escape (including lowercase
    # "%2f"); the old code only replaced the literal "%2F".
    decoded_url = unquote(url)
    daraz = Daraz(decoded_url)
    product_details = await daraz.product_details(decoded_url)
    return product_details
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.6.0
2
+ anyio==4.3.0
3
+ beautifulsoup4==4.11.2
4
+ certifi==2024.2.2
5
+ charset-normalizer==3.3.2
6
+ click==8.1.7
7
+ colorama==0.4.6
8
+ exceptiongroup==1.2.0
9
+ fastapi==0.110.0
10
+ greenlet==2.0.1
11
+ h11==0.14.0
12
+ idna==3.6
13
+ lxml==4.9.2
14
+ playwright==1.32.0
15
+ pydantic==2.6.4
16
+ pydantic_core==2.16.3
17
+ pyee==9.0.4
18
+ PyYAML==6.0
19
+ requests==2.31.0
20
+ sniffio==1.3.1
21
+ soupsieve==2.5
22
+ starlette==0.36.3
23
+ typing_extensions==4.10.0
24
+ urllib3==2.2.1
25
+ uvicorn==0.29.0
scrapers/__pycache__/daraz_scraper.cpython-310.pyc ADDED
Binary file (8.42 kB). View file
 
scrapers/__pycache__/scraper_daraz.cpython-310.pyc ADDED
Binary file (4.84 kB). View file
 
scrapers/daraz_scraper.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools.functionalities import userAgents, TryExcept, yamlMe, check_domain, random_interval, verifyDarazURL
2
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
3
+ from bs4 import BeautifulSoup
4
+ import pymongo as mong
5
+ import pandas as pd
6
+ import requests
7
+ import re
8
+ from tools.functionalities import *;
9
+ from time import sleep
10
+
11
class Daraz:
    """Scraper for Daraz category and product pages (Playwright + BeautifulSoup)."""

    def __init__(self, base_url):
        # URL of the category (or product) this instance works on.
        self.base_url = base_url
        # Random desktop user agent for the plain-requests fetches.
        self.headers = {"User-Agent": userAgents()}
        # Helper turning missing-element errors into "N/A" values.
        self.catchClause = TryExcept()
        # CSS selectors loaded from scrapers/selectors.yaml.
        self.yaml_me = yamlMe('selectors')
17
+
18
+ async def category_name(self):
19
+ req = requests.get(self.base_url, headers=self.headers)
20
+ soup = BeautifulSoup(req.content, 'lxml')
21
+ category = [cate.text.strip() for cate in soup.find('ul', class_='breadcrumb').find_all('li', class_='breadcrumb_item')][-1]
22
+ name = [nam.strip() for nam in re.split(r'[,/]', category)]
23
+ return ' '.join(name)
24
+
25
+ async def product_details(self, product_url):
26
+
27
+ async with async_playwright() as p:
28
+ browser = await p.firefox.launch(headless = True)
29
+ context = await browser.new_context(user_agent = userAgents())
30
+ page = await context.new_page()
31
+ await page.goto(product_url)
32
+
33
+ # wait for few seconds
34
+ await page.wait_for_timeout(timeout=random_interval(10)*1000)
35
+ # sleep()
36
+
37
+
38
+ datas = {
39
+ "Name": await self.catchClause.text(page.query_selector(self.yaml_me['product_name'])),
40
+ "Discount price": await self.catchClause.text(page.query_selector(self.yaml_me['product_dc_price'])),
41
+ "Original price": await self.catchClause.text(page.query_selector(self.yaml_me['product_og_price'])),
42
+ "Sold by": await self.catchClause.text(page.query_selector(self.yaml_me['store'])),
43
+ "Store link": f"""https:{await self.catchClause.attributes(page.query_selector(self.yaml_me['store']), 'href')}""",
44
+ "Hyperlink": product_url,
45
+ "Image": await self.catchClause.attributes(page.query_selector(self.yaml_me['image_link']), 'src'),
46
+ }
47
+
48
+ # select product_details_description
49
+ product_details = await self.catchClause.text_all(await page.query_selector_all(self.yaml_me['product_details']))
50
+
51
+ if product_details:
52
+ datas['product_details'] = product_details
53
+
54
+ # product_specifications
55
+ elements = await page.query_selector_all(self.yaml_me['product_specifications'])
56
+
57
+ if elements:
58
+ for element in elements:
59
+ key, value = await self.catchClause.extract_key_value(element)
60
+ datas[key] = value
61
+ else:
62
+ print("No product specifications found.")
63
+
64
+ # await browser.close()
65
+ # await browser.close()
66
+ return datas
67
+
68
+ async def scrape_datas(self):
69
+ daraz_dicts = []
70
+
71
+ async with async_playwright() as p:
72
+ browser = await p.chromium.launch(headless=True)
73
+ context = await browser.new_context(user_agent = userAgents())
74
+
75
+ page = await context.new_page()
76
+
77
+ await page.goto(self.base_url)
78
+
79
+ # Determine the country from the URL.
80
+ country = await check_domain(self.base_url)
81
+
82
+ print(f"""Initiating the automation | Powered by Playwright.\n
83
+ Daraz {country}
84
+ """)
85
+
86
+ # Get the name of the category being scraped.
87
+ category = await self.category_name()
88
+
89
+ # Get the total number of pages in the category.
90
+ page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
91
+
92
+ self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
93
+
94
+ print(f"Category: {category} | Number of pages: {self.last_page_number}")
95
+
96
+ # Get the "next page" button.
97
+ next_page = await page.query_selector(self.yaml_me['next_page_button'])
98
+
99
+ # Loop through the page using the "next page button".
100
+ for count in range(1, self.last_page_number+1):
101
+ main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
102
+
103
+ # Print a message indicating the current page being scraped.
104
+ print(f"\nScraping page | {count}")
105
+
106
+ # Wait for a short time before scraping the next page.
107
+ await page.wait_for_timeout(timeout=random_interval(5)*1000)
108
+
109
+ # Loop through the products on the current page and extract their data.
110
+ for content in main_contents:
111
+ product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
112
+
113
+ try:
114
+ dc_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_discount_price'])).inner_text()).strip())
115
+ except Exception as e:
116
+ dc_price = "N/A"
117
+
118
+ # if oc_price is not oc is dc_price
119
+ try:
120
+ og_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_og_price'])).inner_text()).strip())
121
+ except Exception as e:
122
+ og_price = dc_price
123
+ dc_price = 'N/A'
124
+ try:
125
+ # find dc% by calcuation
126
+ # dc_rate = float(re.sub(r'[-%]', '', await (await content.query_selector(self.yaml_me['category_discount_rate'])).inner_text()).strip())
127
+ dc_rate = ((og_price - dc_price) / og_price) * 100
128
+ dc_rate = round(dc_rate,2)
129
+ except Exception as e:
130
+ dc_rate = "N/A"
131
+
132
+ await page.wait_for_timeout(timeout=0.03*1000)
133
+
134
+ datas = {
135
+ "Name": product_name,
136
+ "Original price": og_price,
137
+ "Discount price": dc_price,
138
+ "Discount rate": dc_rate,
139
+ "Hyperlink": f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['brand_product_link']), 'href')}""",
140
+ "Image": await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_image']), 'src') ,
141
+ }
142
+
143
+ daraz_dicts.append(datas)
144
+
145
+ # Click the "next page" button to go to the next page.
146
+ try:
147
+ await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
148
+ await next_page.click()
149
+ except PlaywrightTimeoutError:
150
+ # If the "next page" button cannot be found, there are no more pages to scrape.
151
+ # Print a message indicating the error and break out of the loop.
152
+ print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
153
+ break
154
+ # Close the browser.
155
+ await browser.close()
156
+ return daraz_dicts
157
+
158
+ async def scrape_details(self):
159
+ daraz_dicts = []
160
+
161
+ async with async_playwright() as p:
162
+ browser = await p.chromium.launch(headless=True)
163
+ context = await browser.new_context(user_agent = userAgents())
164
+ page = await context.new_page()
165
+ await page.goto(self.base_url)
166
+
167
+ # Determine the country from the URL.
168
+ country = await check_domain(self.base_url)
169
+ print(f"""Initiating the automation | Powered by Playwright.\n
170
+ Daraz {country}
171
+ """)
172
+
173
+ # Get the name of the category being scraped.
174
+ category = await self.category_name()
175
+
176
+ # Get the total number of pages in the category.
177
+ page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
178
+
179
+ self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
180
+
181
+ print(f"Category: {category} | Number of pages: {self.last_page_number}")
182
+
183
+ # Get the "next page" button.
184
+ next_page = await page.query_selector(self.yaml_me['next_page_button'])
185
+
186
+ # Loop through the page using the "next page button".
187
+ for count in range(1, self.last_page_number+1):
188
+ main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
189
+
190
+ # Print a message indicating the current page being scraped.
191
+ print(f"\nScraping page | {count}")
192
+
193
+ # Wait for a short time before scraping the next page.
194
+ await page.wait_for_timeout(timeout=random_interval(5)*1000)
195
+
196
+ # Loop through the products on the current page and extract their data.
197
+ # for content in main_contents:
198
+
199
+
200
+ for content in main_contents[:5]:
201
+ product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
202
+ product_link = f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['brand_product_link']), 'href')}"""
203
+
204
+
205
+ # here waht i want to got through product details and scrae and only go to next product
206
+ datas = await self.product_details(product_url=product_link)
207
+ daraz_dicts.append(datas)
208
+
209
+ # Click the "next page" button to go to the next page.
210
+ try:
211
+ await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
212
+ await next_page.click()
213
+ except PlaywrightTimeoutError:
214
+ # If the "next page" button cannot be found, there are no more pages to scrape.
215
+ # Print a message indicating the error and break out of the loop.
216
+ print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
217
+ break
218
+ # Close the browser.
219
+ await browser.close()
220
+ return daraz_dicts
221
+
222
+ async def export_to_mongo(self):
223
+ """
224
+ Asynchronously exports scraped data to a MongoDB database.
225
+
226
+ Steps:
227
+ 1. Obtains the collection name by calling the `category_name()` method asynchronously.
228
+ 2. Establishes a connection to the local MongoDB server on port 27017.
229
+ 3. Selects the 'daraz' database from the client.
230
+ 4. Fetches data by calling the `scrape_datas()` method asynchronously.
231
+ 5. Inserts the fetched data into the specified collection in the database using `insert_many()`.
232
+ 6. Closes the MongoDB client.
233
+
234
+ Returns:
235
+ pymongo.results.InsertManyResult: The result object containing information about the insertion operation.
236
+ """
237
+ collection_name = await self.category_name()
238
+ client = mong.MongoClient('mongodb://localhost:27017/')
239
+ db = client['daraz']
240
+ collection = db[collection_name]
241
+ print(f"Collecting {collection_name} to Mongo database.")
242
+ datas = await self.scrape_datas()
243
+ result = collection.insert_many(datas)
244
+ client.close()
245
+ return result
246
+
247
+ async def export_to_sheet2(self):
248
+ # Obtain the file name
249
+ file_name = await self.category_name()
250
+ print(f"Exporting {file_name} to Google Sheets.")
251
+
252
+ # Authenticate with Google Sheets
253
+ client = authenticate_with_google_sheets()
254
+
255
+ # Spreadsheet ID
256
+ spreadsheet_id = "1yjQNaTMieHH__AbnR7VgRcc5VcRcvv7BsyVHUCXB1N0"
257
+
258
+ # Create a new sheet with the obtained file name
259
+ create_new_sheet(client, spreadsheet_id, file_name)
260
+
261
+ # Fetch data using the scrape_datas() method
262
+ datas = await self.scrape_datas()
263
+
264
+ # Convert data to DataFrame
265
+ df = pd.DataFrame(datas)
266
+
267
+ # Remove duplicates from DataFrame
268
+ df = df.drop_duplicates()
269
+
270
+ # Open the sheet
271
+ sheet = client.open_by_key(spreadsheet_id).worksheet(file_name)
272
+
273
+ # Write header row and make it bold
274
+ header_row = df.columns.tolist()
275
+ sheet.insert_row(header_row, 1)
276
+ sheet.format('A1:Z1', {'textFormat': {'bold': True}})
277
+ sheet.freeze(rows=1) # Freeze the header row
278
+
279
+ # Insert data into the sheet
280
+ for i, data in enumerate(df.values, start=2): # Start from row 2 after the header row
281
+ sheet.insert_row(list(data), i)
282
+
283
+ print(f"Data exported to Google Sheets sheet '{file_name}'.")
284
+
285
+ # async def export_to_sheet(self):
286
+ # """
287
+ # Asynchronously exports scraped data to an Excel sheet.
288
+
289
+ # Steps:
290
+ # 1. Obtains the file name by calling the `category_name()` method asynchronously.
291
+ # 2. Creates a 'Daraz database' directory if it doesn't exist.
292
+ # 3. Fetches data by calling the `scrape_datas()` method asynchronously.
293
+ # 4. Converts the data into a Pandas DataFrame.
294
+ # 5. Writes the DataFrame to an Excel file located at 'Daraz database/{file_name}.xlsx'.
295
+
296
+ # Note:
297
+ # The function assumes that the `scrape_datas()` method returns a list of dictionaries, each representing a row of data.
298
+
299
+ # Returns:
300
+ # None
301
+ # """
302
+ # file_name = await self.category_name()
303
+ # print(f"Exporting {file_name} to Excel database.")
304
+ # create_path('Daraz database')
305
+ # datas = await self.scrape_datas()
306
+ # df = pd.DataFrame(datas)
307
+ # df.to_excel(f"Daraz database//{file_name}.xlsx", index = False)
308
+
309
+ async def export_to_sheet(self):
310
+ file_name = await self.category_name()
311
+ print(f"Exporting {file_name} to Excel database.")
312
+ create_path('Daraz Details')
313
+ datas = await self.scrape_details()
314
+ df = pd.DataFrame(datas)
315
+ print(df)
316
+ df.to_excel(f"Daraz Details//{file_name[:9]} Details .xlsx", index = False)
scrapers/scrape_details_all.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
async def scrape_details(self):
    """Crawl every category page and scrape full product details, fetching the
    detail pages of one listing page concurrently via asyncio.gather.

    NOTE(review): this module has no imports of its own — asyncio,
    async_playwright, PlaywrightTimeoutError, userAgents and check_domain must
    already be in scope wherever this function is pasted; confirm before using
    it stand-alone.

    Returns:
        list[dict]: one product_details() dict per product.
    """
    daraz_dicts = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=userAgents())
        page = await context.new_page()
        await page.goto(self.base_url)

        country = await check_domain(self.base_url)

        category = await self.category_name()
        page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
        self.last_page_number = int(await page_number_elements[-2].get_attribute('title'))
        print(f"Category: {category} | Number of pages: {self.last_page_number}")

        for count in range(1, self.last_page_number + 1):
            main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
            print(f"\nScraping page | {count}")

            # One product_details() coroutine per listing on this page.
            coroutines = []
            for content in main_contents:
                product_link = f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_link']), 'href')}"""
                print(f"Product Link: {product_link}")
                coroutines.append(self.product_details(product_link))

            # Execute all coroutines concurrently and wait for them to finish.
            product_details_results = await asyncio.gather(*coroutines)
            print(product_details_results)

            # FIX: extend (not append) so the result is a flat list of dicts
            # rather than one nested list per page — matching the original
            # "Extend daraz_dicts" comment's stated intent.
            daraz_dicts.extend(product_details_results)

            # FIX: re-query the "next page" button every iteration — the
            # handle grabbed once before the loop goes stale after navigation.
            try:
                await page.wait_for_selector(self.yaml_me['next_page_button'], timeout=10000)
                next_page = await page.query_selector(self.yaml_me['next_page_button'])
                await next_page.click()
            except PlaywrightTimeoutError:
                print(f"Content loading error at page number {count}. There are no results found beyond this page. Scraper is exiting......")
                break

        await browser.close()
    return daraz_dicts
scrapers/scraper_daraz.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools.functionalities import userAgents, TryExcept, yamlMe, check_domain, random_interval, create_path
2
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
3
+ from bs4 import BeautifulSoup
4
+ import requests
5
+ import re
6
+
7
+
8
class Daraz:
    """Daraz category/product scraper used by the FastAPI endpoints."""

    def __init__(self, base_url):
        # Category (or product) URL this scraper instance targets.
        self.base_url = base_url
        # Random desktop user agent for the plain-requests fetch.
        self.headers = {"User-Agent": userAgents()}
        # Wraps element access so missing elements become "N/A".
        self.catchClause = TryExcept()
        # CSS selectors from scrapers/selectors.yaml.
        self.yaml_me = yamlMe('selectors')
14
+
15
+ async def category_name(self):
16
+ req = requests.get(self.base_url, headers=self.headers)
17
+ soup = BeautifulSoup(req.content, 'lxml')
18
+ category = [cate.text.strip() for cate in soup.find('ul', class_='breadcrumb').find_all('li', class_='breadcrumb_item')][-1]
19
+ name = [nam.strip() for nam in re.split(r'[,/]', category)]
20
+ return ' '.join(name)
21
+
22
+ async def product_details(self, product_url):
23
+
24
+ async with async_playwright() as p:
25
+ browser = await p.chromium.launch(headless = True)
26
+ context = await browser.new_context(user_agent = userAgents())
27
+ page = await context.new_page()
28
+ await page.goto(product_url)
29
+
30
+ # wait for few seconds
31
+ # await page.wait_for_timeout(timeout=random_interval(10)*1000)
32
+ # sleep()
33
+
34
+ datas = {
35
+ "Name": await self.catchClause.text(page.query_selector(self.yaml_me['product_name'])),
36
+ "Discount price": await self.catchClause.text(page.query_selector(self.yaml_me['product_dc_price'])),
37
+ "Original price": await self.catchClause.text(page.query_selector(self.yaml_me['product_og_price'])),
38
+ "Sold by": await self.catchClause.text(page.query_selector(self.yaml_me['store'])),
39
+ "Store link": f"""https:{await self.catchClause.attributes(page.query_selector(self.yaml_me['store']), 'href')}""",
40
+ "Hyperlink": product_url,
41
+ "Image": await self.catchClause.attributes(page.query_selector(self.yaml_me['image_link']), 'src'),
42
+ }
43
+
44
+ # select product_details_description
45
+ product_details = await self.catchClause.text_all(await page.query_selector_all(self.yaml_me['product_details']))
46
+
47
+ if product_details:
48
+ datas['product_details'] = product_details
49
+
50
+ # product_specifications
51
+ elements = await page.query_selector_all(self.yaml_me['product_specifications'])
52
+
53
+ if elements:
54
+ for element in elements:
55
+ key, value = await self.catchClause.extract_key_value(element)
56
+ datas[key] = value
57
+ else:
58
+ print("No product specifications found.")
59
+
60
+ # await browser.new_page()
61
+ await browser.close()
62
+ return datas
63
+
64
+ async def scrape_products(self):
65
+ daraz_dicts = []
66
+
67
+ async with async_playwright() as p:
68
+ browser = await p.chromium.launch(headless=True)
69
+ context = await browser.new_context(user_agent = userAgents())
70
+
71
+ page = await context.new_page()
72
+
73
+ await page.goto(self.base_url)
74
+
75
+ # Determine the country from the URL.
76
+ country = await check_domain(self.base_url)
77
+
78
+ print(f"""Initiating the automation | Powered by Playwright.\n
79
+ Daraz {country}
80
+ """)
81
+
82
+ # Get the name of the category being scraped.
83
+ category = await self.category_name()
84
+
85
+ # Get the total number of pages in the category.
86
+ page_number_elements = await page.query_selector_all(self.yaml_me['last_page_number'])
87
+
88
+ self.last_page_number = int(await (page_number_elements[len(page_number_elements)-2]).get_attribute('title'))
89
+
90
+ print(f"Category: {category} | Number of pages: {self.last_page_number}")
91
+
92
+ # Get the "next page" button.
93
+ next_page = await page.query_selector(self.yaml_me['next_page_button'])
94
+
95
+ # Loop through the page using the "next page button".
96
+ for count in range(1, self.last_page_number+1):
97
+ main_contents = await page.query_selector_all(self.yaml_me['category_main_contents'])
98
+
99
+ # Print a message indicating the current page being scraped.
100
+ print(f"\nScraping page | {count}")
101
+
102
+ # Wait for a short time before scraping the next page.
103
+ await page.wait_for_timeout(timeout=random_interval(5)*1000)
104
+
105
+ # Loop through the products on the current page and extract their data.
106
+ for content in main_contents:
107
+ product_name = await self.catchClause.text(content.query_selector(self.yaml_me['category_product_names']))
108
+
109
+ print(f"Scrapping Product: {product_name}")
110
+
111
+ try:
112
+ dc_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_discount_price'])).inner_text()).strip())
113
+ except Exception as e:
114
+ dc_price = "N/A"
115
+
116
+ # if oc_price is not oc is dc_price
117
+ try:
118
+ og_price = float(re.sub(r'[Rs.,]', '', await ( await content.query_selector(self.yaml_me['category_og_price'])).inner_text()).strip())
119
+ except Exception as e:
120
+ og_price = dc_price
121
+ dc_price = 'N/A'
122
+ try:
123
+ # find dc% by calcuation
124
+ # dc_rate = float(re.sub(r'[-%]', '', await (await content.query_selector(self.yaml_me['category_discount_rate'])).inner_text()).strip())
125
+ dc_rate = ((og_price - dc_price) / og_price) * 100
126
+ dc_rate = round(dc_rate,2)
127
+ except Exception as e:
128
+ dc_rate = "N/A"
129
+
130
+ await page.wait_for_timeout(timeout=0.03*1000)
131
+
132
+ datas = {
133
+ "Name": product_name,
134
+ "Original price": og_price,
135
+ "Discount price": dc_price,
136
+ "Discount rate": dc_rate,
137
+ "Hyperlink": f"""https:{await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_link']), 'href')}""",
138
+ "Image": await self.catchClause.attributes(content.query_selector(self.yaml_me['category_product_image']), 'src') ,
139
+ }
140
+
141
+ daraz_dicts.append(datas)
142
+
143
+ # Click the "next page" button to go to the next page.
144
+ try:
145
+ await page.wait_for_selector(self.yaml_me['next_page_button'], timeout = 10000)
146
+ await next_page.click()
147
+ except PlaywrightTimeoutError:
148
+ # If the "next page" button cannot be found, there are no more pages to scrape.
149
+ # Print a message indicating the error and break out of the loop.
150
+ print(f"Content loading error at page number {count}. There are no result found beyond this page. Scraper is exiting......")
151
+ break
152
+ # Close the browser.
153
+ await browser.close()
154
+ return daraz_dicts
scrapers/selectors.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CSS selectors for product category elements:
2
+ # category_main_contents: "div[data-qa-locator='product-item']"
3
+ category_main_contents: "div.gridItem--Yd0sa"
4
+ # category_main_contents: "div[data-qa-locator='product-item']:not(:has(*))"
5
+
6
+
7
+ category_product_names: "div.title-wrapper--IaQ0m"
8
+
9
+ # category_product_links: "a.product-card--vHfY9"
10
+ category_product_link: "a.product-card--vHfY9"
11
+
12
+ # category_product_image: "div.image-wrapper--ydch1 img"
13
+ category_product_image: "img#id-img"
14
+ category_discount_price: "div.current-price--Jklkc span.currency--GVKjl"
15
+ category_discount_rate: "span.discount--HADrg"
16
+ category_og_price: "div.original-price--lHYOH del.currency--GVKjl"
17
+
18
+ last_page_number: "li[tabindex='0']"
19
+ next_page_button: "li[title='Next Page']"
20
+
21
+
22
+ # CSS selectors for individual product link:
23
+ product_name: "span.pdp-mod-product-badge-title"
24
+ ratings: "div.review-info-rate span.score"
25
+ product_dc_price: "div.pdp-mod-product-price span.pdp-price.pdp-price_type_normal.pdp-price_color_orange.pdp-price_size_xl"
26
+ product_og_price: "div.pdp-mod-product-price div.origin-block span:first-child"
27
+ image_link: "img.gallery-preview-panel__image"
28
+ store: "div.seller-name__detail a:first-child"
29
+
30
+ product_details: "div > div.pdp-product-desc > div.html-content.pdp-product-highlights > ul > li"
31
+ product_specifications: "div > div.pdp-product-desc > div.pdp-mod-specification > div.pdp-general-features > ul > li"
32
+
33
+
34
+ # specifications
35
+ brand: "div.pdp-general-features ul li.key-li:nth-child(1) div.key-value"
36
+ sku: "div.pdp-general-features ul li.key-li:nth-child(2) div.key-value"
37
+ battery_capacity: "div.pdp-general-features ul li.key-li:nth-child(3) div.key-value"
38
+ ppi: "div.pdp-general-features ul li.key-li:nth-child(4) div.key-value"
39
+ charger_type: "div.pdp-general-features ul li.key-li:nth-child(5) div.key-value"
40
+ flash: "div.pdp-general-features ul li.key-li:nth-child(6) div.key-value"
41
+ screen_size_inches: "div.pdp-general-features ul li.key-li:nth-child(7) div.key-value"
42
+ bluetooth_support: "div.pdp-general-features ul li.key-li:nth-child(8) div.key-value"
43
+ build_type: "div.pdp-general-features ul li.key-li:nth-child(9) div.key-value"
44
+ wifi: "div.pdp-general-features ul li.key-li:nth-child(10) div.key-value"
45
+ sim_type: "div.pdp-general-features ul li.key-li:nth-child(11) div.key-value"
46
+ removable_battery: "div.pdp-general-features ul li.key-li:nth-child(12) div.key-value"
47
+ gps: "div.pdp-general-features ul li.key-li:nth-child(13) div.key-value"
48
+ camera_front_megapixels: "div.pdp-general-features ul li.key-li:nth-child(14) div.key-value"
49
+ fm_radio: "div.pdp-general-features ul li.key-li:nth-child(15) div.key-value"
50
+ nfc: "div.pdp-general-features ul li.key-li:nth-child(16) div.key-value"
51
+ fingerprint_sensor: "div.pdp-general-features ul li.key-li:nth-child(17) div.key-value"
52
+ expandable_storage: "div.pdp-general-features ul li.key-li:nth-child(18) div.key-value"
53
+ memory_card_slot_type: "div.pdp-general-features ul li.key-li:nth-child(19) div.key-value"
54
+ camera_back_megapixels: "div.pdp-general-features ul li.key-li:nth-child(20) div.key-value"
55
+ number_of_cameras: "div.pdp-general-features ul li.key-li:nth-child(21) div.key-value"
56
+ display_protection: "div.pdp-general-features ul li.key-li:nth-child(22) div.key-value"
57
+ model_year: "div.pdp-general-features ul li.key-li:nth-child(23) div.key-value"
58
+ video_resolution: "div.pdp-general-features ul li.key-li:nth-child(24) div.key-value"
59
+ network_connections: "div.pdp-general-features ul li.key-li:nth-child(25) div.key-value"
60
+ operating_system: "div.pdp-general-features ul li.key-li:nth-child(26) div.key-value"
61
+ notch_display: "div.pdp-general-features ul li.key-li:nth-child(27) div.key-value"
62
+ fast_charging: "div.pdp-general-features ul li.key-li:nth-child(28) div.key-value"
63
+ model: "div.pdp-general-features ul li.key-li:nth-child(29) div.key-value"
64
+ headphone_jack: "div.pdp-general-features ul li.key-li:nth-child(30) div.key-value"
65
+ wireless_charging: "div.pdp-general-features ul li.key-li:nth-child(31) div.key-value"
66
+ display_type: "div.pdp-general-features ul li.key-li:nth-child(32) div.key-value"
67
+ processor_type: "div.pdp-general-features ul li.key-li:nth-child(33) div.key-value"
68
+ refresh_rate: "div.pdp-general-features ul li.key-li:nth-child(34) div.key-value"
69
+ number_of_sim_slots: "div.pdp-general-features ul li.key-li:nth-child(35) div.key-value"
70
+ resolution: "div.pdp-general-features ul li.key-li:nth-child(36) div.key-value"
71
+
72
+
tools/__pycache__/functionalities.cpython-310.pyc ADDED
Binary file (5.04 kB). View file
 
tools/functionalities.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import random
3
+ import yaml
4
+ import os
5
+ import re
6
+ import os
7
+
8
class TryExcept:
    """Helpers that turn missing-element failures into "N/A" results.

    Methods expect Playwright element handles (or awaitables resolving to
    them); a None/absent element yields "N/A" instead of raising.
    """

    async def text(self, element):
        """Stripped inner text of *element*, or "N/A" when it is absent."""
        try:
            return (await (await element).inner_text()).strip()
        except AttributeError:
            return "N/A"

    async def attributes(self, element, attr):
        """Value of attribute *attr* on *element*, or "N/A" when absent."""
        try:
            return await (await element).get_attribute(attr)
        except AttributeError:
            return "N/A"

    async def text_all(self, elements):
        """Stripped inner text for every element; "N/A" entries for failures."""
        collected = []
        for item in elements:
            try:
                collected.append((await item.inner_text()).strip())
            except AttributeError:
                collected.append("N/A")
        return collected

    async def extract_key_value(self, element):
        """Return the (.key-title, .key-value) texts of a spec row.

        Falls back to ("N/A", "N/A") when either node is missing.
        """
        try:
            key_node = await element.query_selector('.key-title')
            value_node = await element.query_selector('.key-value')
            if key_node and value_node:
                key = (await key_node.inner_text()).strip()
                value = (await value_node.inner_text()).strip()
                return key, value
            return "N/A", "N/A"
        except AttributeError:
            return "N/A", "N/A"
52
+
53
+
54
def create_path(dir_name):
    """Create *dir_name* inside the current working directory if missing.

    Args:
        dir_name: Name of the directory to create.

    Returns:
        None
    """
    # makedirs(..., exist_ok=True) is race-free, unlike the old
    # os.path.exists() check followed by os.mkdir().
    os.makedirs(os.path.join(os.getcwd(), dir_name), exist_ok=True)
69
+
70
+
71
def verifyDarazURL(url):
    """Check if the URL belongs to a Daraz website.

    Args:
        url: The URL to check.

    Returns:
        bool: True if the URL is NOT a Daraz website, False otherwise.
    """
    # FIX: dots are escaped so strings like "wwwXdaraz" no longer match.
    daraz_pattern = re.search(r"^https://www\.daraz\.(com\.np|lk|pk|com\.bd)/+", url)
    return daraz_pattern is None
87
+
88
+
89
def random_interval(value):
    """Return a random pause length between 2 and value + 1 (inclusive).

    Args:
        value: Upper limit of the range; the result never exceeds value + 1.

    Returns:
        int: random integer in [2, value + 1].

    Raises:
        ValueError: when value < 1 (empty range), same as the original.
    """
    # Equivalent to the old version, which built a throwaway list just to
    # take its length: len(range(value + 1)) == value + 1.
    return random.randint(2, value + 1)
104
+
105
+
106
async def check_domain(url):
    """Check the domain of a URL and return the country it belongs to.

    Args:
        url: The URL to check.

    Returns:
        str | None: Country name (Nepal, Sri Lanka, Bangladesh, Pakistan or
        Myanmar), or None when no known TLD is found. (The old code set
        country = None on a failed match and then crashed with
        KeyError: None on the dict lookup.)
    """
    domain_lists = {
        'np': 'Nepal',
        'lk': 'Sri Lanka',
        'bd': 'Bangladesh',
        'pk': 'Pakistan',
        'mm': 'Myanmar',
    }
    # FIX: escape the dot so e.g. "xnp" no longer counts as a ".np" match.
    pattern = re.search(r"\.(np|bd|lk|pk|mm)", url)
    if pattern is None:
        return None
    return domain_lists[pattern.group(1)]
131
+
132
+
133
def flat(d_lists):
    """Flatten one level of nesting from *d_lists*.

    Args:
        d_lists: A list of lists.

    Returns:
        list: A one-level-flattened copy of the input.
    """
    return list(itertools.chain.from_iterable(d_lists))
145
+
146
+
147
def yamlMe(selectors):
    """Load scrapers/<selectors>.yaml and return its contents as a dict.

    Args:
        selectors: Base name of the YAML file to load (without extension).

    Returns:
        dict: CSS selectors keyed by their data-field names.
    """
    # FIX: os.path.join instead of a hard-coded "\\" separator, so the
    # loader also works on non-Windows hosts.
    with open(os.path.join("scrapers", f"{selectors}.yaml")) as file:
        return yaml.load(file, Loader=yaml.SafeLoader)
161
+
162
+
163
def userAgents():
    """Return one randomly chosen user-agent string.

    Reads tools/user-agents.txt (one agent per line) relative to the current
    working directory.

    Returns:
        str: A randomly chosen user agent string.
    """
    # FIX: portable path join instead of hard-coded Windows "\\" separators.
    path = os.path.join(os.getcwd(), "tools", "user-agents.txt")
    with open(path) as f:
        agents = f.read().split("\n")
    return random.choice(agents)
173
+
tools/user-agents.txt ADDED
The diff for this file is too large to render. See raw diff