import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from src.utils.exceptions import CustomException
from src.utils.functions import cleanText, getConfig
from src.utils.logging import logger


class WebsiteCrawler:
    """Crawl same-domain links from a seed URL and extract page text."""

    def __init__(self):
        """Initialize the WebsiteCrawler with configuration settings."""
        self.config = getConfig(path="config.ini")
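
    # Assumed config shape, inferred from the getint call in getLinks below:
    # getConfig is expected to return a configparser-style object, and
    # config.ini to define a [WEBCRAWLER] section with an integer crawl
    # budget in seconds, e.g.:
    #
    #   [WEBCRAWLER]
    #   timeout = 30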

    def getLinksFromPage(self, url: str) -> list[str]:
        """
        Extract all valid same-domain links from a given webpage.

        Args:
            url (str): The URL of the webpage to extract links from.

        Returns:
            list[str]: A deduplicated list of links found on the page.
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        links = []
        for anchor in soup.find_all("a"):
            if "href" in anchor.attrs:
                href = anchor.attrs["href"]
                # Keep absolute links only when they stay on the same domain.
                if urlparse(href).netloc == urlparse(url).netloc:
                    links.append(href)
                # Resolve relative links against the page URL; skip
                # protocol-relative URLs and non-navigational schemes.
                elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                    links.append(urljoin(url + "/", href))
        # Drop in-page fragment links, then deduplicate.
        links = [link for link in links if "#" not in link]
        return list(set(links))

    def getLinks(self, url: str) -> list[str]:
        """
        Fetch and return all unique links found one hop from the given URL.

        Args:
            url (str): The starting URL to fetch links from.

        Returns:
            list[str]: A list of unique links with trailing slashes stripped.
        """
        try:
            logger.info("Fetching links from URL")
            start = time.time()
            links = self.getLinksFromPage(url)
            uniqueLinks = set()
            for link in links:
                # Stop crawling once the configured time budget is exhausted.
                if time.time() - start > self.config.getint("WEBCRAWLER", "timeout"):
                    break
                uniqueLinks |= set(self.getLinksFromPage(link))
            # Normalize trailing slashes so duplicates collapse to one entry.
            return list({link.rstrip("/") for link in uniqueLinks})
        except Exception as e:
            logger.error(CustomException(e))
            return []

    def extractTextFromUrl(self, url: str) -> str:
        """
        Extract and clean the text content of a given URL.

        Args:
            url (str): The URL of the webpage to extract text from.

        Returns:
            str: Cleaned text extracted from the webpage.
        """
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Flatten the page to visible text before cleaning it.
        return cleanText(text=soup.get_text(separator=" ", strip=True))

    def extractTextFromUrlList(self, urls: list[str]) -> str:
        """
        Extract text from a list of URLs concurrently.

        Args:
            urls (list[str]): A list of URLs to extract text from.

        Returns:
            str: All extracted text combined into a single string.
        """
        try:
            logger.info("Extracting text from URLs")
            # Fetch pages in parallel; map preserves the order of the input.
            with ThreadPoolExecutor() as executor:
                texts = list(executor.map(self.extractTextFromUrl, urls))
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
            return ""
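

# Minimal usage sketch, not part of the original module: it assumes
# config.ini exists with the [WEBCRAWLER] timeout described above, and uses
# a placeholder seed URL.
if __name__ == "__main__":
    crawler = WebsiteCrawler()
    links = crawler.getLinks("https://example.com")  # hypothetical seed URL
    print(f"Found {len(links)} links")
    # Combine the visible text of every discovered page into one corpus.
    corpus = crawler.extractTextFromUrlList(links)
    print(corpus[:500])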