# ConversAI/src/components/loaders/websiteCrawler.py
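"""Website crawler loader for ConversAI.

Collects same-domain links starting from a seed URL and extracts cleaned page
text for downstream document loading.
"""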
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from src.utils.exceptions import CustomException
from src.utils.functions import cleanText, getConfig
from src.utils.logging import logger

class WebsiteCrawler:
def __init__(self):
"""Initialize the WebsiteCrawler with configuration settings."""
self.config = getConfig(path="config.ini")
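
    # Expected config.ini shape (a sketch -- only the WEBCRAWLER/timeout key,
    # the crawl's time budget in seconds, is read by this class; the value
    # below is an assumed placeholder):
    #
    #   [WEBCRAWLER]
    #   timeout = 30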
    def getLinksFromPage(self, url: str) -> list[str]:
        """
        Extract all same-domain links from a given webpage.

        Args:
            url (str): The URL of the webpage to extract links from.

        Returns:
            list[str]: A deduplicated list of links found on the page.
        """
        # Per-request timeout (10 s is an arbitrary safeguard) so one slow
        # page cannot hang the whole crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        links = []
        for anchor in soup.find_all("a"):
            if "href" not in anchor.attrs:
                continue
            href = anchor.attrs["href"]
            if urlparse(href).netloc == urlparse(url).netloc:
                # Link whose host matches the page's host: keep as-is.
                links.append(href)
            elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                # Relative link: resolve it against the page URL (treated as
                # a directory). External sites and non-HTTP schemes are skipped.
                links.append(urljoin(url + "/", href))
        # Drop fragment links, then deduplicate.
        return list({link for link in links if "#" not in link})

    def getLinks(self, url: str) -> list[str]:
        """
        Fetch and return all unique links reachable from the given URL.

        Crawls the seed page, then each first-level link, stopping once the
        configured WEBCRAWLER timeout elapses.

        Args:
            url (str): The starting URL to fetch links from.

        Returns:
            list[str]: Unique links found, without trailing slashes. Empty on
                failure.
        """
        try:
            logger.info("Fetching links from URL")
            start = time.time()
            links = self.getLinksFromPage(url)
            # Keep the first-level links themselves, not just their children.
            uniqueLinks = set(links)
            for link in links:
                # Stop descending into sub-pages once the time budget is spent.
                if time.time() - start > self.config.getint("WEBCRAWLER", "timeout"):
                    break
                try:
                    uniqueLinks |= set(self.getLinksFromPage(link))
                except requests.RequestException:
                    # Skip sub-pages that fail to load instead of aborting.
                    continue
            # Strip trailing slashes so "/about" and "/about/" collapse.
            return list({link.rstrip("/") for link in uniqueLinks if link})
        except Exception as e:
            logger.error(CustomException(e))
            return []

    def extractTextFromUrl(self, url: str) -> str:
        """
        Extract and clean the text content of a given URL.

        Args:
            url (str): The URL of the webpage to extract text from.

        Returns:
            str: Cleaned text extracted from the webpage.
        """
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return cleanText(text=soup.get_text(separator=" ", strip=True))

    def extractTextFromUrlList(self, urls: list[str]) -> str:
        """
        Extract text from a list of URLs concurrently.

        Args:
            urls (list[str]): A list of URLs to extract text from.

        Returns:
            str: All extracted text combined into a single string. Empty on
                failure.
        """
        try:
            logger.info("Extracting text from URLs")
            # Fetch pages in parallel; executor.map preserves input order.
            with ThreadPoolExecutor() as executor:
                texts = list(executor.map(self.extractTextFromUrl, urls))
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
            return ""
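

# Usage sketch (illustrative only): drives the crawler end to end, assuming
# config.ini defines the WEBCRAWLER/timeout key read above. The seed URL is a
# placeholder, not a value from this project.
if __name__ == "__main__":
    crawler = WebsiteCrawler()
    # Collect every same-domain link reachable within the time budget.
    links = crawler.getLinks(url="https://example.com")
    print(f"Found {len(links)} links")
    # Pull the cleaned text of each page into one string for downstream use.
    combined = crawler.extractTextFromUrlList(urls=links)
    print(combined[:500])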