import asyncio

import aiohttp
import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag


class WebpageCrawler:
    def __init__(self):
        # Tracks every URL already queued so each page is visited only once.
        self.dict_href_links = {}

    async def fetch(self, session: ClientSession, url: str) -> str:
        """Download a page, falling back to latin-1 if UTF-8 decoding fails."""
        async with session.get(url) as response:
            try:
                return await response.text()
            except UnicodeDecodeError:
                return await response.text(encoding="latin1")

    def url_exists(self, url: str) -> bool:
        """Return True if a HEAD request to the URL succeeds with status 200."""
        try:
            response = requests.head(url)
            return response.status_code == 200
        except requests.RequestException:
            return False

    async def get_links(self, session: ClientSession, website_link: str, base_url: str):
        """Collect new, reachable child links found on a single page."""
        html_data = await self.fetch(session, website_link)
        soup = BeautifulSoup(html_data, "html.parser")
        list_links = []
        for link in soup.find_all("a", href=True):
            href = link["href"].strip()
            full_url = urljoin(base_url, href)
            normalized_url = self.normalize_url(full_url)  # fragment removed
            if (
                normalized_url not in self.dict_href_links
                and self.is_child_url(normalized_url, base_url)
                and self.url_exists(normalized_url)
            ):
                self.dict_href_links[normalized_url] = None
                list_links.append(normalized_url)
        return list_links

    async def get_subpage_links(
        self, session: ClientSession, urls: list, base_url: str
    ):
        """Fetch several pages concurrently and flatten their link lists."""
        tasks = [self.get_links(session, url, base_url) for url in urls]
        results = await asyncio.gather(*tasks)
        all_links = [link for sublist in results for link in sublist]
        return all_links

    async def get_all_pages(self, url: str, base_url: str):
        """Breadth-first crawl starting at `url`, restricted to pages under `base_url`."""
        async with aiohttp.ClientSession() as session:
            dict_links = {url: "Not-checked"}
            counter = None
            while counter != 0:
                unchecked_links = [
                    link
                    for link, status in dict_links.items()
                    if status == "Not-checked"
                ]
                if not unchecked_links:
                    break
                new_links = await self.get_subpage_links(
                    session, unchecked_links, base_url
                )
                # Mark the current batch as done, then queue any newly found links.
                for link in unchecked_links:
                    dict_links[link] = "Checked"
                    print(f"Checked: {link}")
                dict_links.update(
                    {
                        link: "Not-checked"
                        for link in new_links
                        if link not in dict_links
                    }
                )
                counter = len(
                    [
                        status
                        for status in dict_links.values()
                        if status == "Not-checked"
                    ]
                )
            checked_urls = [
                url for url, status in dict_links.items() if status == "Checked"
            ]
            return checked_urls

    def is_webpage(self, url: str) -> bool:
        """Return True if the URL serves an HTML document."""
        try:
            response = requests.head(url, allow_redirects=True)
            content_type = response.headers.get("Content-Type", "").lower()
            return "text/html" in content_type
        except requests.RequestException:
            return False

    def clean_url_list(self, urls):
        """Split crawled URLs into non-HTML files and regular webpages."""
        files, webpages = [], []
        for url in urls:
            if self.is_webpage(url):
                webpages.append(url)
            else:
                files.append(url)
        return files, webpages

    def is_child_url(self, url, base_url):
        return url.startswith(base_url)

    def normalize_url(self, url: str):
        # Strip the fragment identifier so anchors on the same page
        # are not treated as separate URLs.
        defragged_url, _ = urldefrag(url)
        return defragged_url
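
# --- Minimal usage sketch (not part of the original file) ---
# Shows one way to drive WebpageCrawler end to end: crawl every page under a
# site root, then split the results into HTML pages and other files.
# The URL below is purely illustrative.
async def main():
    crawler = WebpageCrawler()
    base_url = "https://example.com/docs/"  # hypothetical site root
    pages = await crawler.get_all_pages(base_url, base_url)
    files, webpages = crawler.clean_url_list(pages)
    print(f"Found {len(webpages)} webpages and {len(files)} other files")


if __name__ == "__main__":
    asyncio.run(main())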