# Spaces: No application file
from langchain_community.document_loaders.firecrawl import FireCrawlLoader | |
import os | |
from dotenv import load_dotenv | |
import asyncio | |
from rich.pretty import pprint # noqa | |
from typing import List | |
from langchain_core.documents import Document | |
import re | |
import aiohttp | |
from tenacity import ( | |
retry, | |
stop_after_attempt, | |
wait_exponential, | |
retry_if_exception_type, | |
) | |
import logging | |
# Module-level logger; INFO is enabled so the per-URL progress messages
# written by the scraping functions are not filtered out at this logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# load_dotenv() runs before the key lookup so that a value supplied through
# a dotenv file is visible in the process environment when it is read.
# The key is None when the variable is not set.
load_dotenv()
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY")
# Ordered (compiled pattern, replacement) passes applied by clean_markdown.
# Order matters: images are stripped before links, because the link pattern
# would otherwise match the "[alt](src)" tail of "![alt](src)" and leave a
# stray "!" behind.
_MARKDOWN_CLEANUP_PASSES = (
    # Markdown images: ![alt](src)
    (re.compile(r"\!\[.*?\]\(.*?\)"), ""),
    # HTML comments, including ones that span several lines.
    # NOTE(review): this pass previously had an empty pattern (a no-op);
    # HTML-comment removal is the presumed intent -- confirm.
    (re.compile(r"<!--.*?-->", re.DOTALL), ""),
    # Markdown links, removed together with their visible text: [text](href)
    (re.compile(r"\[.*?\]\(.*?\)"), ""),
    # Re-join a word hyphenated across a line break: "exam-\nple" -> "example"
    (re.compile(r"(\w)-\n(\w)"), r"\1\2"),
    # Collapse three or more consecutive newlines into one blank line
    (re.compile(r"\n\n\n+"), "\n\n"),
    # Shrink four or more repeats of one non-alphanumeric, non-whitespace
    # character (e.g. "======") down to two
    (re.compile(r"([^a-zA-Z0-9\s])\1{3,}"), r"\1\1"),
    # Drop runs of characters in U+1F300..U+1F9FF directly followed by a
    # blank line
    (re.compile(r"[\U0001F300-\U0001F9FF]+\n\n"), ""),
    # Drop a paragraph that consists of a lone "/" or "#"
    (re.compile(r"\n\n[/#]\n\n"), "\n\n"),
)


def _first_present(mapping, *keys):
    """Return the value of the first of *keys* found in *mapping*, else None."""
    for key in keys:
        if key in mapping:
            return mapping[key]
    return None


def clean_markdown(document: Document) -> Document:
    """Strip markup noise from a scraped document and reduce its metadata.

    The document is modified in place and also returned.

    Args:
        document: Object exposing ``page_content`` (markdown text) and
            ``metadata`` (a mapping of scraped page metadata).

    Returns:
        The same ``document``. Its ``page_content`` has had every clean-up
        pass applied and surrounding whitespace stripped, and its
        ``metadata`` is replaced by a dict with exactly the keys ``url``,
        ``title`` and ``description``. Each is read from the ``og:*`` key,
        falling back to the camel-case ``og*`` key, and is ``None`` when
        neither key is present.

    Raises:
        Exception: Any error raised while cleaning is logged and re-raised
            unchanged; in that case the document is left unmodified.
    """
    raw_content = document.page_content
    scraped_meta = document.metadata
    metadata = {
        "url": _first_present(scraped_meta, "og:url", "ogUrl"),
        "title": _first_present(scraped_meta, "og:title", "ogTitle"),
        "description": _first_present(
            scraped_meta, "og:description", "ogDescription"
        ),
    }

    try:
        cleaned_content = raw_content
        for pattern, replacement in _MARKDOWN_CLEANUP_PASSES:
            cleaned_content = pattern.sub(replacement, cleaned_content)
        cleaned_content = cleaned_content.strip()
    except Exception as e:
        # logger.exception keeps the traceback; bare raise preserves the
        # original exception and its context for the caller.
        logger.exception("Error cleaning markdown: %s", e)
        raise

    # Assign only after cleaning succeeded so a failure never leaves the
    # document half-updated.
    document.page_content = cleaned_content
    document.metadata = metadata
    return document
async def scrape_website(url: str) -> Document:
    """Scrape one URL through FireCrawl and return its cleaned document.

    Args:
        url: Address of the page to scrape.

    Returns:
        The first document the loader produced for ``url``, after it has
        been passed through ``clean_markdown``.

    Raises:
        IndexError: If the loader returned no documents for ``url``.
        Exception: Any other error from the loader or from cleaning is
            logged with its traceback and re-raised unchanged.
    """
    logger.info("Scraping url : %s", url)
    try:
        loader = FireCrawlLoader(
            url=url,
            api_key=FIRECRAWL_API_KEY,
            mode="scrape",
            params={
                "formats": ["markdown"],
                "onlyMainContent": True,
                "removeBase64Images": True,
                # NOTE(review): this appears to disable TLS certificate
                # verification for the scrape -- confirm it is intentional.
                "skipTlsVerification": True,
            },
        )
        documents = await loader.aload()
        if not documents:
            # Same exception type that indexing an empty list would raise,
            # but with a message that names the URL that produced nothing.
            raise IndexError(f"FireCrawl returned no documents for {url}")
        return clean_markdown(documents[0])
    except Exception as e:
        logger.exception("Error scraping %s: %s", url, e)
        raise
async def scrape_main(urls: List[str]) -> List[Document]:
    """Scrape all *urls* concurrently and return the pages that succeeded.

    Args:
        urls: Addresses to scrape; one ``scrape_website`` call is started
            for each.

    Returns:
        The successfully scraped documents, in the same relative order as
        ``urls``. URLs whose scrape raised are omitted, so the result may
        be shorter than ``urls`` (and is empty when ``urls`` is empty).
    """
    # return_exceptions=True keeps one failing URL from cancelling the rest:
    # gather() hands failures back as exception objects in the result list.
    outcomes = await asyncio.gather(
        *(scrape_website(url) for url in urls),
        return_exceptions=True,
    )
    # Keep real results only. The previous filter
    # (`is not None or isinstance(..., Exception)`) was true for every
    # non-None value, so exception objects leaked through to callers.
    # BaseException is used so a returned CancelledError is dropped as well.
    return [
        outcome
        for outcome in outcomes
        if outcome is not None and not isinstance(outcome, BaseException)
    ]
if __name__ == "__main__":
    # Manual smoke run: scrape two sample pages and pretty-print the result.
    sample_urls = [
        "https://www.artisan.co",
        "https://www.artisan.co/about",
    ]
    pprint(asyncio.run(scrape_main(sample_urls)))