Spaces:

AbhishekRP2002
/

chainlit-artisan-chatbot

No application file

chainlit-artisan-chatbot / src /web_scrape.py

Abhishek Ranjan

feat: implement rag chatbot for artisan

7f8188c 7 months ago

3.2 kB

	from langchain_community.document_loaders.firecrawl import FireCrawlLoader
	import os
	from dotenv import load_dotenv
	import asyncio
	from rich.pretty import pprint # noqa
	from typing import List
	from langchain_core.documents import Document
	import re
	import aiohttp
	from tenacity import (
	retry,
	stop_after_attempt,
	wait_exponential,
	retry_if_exception_type,
	)
	import logging


	# Load variables from a .env file (python-dotenv) into the process environment
	# before the lookup below reads them.
	load_dotenv()

	# API key handed to FireCrawlLoader in scrape_website; None when the variable
	# is not set.
	FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

	# Module-level logger. The level is set to INFO so the per-URL
	# "Scraping url" messages are not dropped by this logger's own level filter.
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.INFO)


	def clean_markdown(document: "Document") -> "Document":
	    """Strip markdown noise from a scraped document and reduce its metadata.

	    The page content is cleaned, in order, by:

	    1. removing image embeds of the form ``![alt](src)``;
	    2. removing inline links of the form ``[text](href)`` (the link text is
	       dropped together with the target);
	    3. re-joining words that were hyphenated across a line break;
	    4. collapsing runs of three or more newlines into one blank line;
	    5. shortening runs of four or more identical punctuation characters to
	       two;
	    6. dropping emoji-only runs that are followed by a blank line;
	    7. dropping lines that consist of a single ``/`` or ``#``;
	    8. stripping leading and trailing whitespace.

	    The metadata is replaced by a dict holding only ``url``, ``title`` and
	    ``description``, each read from the ``og:*`` key with the camelCase
	    ``og*`` key as a fallback (``None`` when neither is present).

	    Args:
	        document: Object exposing ``page_content`` (markdown text) and a
	            ``metadata`` mapping. It is modified in place.

	    Returns:
	        The same ``document`` object with cleaned content and reduced metadata.

	    Raises:
	        Exception: Any error raised while applying the cleanup rules (for
	            example ``TypeError`` when ``page_content`` is not a string) is
	            logged and re-raised unchanged.
	    """
	    raw_content = document.page_content
	    source_meta = document.metadata
	    metadata = {
	        "url": source_meta.get("og:url", source_meta.get("ogUrl", None)),
	        "title": source_meta.get("og:title", source_meta.get("ogTitle", None)),
	        "description": source_meta.get(
	            "og:description", source_meta.get("ogDescription", None)
	        ),
	    }

	    # (pattern, replacement) pairs applied in order. The image rule must run
	    # before the link rule: an image is a link prefixed with "!", and a linked
	    # image such as [![alt](src)](href) only collapses fully in that order.
	    # Bracket contents are matched with negated classes so a match can never
	    # run past the closing "]" or ")" and swallow unrelated text.
	    cleanup_rules = (
	        (r"!\[[^\]]*\]\([^)]*\)", ""),
	        (r"\[[^\]]*\]\([^)]*\)", ""),
	        (r"(\w)-\n(\w)", r"\1\2"),
	        (r"\n\n\n+", "\n\n"),
	        (r"([^a-zA-Z0-9\s])\1{3,}", r"\1\1"),
	        (r"[\U0001F300-\U0001F9FF]+\n\n", ""),
	        (r"\n\n[/#]\n\n", "\n\n"),
	    )

	    try:
	        cleaned_content = raw_content
	        for pattern, replacement in cleanup_rules:
	            cleaned_content = re.sub(pattern, replacement, cleaned_content)
	        cleaned_content = cleaned_content.strip()
	    except Exception as e:
	        logger.error(f"Error cleaning markdown: {e}")
	        # Bare raise keeps the original traceback intact.
	        raise

	    document.page_content = cleaned_content
	    document.metadata = metadata

	    return document


	@retry(
	    stop=stop_after_attempt(3),
	    wait=wait_exponential(multiplier=1, min=4, max=15),
	    retry=retry_if_exception_type(
	        (Exception, asyncio.TimeoutError, aiohttp.ClientError)
	    ),
	    retry_error_callback=lambda retry_state: None,
	)
	async def scrape_website(url: str):
	    """Scrape one URL through FireCrawl and return its cleaned document.

	    The call is wrapped by tenacity: any ``Exception`` triggers a retry, at
	    most three attempts are made with an exponential wait bounded by 4 and
	    15, and once the attempts are exhausted the retry-error callback makes
	    the call evaluate to ``None`` instead of raising.

	    Args:
	        url: Address of the page to scrape.

	    Returns:
	        The first document produced by the loader after ``clean_markdown``
	        has been applied, or ``None`` when every attempt failed.
	    """
	    logger.info(f"Scraping url : {url}")

	    # Options forwarded to FireCrawl for the scrape request.
	    # NOTE(review): "skipTlsVerification" turns off certificate checks on the
	    # scraped site; confirm this is intentional.
	    scrape_params = {
	        "formats": ["markdown"],
	        "onlyMainContent": True,
	        "removeBase64Images": True,
	        "skipTlsVerification": True,
	    }

	    try:
	        loader = FireCrawlLoader(
	            url=url,
	            api_key=FIRECRAWL_API_KEY,
	            mode="scrape",
	            params=scrape_params,
	        )
	        loaded_documents = await loader.aload()
	        # Only the first document is used; an empty result raises IndexError
	        # here and is handled like any other failed attempt.
	        first_document = loaded_documents[0]
	        return clean_markdown(first_document)
	    except Exception as e:
	        # Log, then re-raise so the retry decorator sees the failure.
	        logger.error(f"Error scraping {url}: {e}")
	        raise e


	async def scrape_main(urls: List[str]) -> List["Document"]:
	    """Scrape several URLs concurrently and return the documents that succeeded.

	    Every URL is passed to ``scrape_website`` and all calls are awaited
	    together. A URL contributes nothing to the result when its scrape
	    evaluated to ``None`` or when ``asyncio.gather`` captured an exception
	    for it; captured exceptions are logged rather than returned, so the
	    result contains documents only.

	    Args:
	        urls: Addresses to scrape. The iterable is materialized once.

	    Returns:
	        The successfully scraped documents, in the order of ``urls``. Empty
	        when ``urls`` is empty or every scrape failed.
	    """
	    # Materialize so the URLs can be paired with their outcomes below even
	    # when a one-shot iterable is passed in.
	    url_list = list(urls)
	    outcomes = await asyncio.gather(
	        *(scrape_website(url) for url in url_list), return_exceptions=True
	    )

	    documents = []
	    for url, outcome in zip(url_list, outcomes):
	        # return_exceptions=True hands failures back as values; BaseException
	        # also covers asyncio.CancelledError, which is not an Exception.
	        if isinstance(outcome, BaseException):
	            logger.error(f"Error scraping {url}: {outcome}")
	            continue
	        if outcome is None:
	            continue
	        documents.append(outcome)
	    return documents


	if __name__ == "__main__":
	    # Manual smoke run: scrape two pages and pretty-print whatever came back.
	    demo_urls = [
	        "https://www.artisan.co",
	        "https://www.artisan.co/about",
	    ]
	    scraped = asyncio.run(scrape_main(demo_urls))
	    pprint(scraped)