from src.utils.tooling import tool
from src.utils.vector_store import chunk_content, load_in_vector_db


def visit_webpage(url: str) -> str:
    """
    Visits a webpage at the given URL, converts its content to markdown, and stores it in the vector-store knowledge base.
    This tool is useful for extracting information from web pages in a structured format after a search.

    Args:
        url (str): The URL of the webpage to visit.
    """
    try:
        from src.web2llm.app.scraper import scrape_url
        from src.web2llm.app.converter import html_to_markdown
        import re
        import requests
        from markdownify import markdownify
        from requests.exceptions import RequestException
        from smolagents.utils import truncate_content
        from urllib.parse import urlparse
    except ImportError as e:
        raise ImportError(
            f"You must install the packages `markdownify`, `requests` and `smolagents` to run this tool: for instance run `pip install markdownify requests smolagents`: {e}"
        ) from e
    # Refuse to fetch domains on the block list.
    forbidden_domains = ["universetoday.com"]
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    if domain in forbidden_domains:
        return "This domain is forbidden and cannot be accessed, please try another one."
    try:
        # Web2LLM app: scrape the page, convert the cleaned HTML to markdown,
        # then index the markdown in the vector store along with its title and URL.
        result = scrape_url(url, clean=True)
        markdown_content = html_to_markdown(result["clean_html"])
        load_in_vector_db(
            markdown_content,
            metadatas={
                "title": result["title"],
                "url": url,
            },
        )
        return "The webpage has been successfully visited: content has been vectorized and stored in the knowledge base."
    except requests.exceptions.Timeout:
        return "The request timed out. Please try again later or check the URL."
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"