from src.utils.tooling import tool
from src.utils.vector_store import chunk_content, load_in_vector_db


def visit_webpage(url: str) -> str:
    """
    Visits a webpage at the given URL, converts its content to markdown, and stores it in the vector-store knowledge base.
    This tool is useful for extracting information from web pages in a structured format after a search.

    Args:
        url (str): The URL of the webpage to visit.
    """
    try:
        from src.web2llm.app.scraper import scrape_url
        from src.web2llm.app.converter import html_to_markdown
        import re
        import requests
        from markdownify import markdownify
        from requests.exceptions import RequestException
        from smolagents.utils import truncate_content
        from urllib.parse import urlparse
    except ImportError as e:
        raise ImportError(
            f"You must install the packages `markdownify`, `requests` and `smolagents` to run this tool: for instance run `pip install markdownify requests smolagents`: {e}"
        ) from e
    # Refuse to fetch domains on the block list.
    forbidden_domains = ["universetoday.com"]
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    if domain in forbidden_domains:
        return "This domain is forbidden and cannot be accessed, please try another one."
    try:
        # Web2LLM app: scrape the page, convert the cleaned HTML to markdown,
        # then index the markdown in the vector store along with its title and URL.
        result = scrape_url(url, clean=True)
        markdown_content = html_to_markdown(result["clean_html"])
        load_in_vector_db(
            markdown_content,
            metadatas={
                "title": result["title"],
                "url": url,
            },
        )
        return "The webpage has been successfully visited: content has been vectorized and stored in the knowledge base."
    except requests.exceptions.Timeout:
        return "The request timed out. Please try again later or check the URL."
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"