Spaces:
Sleeping
Sleeping
File size: 1,973 Bytes
88cb8ab 0167b87 fa68f26 6a48f7d 88cb8ab 197e03a 88cb8ab 6a48f7d 3bcd8f6 88cb8ab 0167b87 fa68f26 88cb8ab 6a48f7d 88cb8ab fa68f26 0167b87 88cb8ab 6a48f7d 3bcd8f6 6a48f7d 0167b87 6a48f7d fa68f26 88cb8ab fa68f26 88cb8ab fa68f26 88cb8ab 0167b87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
from src.utils.tooling import tool
from src.utils.vector_store import chunk_content, load_in_vector_db
@tool
def visit_webpage(url: str) -> str:
    """
    Visits a webpage at the given URL and reads its content as a markdown string.
    This tool is useful for extracting information from web pages in a structured format after a search.
    Args:
        url (str): The URL of the webpage to visit.
    """
    # NOTE(review): the docstring wording above is kept as-is because `@tool`
    # presumably exposes it as the tool description — confirm before rewording.
    # What the code below actually does: the page is scraped, converted to
    # markdown, handed to `load_in_vector_db`, and only a short status message
    # (or an error message) is returned to the caller.

    # Dependencies are imported inside the function, as in the original; a
    # missing package is re-raised with an installation hint.
    try:
        from src.web2llm.app.scraper import scrape_url
        from src.web2llm.app.converter import html_to_markdown
        import requests
        # Not used directly here; kept as an install check because the hint
        # below names it (presumably needed by the converter — TODO confirm).
        from markdownify import markdownify  # noqa: F401
        from requests.exceptions import RequestException
        from urllib.parse import urlparse
    except ImportError as e:
        raise ImportError(
            f"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests` : {e}"
        ) from e

    forbidden_domains = ("universetoday.com",)

    # `hostname` is lower-cased and carries no port or credentials, unlike
    # `netloc`; it is None when the URL has no network location.
    hostname = (urlparse(url).hostname or "").rstrip(".")
    # Block the listed domain itself and any of its subdomains (e.g. `www.`).
    if any(
        hostname == domain or hostname.endswith("." + domain)
        for domain in forbidden_domains
    ):
        return "This domain is forbidden and cannot be accessed, please try another one."

    try:
        # Web2LLM app
        result = scrape_url(url, clean=True)
        markdown_content = html_to_markdown(result["clean_html"])
        load_in_vector_db(
            markdown_content,
            metadatas={
                "title": result["title"],
                "url": url,
            },
        )
        return "The webpage has been successfully visited: content has been vectorized and stored in the knowledge base."
    except requests.exceptions.Timeout:
        # Listed before RequestException so timeouts get their own message.
        return "The request timed out. Please try again later or check the URL."
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Best-effort boundary: any other failure is reported as text, not raised.
        return f"An unexpected error occurred: {str(e)}"