File size: 1,973 Bytes
88cb8ab
0167b87
fa68f26
6a48f7d
 
88cb8ab
 
 
 
197e03a
88cb8ab
 
 
 
6a48f7d
 
3bcd8f6
88cb8ab
 
 
 
0167b87
fa68f26
88cb8ab
 
6a48f7d
88cb8ab
fa68f26
0167b87
 
 
 
 
 
 
 
88cb8ab
6a48f7d
 
 
3bcd8f6
6a48f7d
0167b87
6a48f7d
 
 
 
 
 
fa68f26
88cb8ab
 
fa68f26
88cb8ab
 
fa68f26
88cb8ab
0167b87
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from src.utils.tooling import tool
from src.utils.vector_store import chunk_content, load_in_vector_db



@tool
def visit_webpage(url: str) -> str:
    """
    Visits a webpage at the given URL, converts its content to markdown, and
    stores it in the vector knowledge base.

    This tool is useful for extracting information from web pages in a
    structured format after a search.

    Args:
        url (str): The URL of the webpage to visit.

    Returns:
        str: A human-readable status message — success confirmation, a
        forbidden-domain notice, or an error description.

    Raises:
        ImportError: If the required third-party packages are not installed.
    """
    try:
        # Lazy imports so the tool degrades gracefully when optional
        # dependencies are missing (surfaced as a single ImportError below).
        from src.web2llm.app.scraper import scrape_url
        from src.web2llm.app.converter import html_to_markdown
        import requests
        from requests.exceptions import RequestException
        from urllib.parse import urlparse

    except ImportError as e:
        raise ImportError(
            f"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests` : {e}"
        ) from e

    forbidden_domains = ["universetoday.com"]

    # Normalize the host so that mixed-case hosts and the common "www."
    # prefix are also blocked — an exact netloc comparison is trivially
    # bypassed by e.g. "www.universetoday.com".
    domain = urlparse(url).netloc.lower()
    if domain.startswith("www."):
        domain = domain[4:]

    if domain in forbidden_domains:
        return "This domain is forbidden and cannot be accessed, please try another one."

    try:
        # Web2LLM app: fetch the page and strip boilerplate before the
        # HTML → markdown conversion.
        result = scrape_url(url, clean=True)
        markdown_content = html_to_markdown(result["clean_html"])

        # Embed the markdown into the vector store so later retrieval
        # queries can find it.
        load_in_vector_db(
            markdown_content,
            metadatas={
                "title": result["title"],
                "url": url,
            },
        )
        return "The webpage has been successfully visited: content has been vectorized and stored in the knowledge base."

    except requests.exceptions.Timeout:
        # Timeout is a RequestException subclass, so it must be caught
        # first to produce the more specific message.
        return "The request timed out. Please try again later or check the URL."

    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"

    except Exception as e:
        # Last-resort guard: the calling agent expects a string result,
        # never an uncaught exception.
        return f"An unexpected error occurred: {str(e)}"