"""Web tools: fetch a webpage as markdown/HTML and search via a local SearXNG instance."""
# this is a smolagents tool to fetch html content from a url
from smolagents import tool | |
import requests | |
from markdownify import markdownify as md | |
from bs4 import BeautifulSoup | |
from common.mylogger import save_file_with_timestamp, mylog | |
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Fetch the content of a given URL.

    If markdown conversion is enabled, script and style tags are removed and
    the text content is returned as markdown; otherwise the raw unfiltered
    HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to
            Markdown format; otherwise return the raw HTML.

    Returns:
        str: The page content (markdown or raw HTML).

    Raises:
        requests.HTTPError: If the server responds with an error status code.
    """
    response = requests.get(url, timeout=30)
    # Fail fast on HTTP errors rather than silently converting an error page.
    response.raise_for_status()
    if convert_to_markdown:
        soup = BeautifulSoup(response.text, "html.parser")
        # remove script and style tags before conversion
        for tag in soup(["script", "style"]):
            tag.extract()
        # For wikipedia, keep only the main content when it can be located.
        main_content = None
        if "wikipedia.org" in url:
            main_content = soup.find("main", {"id": "content"})
        if main_content is not None:
            html = str(main_content)
        else:
            # BUG FIX: previously a wikipedia page without <main id="content">
            # left content as None; now we fall back to the full page.
            html = response.text
        content = md(html, strip=['script', 'style'], heading_style="ATX").strip()
    else:
        content = response.text
    # persist a timestamped copy of what was fetched, for debugging/auditing
    save_file_with_timestamp(content, "webpage", ".md" if convert_to_markdown else ".html")
    return content
# this tool allows web search on a local SearXNG instance
def search_web(query: str, num_results: int = 5) -> list:
    """
    Perform a web search using a local SearXNG instance.

    Args:
        query (str): The search query.
        num_results (int): The number of results to return.

    Returns:
        list: A list of search results sorted by score, with
            {url, title, content, score} for each result. Empty list on
            a non-200 response.
    """
    # local metasearch engine searxng, running on localhost:8888
    searxng_url = "http://localhost:8888/search"
    params = {"q": query, "format": 'json'}
    # BUG FIX: added a timeout so a hung SearXNG instance cannot block
    # forever (consistent with fetch_webpage).
    response = requests.get(searxng_url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []
    # keep only the 'results' array, truncated to the first num_results
    results = response.json().get("results", [])[:num_results]
    # for each result keep only the url, title, content and score
    return [
        {
            "url": result.get("url"),
            "title": result.get("title"),
            "content": result.get("content"),
            "score": result.get("score"),
        }
        for result in results
    ]
if __name__ == "__main__":
    # Smoke-test the web search tool (requires a local SearXNG on :8888).
    try:
        query = "What is the capital of France?"
        results = search_web(query, 3)
        print(results)
    except Exception as e:
        print(f"An error occurred: {e}")

    # Smoke-test the webpage fetcher against a known wikipedia article.
    # (Removed unused video_id/video_url locals and commented-out dead code.)
    try:
        url = "https://en.wikipedia.org/wiki/Malko_Competition"
        page_content = fetch_webpage(url, convert_to_markdown=True)
        # encode to avoid console codec errors on non-ASCII characters
        print(page_content.encode("utf-8"))
    except Exception as e:
        print(f"An error occurred: {e}")