Spaces:
Runtime error
Runtime error
| # This is a smolagents tool to fetch HTML content from a URL | |
| from smolagents import tool | |
| import requests | |
| from markdownify import markdownify as md | |
| from bs4 import BeautifulSoup | |
| from common.mylogger import save_file_with_timestamp, mylog | |
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Fetch a web page and return it as Markdown or as raw HTML.

    When Markdown conversion is enabled, <script> and <style> elements are
    removed (tags and their text) before the conversion. For URLs containing
    "wikipedia.org", only the <main id="content"> element is converted when it
    exists; in every other case the whole cleaned page is converted.
    The result is passed to save_file_with_timestamp (with a ".md" or ".html"
    extension) before being returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, return the page converted to
            Markdown; otherwise return the raw, unfiltered HTML.

    Returns:
        str: The Markdown text, or the raw HTML when conversion is disabled.

    Raises:
        Any exception raised by requests.get (e.g. on connection failure or
        after the 30 second timeout) is propagated to the caller.
    """
    response = requests.get(url, timeout=30)

    if not convert_to_markdown:
        content = response.text
    else:
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove script and style elements from the tree so that neither the
        # tags nor their text content reach the Markdown output.
        for tag in soup(["script", "style"]):
            tag.extract()

        # Default to the whole cleaned document; this guarantees that every
        # Markdown path yields a string (never None).
        source = soup
        # For Wikipedia, keep only the main content when it can be located.
        if "wikipedia.org" in url:
            main_content = soup.find("main", {"id": "content"})
            if main_content is not None:
                source = main_content

        # Convert the cleaned tree, not response.text, so the script/style
        # removal above is actually honoured.
        content = md(str(source), strip=['script', 'style'], heading_style="ATX").strip()

    save_file_with_timestamp(content, "webpage", ".md" if convert_to_markdown else ".html")
    return content
| # This tool allows web search on a local SearXNG instance | |
def search_web(query: str, num_results: int = 5) -> list:
    """
    Perform a web search using a local SearXNG instance.

    The query is sent to the SearXNG JSON endpoint on localhost:8888. The
    returned results are ordered by descending score, truncated to
    ``num_results`` entries and reduced to the fields listed below.

    Args:
        query (str): The search query.
        num_results (int): The maximum number of results to return. Values
            below zero are treated as zero.

    Returns:
        list: A list of search results sorted by score with
        {url, title, content, score} for each result. An empty list is
        returned when the server answers with a non-200 status code.

    Raises:
        Any exception raised by requests.get (e.g. connection failure or
        timeout) or by decoding the JSON body is propagated to the caller.
    """
    # Local metasearch engine SearXNG, running on localhost:8888.
    searxng_url = "http://localhost:8888/search"
    params = {"q": query, "format": 'json'}
    # A timeout keeps the caller from hanging forever if the local instance
    # is unresponsive (same limit as fetch_webpage uses).
    response = requests.get(searxng_url, params=params, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    # Keep only the 'results' array of the response.
    results = response.json().get("results", [])

    # Order by score (highest first) so the documented "sorted by score"
    # contract holds; results without a score are ranked as 0. The sort is
    # stable, so entries with equal scores keep the server's order.
    ranked = sorted(results, key=lambda item: item.get("score") or 0, reverse=True)

    # Clamp so a negative num_results cannot slice from the end of the list.
    limit = max(num_results, 0)

    # For each result keep only the url, title, content and score.
    return [
        {
            "url": result.get("url"),
            "title": result.get("title"),
            "content": result.get("content"),
            "score": result.get("score"),
        }
        for result in ranked[:limit]
    ]
if __name__ == "__main__":
    # Manual smoke test of the search tool; failures are reported on stdout
    # instead of aborting, so the second check below still runs.
    try:
        query = "What is the capital of France?"
        print(search_web(query, 3))
    except Exception as exc:
        print(f"An error occurred: {exc}")

    # Manual smoke test of the page fetcher against a Wikipedia article.
    try:
        # Alternative target (not fetched): a YouTube watch page.
        video_id = "L1vXCYZAYYM"  # Replace with your YouTube video ID
        video_url = "https://www.youtube.com/watch?v=" + video_id

        url = "https://en.wikipedia.org/wiki/Malko_Competition"
        page_content = fetch_webpage(url, convert_to_markdown=True)
        # The text is printed as UTF-8 encoded bytes rather than as str.
        encoded_page = page_content.encode("utf-8")
        print(encoded_page)
    except Exception as exc:
        print(f"An error occurred: {exc}")