AutoGPT

Sleeping

App Files Files Community

AutoGPT / autogpt /commands /web_requests.py

myclassunil

Duplicate from aliabid94/AutoGPT

d6152d3 about 1 year ago

raw

history blame contribute delete

No virus

5.11 kB

	"""Browse a webpage and summarize it using the LLM model"""
	from __future__ import annotations

	from urllib.parse import urljoin, urlparse

	import requests
	from bs4 import BeautifulSoup
	from requests import Response
	from requests.compat import urljoin

	from autogpt.config import Config
	from autogpt.memory import get_memory
	from autogpt.processing.html import extract_hyperlinks, format_hyperlinks

	# Project configuration object; read below for the User-Agent header value.
	CFG = Config()
	# Built from the configuration (presumably the agent's memory backend --
	# confirm in autogpt.memory). No definition visible in this part of the
	# module references it.
	memory = get_memory(CFG)

	# A single requests session created at import time and used by
	# get_response(); every request it sends carries the configured User-Agent.
	session = requests.Session()
	session.headers.update({"User-Agent": CFG.user_agent})


	def is_valid_url(url: str) -> bool:
	    """Report whether a URL carries both a scheme and a network location.

	    Args:
	        url (str): The URL to check

	    Returns:
	        bool: True when parsing succeeds and both the scheme and the netloc
	            are non-empty, False otherwise (including when parsing fails)
	    """
	    try:
	        parts = urlparse(url)
	    except ValueError:
	        # urlparse rejects some malformed inputs, e.g. an unclosed "[".
	        return False
	    return bool(parts.scheme and parts.netloc)


	def sanitize_url(url: str) -> str:
	    """Sanitize the URL by dropping its params, query string and fragment.

	    The result always points at the same scheme and host as the input.

	    Args:
	        url (str): The URL to sanitize

	    Returns:
	        str: The sanitized URL
	    """
	    parsed = urlparse(url)
	    if parsed.path.startswith("//"):
	        # urljoin() would read a path starting with "//" as a network-path
	        # reference and replace the host with the first path segment
	        # ("http://a.com//localhost/x" -> "http://localhost/x"). Rebuild the
	        # URL from its parsed parts instead so the host cannot change.
	        return parsed._replace(params="", query="", fragment="").geturl()
	    return urljoin(url, parsed.path)


	def check_local_file_access(url: str) -> bool:
	    """Check if the URL points at a local file or at this machine.

	    Two checks are applied. The first is a case-insensitive prefix match
	    against a fixed list of local URL spellings. The second parses the URL
	    and inspects its host, which also catches spellings the prefix list
	    cannot express: mixed-case "LOCALHOST", credentials before the host
	    ("http://user@127.0.0.1/"), IPv6 loopback ("http://[::1]/") and any
	    other loopback or unspecified IP address.

	    Args:
	        url (str): The URL to check

	    Returns:
	        bool: True if the URL is considered local, False otherwise
	    """
	    # Imported here so the block does not depend on new file-level imports.
	    import ipaddress
	    import socket

	    # Each entry also covers its trailing-slash and longer variants.
	    local_prefixes = (
	        "file:///",
	        "file://localhost",
	        "http://localhost",
	        "https://localhost",
	        "http://2130706433",
	        "https://2130706433",
	        "http://127.0.0.1",
	        "https://127.0.0.1",
	        "http://0.0.0.0",
	        "https://0.0.0.0",
	        "http://0000",
	        "https://0000",
	    )
	    lowered = url.strip().lower()
	    if lowered.startswith(local_prefixes):
	        return True

	    try:
	        host = urlparse(lowered).hostname
	    except ValueError:
	        # Unparseable URL: nothing more can be said about its host.
	        return False
	    if not host:
	        return False

	    host = host.rstrip(".")  # "localhost." names the same host
	    if host == "localhost" or host.endswith(".localhost"):
	        return True

	    try:
	        address = ipaddress.ip_address(host)
	    except ValueError:
	        try:
	            # inet_aton also understands legacy IPv4 spellings such as a
	            # bare integer; it performs no name resolution.
	            address = ipaddress.ip_address(socket.inet_aton(host))
	        except (OSError, ValueError):
	            # An ordinary host name, not an IP literal.
	            return False

	    # Judge an IPv4-mapped IPv6 address by the IPv4 address it wraps.
	    mapped = getattr(address, "ipv4_mapped", None)
	    if mapped is not None:
	        address = mapped
	    return address.is_loopback or address.is_unspecified


	def get_response(
	    url: str, timeout: int = 10
	) -> tuple[None, str] | tuple[Response, None]:
	    """Get the response from a URL

	    Validation failures and request failures are not propagated to the
	    caller; they are converted into an "Error: ..." message instead.

	    Args:
	        url (str): The URL to get the response from
	        timeout (int): The timeout passed to the HTTP request

	    Returns:
	        tuple[None, str] | tuple[Response, None]: (response, None) on
	            success, or (None, error message) when the URL is rejected, the
	            request fails, or the server answers with a status code >= 400
	    """
	    try:
	        # Restrict access to local files
	        if check_local_file_access(url):
	            raise ValueError("Access to local files is restricted")

	        # Most basic check if the URL is valid:
	        if not url.startswith(("http://", "https://")):
	            raise ValueError("Invalid URL format")

	        sanitized_url = sanitize_url(url)

	        # Sanitizing rewrites the URL, and the rewritten form is the one that
	        # is actually requested, so it has to pass the local check as well.
	        if check_local_file_access(sanitized_url):
	            raise ValueError("Access to local files is restricted")

	        # NOTE(review): requests follows redirects by default, so a remote
	        # server could presumably still redirect to a local address --
	        # confirm whether redirects need to be validated too.
	        response = session.get(sanitized_url, timeout=timeout)

	        # Check if the response contains an HTTP error
	        if response.status_code >= 400:
	            return None, f"Error: HTTP {response.status_code} error"

	        return response, None
	    except ValueError as ve:
	        # Handle invalid or restricted URLs
	        return None, f"Error: {ve}"
	    except requests.exceptions.RequestException as exc:
	        # Handle exceptions related to the HTTP request
	        # (e.g., connection errors, timeouts, etc.)
	        return None, f"Error: {exc}"


	def scrape_text(url: str) -> str:
	    """Scrape text from a webpage

	    Args:
	        url (str): The URL to scrape text from

	    Returns:
	        str: The visible text of the page with one phrase per line, or an
	            "Error: ..." message when the page could not be fetched
	    """
	    response, error_message = get_response(url)
	    if error_message:
	        return error_message
	    if not response:
	        return "Error: Could not get response"

	    soup = BeautifulSoup(response.text, "html.parser")

	    # Script and style elements hold no readable text.
	    for script in soup(["script", "style"]):
	        script.extract()

	    # Phrases are separated by runs of two spaces. Splitting on a single
	    # space would put every word on its own line.
	    phrase_separator = "  "

	    text = soup.get_text()
	    lines = (line.strip() for line in text.splitlines())
	    chunks = (
	        phrase.strip() for line in lines for phrase in line.split(phrase_separator)
	    )
	    # Drop the empty chunks left behind by blank lines and extra spacing.
	    return "\n".join(chunk for chunk in chunks if chunk)


	def scrape_links(url: str) -> str | list[str]:
	    """Scrape links from a webpage

	    Args:
	        url (str): The URL to scrape links from

	    Returns:
	        str | list[str]: The value returned by format_hyperlinks() for the
	            links found on the page, or an "Error: ..." message when the
	            page could not be fetched
	    """
	    response, error_message = get_response(url)
	    if error_message:
	        return error_message
	    if not response:
	        return "Error: Could not get response"

	    soup = BeautifulSoup(response.text, "html.parser")

	    # Remove script and style elements before the links are collected.
	    for script in soup(["script", "style"]):
	        script.extract()

	    # The page URL is passed along with the parsed document (presumably so
	    # relative links can be resolved -- confirm in autogpt.processing.html).
	    hyperlinks = extract_hyperlinks(soup, url)

	    return format_hyperlinks(hyperlinks)


	def create_message(chunk, question):
	    """Build the user-role chat message asking a question about a text chunk.

	    The prompt quotes the chunk, asks for an answer to the question, and
	    requests a summary of the text when the question cannot be answered
	    from it.
	    """
	    prompt = (
	        f'"""{chunk}""" Using the above text, answer the following'
	        f' question: "{question}" -- if the question cannot be answered using the'
	        " text, summarize the text."
	    )
	    return {"role": "user", "content": prompt}