| | import socket |
| | import urllib.parse |
| | import validators |
| | from typing import Union, Sequence, Iterator |
| |
|
| | from langchain_community.document_loaders import ( |
| | WebBaseLoader, |
| | ) |
| | from langchain_core.documents import Document |
| |
|
| |
|
| | from open_webui.constants import ERROR_MESSAGES |
| | from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH |
| | from open_webui.env import SRC_LOG_LEVELS |
| |
|
| | import logging |
| |
|
| | log = logging.getLogger(__name__) |
| | log.setLevel(SRC_LOG_LEVELS["RAG"]) |
| |
|
| |
|
def validate_url(url: Union[str, Sequence[str]]):
    """Validate a URL, or every URL in a sequence, before it is fetched.

    Raises ValueError for a syntactically invalid URL and, when local web
    fetching is disabled, for any URL whose hostname resolves to a private
    IPv4/IPv6 address (SSRF guard). Returns True when validation passes
    and False for unsupported input types.
    """
    if isinstance(url, str):
        # validators.url returns a ValidationError instance on failure.
        if isinstance(validators.url(url), validators.ValidationError):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        if not ENABLE_RAG_LOCAL_WEB_FETCH:
            # Local fetching is disallowed: resolve the hostname and
            # reject anything that maps to a private address range.
            hostname = urllib.parse.urlparse(url).hostname
            ipv4_list, ipv6_list = resolve_hostname(hostname)
            for address in ipv4_list:
                if validators.ipv4(address, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
            for address in ipv6_list:
                if validators.ipv6(address, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
        return True
    if isinstance(url, Sequence):
        # Validate every member; an invalid one raises before we return.
        return all(validate_url(single) for single in url)
    return False
| |
|
| |
|
def resolve_hostname(hostname):
    """Resolve *hostname* via DNS and split the results by address family.

    Returns a ``(ipv4_addresses, ipv6_addresses)`` tuple of string lists,
    in the order the resolver produced them (duplicates are kept).
    """
    ipv4_addresses = []
    ipv6_addresses = []
    # getaddrinfo yields (family, type, proto, canonname, sockaddr) tuples;
    # the address string is the first element of sockaddr.
    for family, _type, _proto, _canon, sockaddr in socket.getaddrinfo(hostname, None):
        if family == socket.AF_INET:
            ipv4_addresses.append(sockaddr[0])
        elif family == socket.AF_INET6:
            ipv6_addresses.append(sockaddr[0])
    return ipv4_addresses, ipv6_addresses
| |
|
| |
|
class SafeWebBaseLoader(WebBaseLoader):
    """WebBaseLoader with enhanced error handling for URLs."""

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path with error handling.

        Each URL is scraped independently; a failure is logged and the
        loader moves on to the next URL instead of aborting the batch.
        """
        for path in self.web_paths:
            try:
                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
                text = soup.get_text(**self.bs_get_text_kwargs)

                # Collect page metadata; a key is added only when the
                # corresponding tag is present (and non-empty, per bs4
                # Tag truthiness).
                metadata = {"source": path}
                title_tag = soup.find("title")
                if title_tag:
                    metadata["title"] = title_tag.get_text()
                description_tag = soup.find("meta", attrs={"name": "description"})
                if description_tag:
                    metadata["description"] = description_tag.get(
                        "content", "No description found."
                    )
                html_tag = soup.find("html")
                if html_tag:
                    metadata["language"] = html_tag.get("lang", "No language found.")

                yield Document(page_content=text, metadata=metadata)
            except Exception as e:
                # Best-effort loading: record the failure and continue.
                log.error(f"Error loading {path}: {e}")
| |
|
| |
|
def get_web_loader(
    url: Union[str, Sequence[str]],
    verify_ssl: bool = True,
    requests_per_second: int = 2,
):
    """Return a SafeWebBaseLoader for *url* after validating it.

    Raises ValueError when the URL (or any URL in the sequence) fails
    validation — see validate_url for the exact rules.
    """
    # Reject invalid or (when configured) private-network URLs up front.
    if not validate_url(url):
        raise ValueError(ERROR_MESSAGES.INVALID_URL)

    return SafeWebBaseLoader(
        url,
        verify_ssl=verify_ssl,
        requests_per_second=requests_per_second,
        continue_on_failure=True,
    )
| |
|