| | import socket |
| | import urllib.parse |
| | import validators |
| | from typing import Union, Sequence, Iterator |
| |
|
| | from langchain_community.document_loaders import ( |
| | WebBaseLoader, |
| | ) |
| | from langchain_core.documents import Document |
| |
|
| |
|
| | from open_webui.constants import ERROR_MESSAGES |
| | from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH |
| | from open_webui.env import SRC_LOG_LEVELS |
| |
|
| | import logging |
| |
|
| | log = logging.getLogger(__name__) |
| | log.setLevel(SRC_LOG_LEVELS["RAG"]) |
| |
|
| |
|
def validate_url(url: Union[str, Sequence[str]]):
    """Validate a URL, or every URL in a sequence, before it is fetched.

    Raises ValueError for a syntactically invalid URL and, when local web
    fetching is disabled, for any URL whose hostname resolves to a private
    IPv4/IPv6 address (SSRF guard). Returns True when validation passes
    and False for unsupported input types.
    """
    if isinstance(url, str):
        # validators.url returns a ValidationError instance on failure.
        if isinstance(validators.url(url), validators.ValidationError):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        if not ENABLE_RAG_LOCAL_WEB_FETCH:
            # Local fetching is disallowed: resolve the hostname and
            # reject anything that maps to a private address range.
            hostname = urllib.parse.urlparse(url).hostname
            ipv4_list, ipv6_list = resolve_hostname(hostname)
            for address in ipv4_list:
                if validators.ipv4(address, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
            for address in ipv6_list:
                if validators.ipv6(address, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
        return True
    if isinstance(url, Sequence):
        # Validate every member; an invalid one raises before we return.
        return all(validate_url(single) for single in url)
    return False
| |
|
| |
|
def resolve_hostname(hostname):
    """Resolve *hostname* via DNS and split the results by address family.

    Returns a ``(ipv4_addresses, ipv6_addresses)`` tuple of string lists,
    in the order the resolver produced them (duplicates are kept).
    """
    ipv4_addresses = []
    ipv6_addresses = []
    # getaddrinfo yields (family, type, proto, canonname, sockaddr) tuples;
    # the address string is the first element of sockaddr.
    for family, _type, _proto, _canon, sockaddr in socket.getaddrinfo(hostname, None):
        if family == socket.AF_INET:
            ipv4_addresses.append(sockaddr[0])
        elif family == socket.AF_INET6:
            ipv6_addresses.append(sockaddr[0])
    return ipv4_addresses, ipv6_addresses
| |
|
| |
|
class SafeWebBaseLoader(WebBaseLoader):
    """WebBaseLoader with enhanced error handling for URLs."""

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path with error handling.

        Each URL is scraped independently; a failure is logged and the
        loader moves on to the next URL instead of aborting the batch.
        """
        for path in self.web_paths:
            try:
                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
                text = soup.get_text(**self.bs_get_text_kwargs)

                # Collect page metadata; a key is added only when the
                # corresponding tag is present (and non-empty, per bs4
                # Tag truthiness).
                metadata = {"source": path}
                title_tag = soup.find("title")
                if title_tag:
                    metadata["title"] = title_tag.get_text()
                description_tag = soup.find("meta", attrs={"name": "description"})
                if description_tag:
                    metadata["description"] = description_tag.get(
                        "content", "No description found."
                    )
                html_tag = soup.find("html")
                if html_tag:
                    metadata["language"] = html_tag.get("lang", "No language found.")

                yield Document(page_content=text, metadata=metadata)
            except Exception as e:
                # Best-effort loading: record the failure and continue.
                log.error(f"Error loading {path}: {e}")
| |
|
| |
|
def get_web_loader(
    url: Union[str, Sequence[str]],
    verify_ssl: bool = True,
    requests_per_second: int = 2,
):
    """Return a SafeWebBaseLoader for *url* after validating it.

    Raises ValueError when the URL (or any URL in the sequence) fails
    validation — see validate_url for the exact rules.
    """
    # Reject invalid or (when configured) private-network URLs up front.
    if not validate_url(url):
        raise ValueError(ERROR_MESSAGES.INVALID_URL)

    return SafeWebBaseLoader(
        url,
        verify_ssl=verify_ssl,
        requests_per_second=requests_per_second,
        continue_on_failure=True,
    )
| |
|