Spaces:
Runtime error
Runtime error
| from typing import Iterator, List, Union | |
| import requests | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders.base import BaseLoader | |
class BrowserlessLoader(BaseLoader):
    """Load webpages through the `Browserless` HTTP API.

    With ``text_content=True`` (the default) each URL is posted to the
    ``/scrape`` endpoint with a ``body`` selector and the extracted text
    becomes the document content. Otherwise the URL is posted to the
    ``/content`` endpoint and the raw response body is used.
    """

    def __init__(
        self, api_token: str, urls: Union[str, List[str]], text_content: bool = True
    ):
        """Initialize with API token and the URLs to scrape.

        Args:
            api_token: Browserless API token, sent as the ``token`` query
                parameter on every request.
            urls: A single URL or a list of URLs to scrape.
            text_content: If True, use ``/scrape`` and keep only the text of
                the ``body`` element; if False, use ``/content`` and keep the
                response body as-is.
        """
        # Browserless API token.
        self.api_token = api_token
        # List of URLs to scrape. A bare string is itself iterable, so wrap it;
        # otherwise lazy_load would issue one request per character.
        self.urls = [urls] if isinstance(urls, str) else urls
        self.text_content = text_content

    def _post(self, endpoint: str, payload: dict):
        """POST ``payload`` as JSON to a Browserless endpoint and return the response.

        Raises:
            requests.HTTPError: If the service answers with a 4xx/5xx status.
        """
        response = requests.post(
            f"https://chrome.browserless.io/{endpoint}",
            params={
                "token": self.api_token,
            },
            json=payload,
        )
        # Fail loudly instead of turning an error page into document content.
        response.raise_for_status()
        return response

    def _scrape_text(self, url: str) -> str:
        """Return the text of the ``body`` element of ``url`` via ``/scrape``."""
        response = self._post(
            "scrape",
            {
                "url": url,
                "elements": [
                    {
                        "selector": "body",
                    }
                ],
            },
        )
        # One selector was requested, so read the first result of the first entry.
        return response.json()["data"][0]["results"][0]["text"]

    def _fetch_content(self, url: str) -> str:
        """Return the response body for ``url`` via ``/content``."""
        return self._post("content", {"url": url}).text

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load Documents from URLs.

        Yields:
            One ``Document`` per URL, in order, whose ``metadata["source"]``
            is the URL it was loaded from.

        Raises:
            requests.HTTPError: If Browserless returns an error status.
        """
        for url in self.urls:
            if self.text_content:
                page_content = self._scrape_text(url)
            else:
                page_content = self._fetch_content(url)
            yield Document(
                page_content=page_content,
                metadata={
                    "source": url,
                },
            )

    def load(self) -> List[Document]:
        """Load Documents from URLs.

        Returns:
            All documents produced by ``lazy_load``, collected into a list.
        """
        return list(self.lazy_load())