# AsyncChromiumLoader: scrape HTML pages from URLs with a headless Chromium
# browser driven through Playwright's async API.
import asyncio | |
import logging | |
from typing import Iterator, List | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
logger = logging.getLogger(__name__) | |
class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML pages from URLs using a
    headless instance of the Chromium."""

    def __init__(
        self,
        urls: List[str],
        headless: bool = True,
    ):
        """
        Initialize the loader with a list of URL paths.

        Args:
            urls (List[str]): A list of URLs to scrape content from.
            headless (bool): Whether to launch Chromium headless.
                Defaults to True, preserving the previous fixed behavior.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls
        self.headless = headless
        try:
            import playwright  # noqa: F401
        except ImportError as e:
            # Chain the original exception so the real import failure
            # (e.g. a broken partial install) stays visible to the caller.
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            ) from e

    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            finally:
                # Close in a finally so the Chromium process is not leaked
                # when a BaseException (e.g. asyncio.CancelledError) escapes
                # the try block above.
                await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.
        """
        for url in self.urls:
            # Each URL gets its own short-lived event loop; asyncio.run
            # requires that no loop is already running in this thread.
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    def load(self) -> List[Document]:
        """
        Load and return all Documents from the provided URLs.

        Returns:
            List[Document]: A list of Document objects
            containing the scraped content from each URL.
        """
        return list(self.lazy_load())