# AsyncChromiumLoader: scrape HTML pages from URLs with a headless Chromium
# browser driven through Playwright's async API.
import asyncio | |
import logging | |
from typing import Iterator, List | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
logger = logging.getLogger(__name__) | |
class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML pages from URLs using a
    headless instance of the Chromium."""

    def __init__(
        self,
        urls: List[str],
        headless: bool = True,
    ):
        """
        Initialize the loader with a list of URL paths.

        Args:
            urls (List[str]): A list of URLs to scrape content from.
            headless (bool): Whether to launch Chromium headless.
                Defaults to True, preserving the previous fixed behavior.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls
        self.headless = headless
        try:
            import playwright  # noqa: F401
        except ImportError as e:
            # Chain the original exception so the real import failure
            # (e.g. a broken partial install) stays visible to the caller.
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            ) from e

    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            finally:
                # Close in a finally so the Chromium process is not leaked
                # when a BaseException (e.g. asyncio.CancelledError) escapes
                # the try block above.
                await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.
        """
        for url in self.urls:
            # Each URL gets its own short-lived event loop; asyncio.run
            # requires that no loop is already running in this thread.
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    def load(self) -> List[Document]:
        """
        Load and return all Documents from the provided URLs.

        Returns:
            List[Document]: A list of Document objects
            containing the scraped content from each URL.
        """
        return list(self.lazy_load())