# NOTE(review): removed non-Python scrape residue ("Spaces:" / "Sleeping"
# hosting-status lines) that preceded the module and broke parsing.
import json
import os
from typing import Optional

from pydantic import BaseModel, Field
class CrawlUrl(BaseModel):
    """One crawl target: a starting URL plus traversal limits.

    Instances are consumed by ``get_web_documents``, which forwards
    ``prefix`` and ``max_depth`` to ``WholeSiteReader`` and starts the
    crawl at ``base_url``.
    """

    # Page the crawl starts from.
    base_url: str
    # URL prefix forwarded to WholeSiteReader (presumably restricts which
    # links are followed — confirm against the reader's docs).
    prefix: str
    # Link-depth limit forwarded to WholeSiteReader; must be >= 0.
    max_depth: int = Field(default=1, ge=0)
class WebLoaderConfig(BaseModel):
    """Configuration for the web loader: Chrome options plus crawl targets."""

    # Extra command-line arguments for the Chrome driver; None means none.
    # Fix: the original declared `list[str] = Field(default=None)`, a
    # non-optional annotation with a None default. None is clearly a
    # supported value (the consumer does `config.driver_arguments or []`),
    # so the annotation is widened to Optional. Behavior for all existing
    # callers is unchanged.
    driver_arguments: Optional[list[str]] = None
    # Sites to crawl, one entry per starting URL.
    urls: list[CrawlUrl]
def get_web_documents(config: WebLoaderConfig):
    """Crawl every site listed in *config* and return the scraped documents.

    A fresh Chrome driver is started for each crawl target and shut down
    afterwards, even if scraping raises.

    Args:
        config: Chrome driver arguments and the list of crawl targets.

    Returns:
        A flat list of the documents produced by
        ``WholeSiteReader.load_data`` across all targets.
    """
    import contextlib

    from llama_index.readers.web import WholeSiteReader
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    for arg in config.driver_arguments or []:
        options.add_argument(arg)

    docs = []
    for url in config.urls:
        driver = webdriver.Chrome(options=options)
        try:
            scraper = WholeSiteReader(
                prefix=url.prefix,
                max_depth=url.max_depth,
                driver=driver,
            )
            docs.extend(scraper.load_data(url.base_url))
        finally:
            # Fix: the original never quit the driver, leaking one Chrome
            # process per URL. Some WholeSiteReader versions quit the
            # driver themselves inside load_data, so a second quit() may
            # raise — suppress it; this is best-effort cleanup only.
            with contextlib.suppress(Exception):
                driver.quit()
    return docs