Spaces:
Running
Running
from typing import List | |
from .search import get_search_results | |
from langchain_community.document_loaders import RecursiveUrlLoader | |
from langchain_core.documents.base import Document | |
from pydantic import StrictStr | |
def crawl(query: str) -> List[StrictStr]: | |
# get links from search results | |
links_search_engine: List[StrictStr] = get_search_results(query=query) | |
links_crawler: List[StrictStr] = [] | |
# load the documents | |
for link in links_search_engine: | |
try: | |
html_loader = RecursiveUrlLoader(url=link, max_depth=1, timeout=5) | |
docs: List[Document] = html_loader.load() | |
for doc in docs: | |
source: StrictStr = doc.metadata.get("source") # type: ignore | |
links_crawler.append(source) | |
except Exception as e: | |
print(f"Error: {e}") | |
return list(set(links_crawler + links_search_engine)) | |
if __name__ == "__main__": | |
print(crawl("What is the capital of France")) | |