MH0386's picture
Upload folder using huggingface_hub
68051dd verified
raw
history blame contribute delete
971 Bytes
from typing import List
from .search import get_search_results
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_core.documents.base import Document
from pydantic import StrictStr
def crawl(query: str) -> List[StrictStr]:
    """Collect links for *query* from the search engine plus a shallow crawl.

    Every link returned by ``get_search_results`` is loaded with
    ``RecursiveUrlLoader(max_depth=1, timeout=5)`` and the ``"source"``
    metadata entry of each loaded document is collected. A link whose
    loading raises is reported with ``print`` and skipped, so a single bad
    link does not abort the whole crawl.

    Args:
        query: Search query forwarded to ``get_search_results``.

    Returns:
        The de-duplicated links: crawled sources first, followed by the
        search-engine links, each in first-seen order.
    """
    # Get links from the search results.
    links_search_engine: List[StrictStr] = get_search_results(query=query)
    links_crawler: List[StrictStr] = []

    # Load the documents behind each link and record where they came from.
    for link in links_search_engine:
        try:
            html_loader = RecursiveUrlLoader(url=link, max_depth=1, timeout=5)
            docs: List[Document] = html_loader.load()
            for doc in docs:
                source = doc.metadata.get("source")
                # A document without a usable "source" entry would otherwise
                # put None (or an empty string) into the returned link list.
                if source:
                    links_crawler.append(source)
        except Exception as e:
            # Deliberately best-effort: report the failure and move on to
            # the next link instead of failing the whole crawl.
            print(f"Error: {e}")

    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the result is deterministic across runs.
    return list(dict.fromkeys(links_crawler + links_search_engine))
if __name__ == "__main__":
    # Manual smoke run: crawl a sample query and show the collected links.
    sample_links = crawl("What is the capital of France")
    print(sample_links)