Spaces:
Sleeping
Sleeping
File size: 1,715 Bytes
ee8fb16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import os
from langchain_community.document_loaders import WebBaseLoader, WikipediaLoader, OnlinePDFLoader
from langchain.docstore.document import Document
import requests
CONTENT_DIR = os.path.dirname(__file__)
def load_web_page(page_url):
"""Loads a web page and returns a list of Document objects."""
try:
loader = WebBaseLoader(page_url)
return loader.load()
except requests.exceptions.RequestException as e:
print(f"Error loading web page from {page_url}: {e}")
return []
def load_online_pdf(pdf_url):
"""Loads an online PDF and returns a list of Document objects."""
try:
loader = OnlinePDFLoader(pdf_url)
return loader.load()
except Exception as e:
print(f"Error loading online PDF from {pdf_url}: {e}")
return []
def load_wiki_articles(query, load_max_docs=2):
"""Fetches Wikipedia articles related to a query and returns a list of Document objects."""
try:
wiki_loader = WikipediaLoader(query=query, load_max_docs=load_max_docs)
return wiki_loader.load()
except Exception as e:
print(f"Error loading Wikipedia articles for query '{query}': {e}")
return []
if __name__ == "__main__":
# Example usage:
web_page_url = "https://equitygroupholdings.com/ke/open-an-account/ordinary-account"
web_page_docs = load_web_page(web_page_url)
print(f"Loaded {len(web_page_docs)} documents from {web_page_url}")
pdf_url = "https://equitygroupholdings.com/wp-content/uploads/2023/06/Equity-Group-Holdings-PLC-2022-Integrated-Report-and-Financial-Statements.pdf"
pdf_docs = load_online_pdf(pdf_url)
print(f"Loaded {len(pdf_docs)} documents from {pdf_url}") |