compliance / src /parser.py
ANAMARIAMAGALHAES's picture
Upload 12 files
cb8830c verified
raw
history blame contribute delete
723 Bytes
from langchain_community.document_loaders import (
PyPDFLoader,
TextLoader,
CSVLoader,
UnstructuredHTMLLoader
)
from pathlib import Path
def load_documents(directory):
    """Recursively load every supported document found under *directory*.

    Files are selected by (case-insensitive) extension and parsed with the
    matching loader:

    * ``.pdf``            -> ``PyPDFLoader``
    * ``.txt``            -> ``TextLoader`` (decoded as UTF-8)
    * ``.csv``            -> ``CSVLoader`` (decoded as UTF-8)
    * ``.html``, ``.htm`` -> ``UnstructuredHTMLLoader``

    Anything else, including sub-directories themselves, is skipped.

    Args:
        directory: Root folder to scan, as a string or path-like object.

    Returns:
        A flat list with everything returned by each loader's ``load()``
        call, ordered by sorted file path.

    Raises:
        Any exception raised by a loader propagates unchanged; a single
        unreadable file therefore aborts the whole load.
    """
    docs = []
    # Sort so the result order does not depend on filesystem listing order.
    for file in sorted(Path(directory).rglob("*")):
        # rglob("*") also yields directories; one named e.g. "archive.pdf"
        # must never be handed to a file loader.
        if not file.is_file():
            continue

        suffix = file.suffix.lower()
        if suffix == ".pdf":
            loader = PyPDFLoader(str(file))
        elif suffix == ".txt":
            # Pin the encoding (as the CSV branch does) instead of relying on
            # the platform's locale default.
            loader = TextLoader(str(file), encoding="utf-8")
        elif suffix == ".csv":
            loader = CSVLoader(file_path=str(file), encoding="utf-8")
        elif suffix in (".html", ".htm"):
            loader = UnstructuredHTMLLoader(str(file))
        else:
            continue  # unsupported extension

        docs.extend(loader.load())
    return docs