api / src /data_loader /loader.py
Chandima Prabhath
Refactor code structure for improved readability and maintainability
10b392a
# src/data_loader/loader.py
import os
from glob import glob
from langchain_community.document_loaders import TextLoader # cite: embed_pipeline.py
from langchain.schema import Document # cite: embed_pipeline.py
from config.settings import DOCS_FOLDER
import logging
logger = logging.getLogger(__name__)
def _caused_by_decode_error(exc: BaseException) -> bool:
    """Return True if *exc* or anything in its cause/context chain is a UnicodeDecodeError."""
    seen: set[int] = set()
    current: BaseException | None = exc
    # Track visited exceptions by id so a cyclic chain cannot loop forever.
    while current is not None and id(current) not in seen:
        if isinstance(current, UnicodeDecodeError):
            return True
        seen.add(id(current))
        current = current.__cause__ or current.__context__
    return False


def load_documents(docs_folder: str = DOCS_FOLDER) -> list[Document]:
    """
    Loads documents from the specified folder.

    Every regular file directly inside ``docs_folder`` whose name matches the
    glob pattern ``*.*`` is read as text via ``TextLoader``. If loading fails
    because of a decoding problem, the file is re-read as UTF-8 with
    undecodable bytes ignored. Files that still cannot be read are logged and
    skipped; this function does not raise for individual file failures.

    Args:
        docs_folder: The path to the folder containing documents.

    Returns:
        A list of loaded Langchain Document objects, in sorted path order.
        The list is empty if the folder has no matching files.
    """
    all_docs: list[Document] = []

    # Sort for a reproducible result order (glob order is arbitrary) and drop
    # directories whose names happen to contain a dot.
    files = sorted(
        path
        for path in glob(os.path.join(docs_folder, "*.*"))
        if os.path.isfile(path)
    )
    if not files:
        logger.warning(f"No documents found in {docs_folder}")

    for path in files:
        # --- Financial Ministry Adaptation ---
        # TODO: Implement more sophisticated loading for specific government ruling formats (PDFs, DOCX, XML, etc.)
        # This might involve using libraries like pdfminer.six, python-docx, or custom parsers.
        # Handle scanned documents (OCR).
        # ------------------------------------
        try:
            # Attempt UTF-8 loading with autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True,
            )
            docs = loader.load()
        except Exception as load_err:
            # TextLoader may surface a decoding problem wrapped in another
            # exception (e.g. a RuntimeError raised from the UnicodeDecodeError),
            # so inspect the whole chain instead of catching UnicodeDecodeError
            # directly; otherwise the lenient fallback below would never run.
            if not _caused_by_decode_error(load_err):
                logger.error(f"Failed to load file {path}: {load_err}")
                continue  # Skip this file if loading fails

            # Fallback to a lenient read if decoding fails
            logger.warning(f"Decoding error on {path}, falling back to ignore-errors mode")
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
            except Exception as read_err:
                logger.error(f"Failed to read file {path}: {read_err}")
                continue  # Skip this file if even lenient read fails
            docs = [Document(page_content=text, metadata={"source": path})]
        else:
            logger.info(f"Successfully loaded {os.path.basename(path)}")

        all_docs.extend(docs)

    logger.info(f"Finished loading documents. Total documents loaded: {len(all_docs)}")
    return all_docs