Spaces:

insight-ai
/

api

Sleeping

Chandima Prabhath

Refactor code structure for improved readability and maintainability

10b392a 2 months ago

2.35 kB

	# src/data_loader/loader.py
	import os
	from glob import glob
	from langchain_community.document_loaders import TextLoader # cite: embed_pipeline.py
	from langchain.schema import Document # cite: embed_pipeline.py
	from config.settings import DOCS_FOLDER
	import logging

	# Module-level logger, named after this module so log records can be traced back to it.
	logger = logging.getLogger(__name__)

	def load_documents(docs_folder: str = DOCS_FOLDER) -> list[Document]:
	    """
	    Load the files found directly inside ``docs_folder`` as Langchain documents.

	    Every regular file whose name matches ``*.*`` (non-recursive) is read with
	    ``TextLoader`` as UTF-8 with encoding autodetection enabled. If that raises
	    ``UnicodeDecodeError``, the file is re-read as UTF-8 with undecodable bytes
	    dropped. A file that still cannot be read is logged and skipped, so a single
	    bad file never aborts the whole load.

	    Args:
	        docs_folder: The path to the folder containing documents.

	    Returns:
	        A list of loaded Langchain Document objects, in sorted path order.
	        The list is empty when the folder holds no matching readable file.
	    """
	    all_docs: list[Document] = []

	    # The pattern needs a wildcard: a literal "." only ever matches the folder
	    # itself, which cannot be loaded as text. "*.*" matches entries whose name
	    # contains a dot; names without a dot are not picked up.
	    pattern = os.path.join(docs_folder, "*.*")
	    # Keep regular files only (a sub-directory named "v1.2" also matches "*.*")
	    # and sort them, because glob() returns paths in arbitrary order.
	    files = sorted(path for path in glob(pattern) if os.path.isfile(path))

	    for path in files:
	        try:
	            # --- Financial Ministry Adaptation ---
	            # TODO: Implement more sophisticated loading for specific government ruling formats (PDFs, DOCX, XML, etc.)
	            # This might involve using libraries like pdfminer.six, python-docx, or custom parsers.
	            # Handle scanned documents (OCR).
	            # ------------------------------------

	            # Attempt UTF-8 loading with autodetect fallback
	            loader = TextLoader(
	                path,
	                encoding="utf-8",
	                autodetect_encoding=True,
	            )
	            docs = loader.load()
	        except UnicodeDecodeError:
	            # Fallback to a lenient read if decoding fails
	            logger.warning(f"Decoding error on {path}, falling back to ignore-errors mode")
	            try:
	                with open(path, "r", encoding="utf-8", errors="ignore") as f:
	                    text = f.read()
	                docs = [Document(page_content=text, metadata={"source": path})]
	            except Exception as e:
	                logger.error(f"Failed to read file {path}: {e}")
	                continue  # Skip this file if even lenient read fails
	        except Exception as e:
	            logger.error(f"Failed to load file {path}: {e}")
	            continue  # Skip this file if loading fails
	        else:
	            # Only the TextLoader path reports success; the lenient path has
	            # already logged its warning.
	            logger.info(f"Successfully loaded {os.path.basename(path)}")

	        all_docs.extend(docs)

	    logger.info(f"Finished loading documents. Total documents loaded: {len(all_docs)}")
	    return all_docs