# src/document_processor/processor.py
import logging
import re  # used by the metadata/splitting sketches below

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

from config.settings import CHUNK_SIZE, CHUNK_OVERLAP

logger = logging.getLogger(__name__)


def split_documents(docs: list[Document]) -> list[Document]:
    """
    Split loaded documents into smaller chunks.

    Args:
        docs: A list of LangChain Document objects.

    Returns:
        A list of LangChain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of
    # financial documents, e.g. splitting by sections or articles, or semantic
    # chunking based on document structure, rather than plain character counts.
    # Ensure metadata is carried over or enriched during splitting. A hedged
    # sketch of a section-aware splitter follows after this function.
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = splitter.split_documents(docs)
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks
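

# A minimal sketch of the section-aware splitting the TODO above asks for,
# assuming rulings use headings like "Section 12" or "Article 3". The heading
# pattern is a guess about the corpus layout, not a confirmed format.
_SECTION_RE = re.compile(r"(?m)^(?=(?:Section|Article)\s+\d+\b)")


def split_by_sections(docs: list[Document]) -> list[Document]:
    """Illustrative alternative to split_documents(): one chunk per section."""
    chunks: list[Document] = []
    for doc in docs:
        for part in _SECTION_RE.split(doc.page_content):
            if part.strip():
                # Carry the parent document's metadata onto every section chunk.
                chunks.append(Document(page_content=part.strip(), metadata=dict(doc.metadata)))
    return chunks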


def extract_metadata(doc: Document) -> dict:
    """
    Extract relevant metadata from a document.

    Args:
        doc: A LangChain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction for government rulings. Parse
    # the document content, or use pre-extracted information, to capture:
    #   - Date of ruling
    #   - Relevant law or statute references
    #   - Topic(s) of the ruling
    #   - Case number or identifier
    #   - Source file path (already present in doc.metadata)
    #   - Any other relevant identifiers or classifications.
    # This metadata is CRITICAL for accurate filtering and retrieval. Hedged
    # sketches for the law-reference and case-number fields follow this function.
    # ------------------------------------
    metadata = doc.metadata.copy()
    # Illustrative enrichment using the sketch helpers defined at the bottom of
    # this module; swap these for ministry-specific parsers once available.
    try:
        metadata["ruling_date"] = parse_date_from_doc(doc)
    except Exception as e:
        logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
        metadata["ruling_date"] = None  # Or a sensible default
    metadata["topic"] = extract_topic_from_doc(doc)
    return metadata
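

# Hedged sketches for two of the fields listed in the TODO above. The citation
# formats assumed here ("Act No. 12 of 2019", "Case No. 2021/45") are
# illustrative guesses; adapt the regexes to the ministry's real citation style.
def extract_law_references(doc: Document) -> list[str]:
    """Return statute-style references found in the content (assumed format)."""
    return re.findall(r"\bAct\s+No\.\s*\d+\s+of\s+\d{4}\b", doc.page_content)


def extract_case_number(doc: Document) -> str | None:
    """Return the first case identifier found in the content (assumed format)."""
    match = re.search(r"\bCase\s+No\.\s*(\d{4}/\d+)\b", doc.page_content)
    return match.group(1) if match else None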


def process_documents(docs: list[Document]) -> list[Document]:
    """
    Process a list of raw documents by splitting and extracting metadata.

    Args:
        docs: A list of raw LangChain Document objects.

    Returns:
        A list of processed LangChain Document chunks with enriched metadata.
    """
chunks = split_documents(docs)
processed_chunks = []
for chunk in chunks:
# Extract/enrich metadata for each chunk
chunk.metadata = extract_metadata(chunk)
processed_chunks.append(chunk)
logger.info(f"Processed {len(chunks)} chunks with metadata.")
return processed_chunks


# Sketch implementations of the helpers referenced in extract_metadata();
# both patterns are assumptions about the corpus, not confirmed formats.
def parse_date_from_doc(doc: Document) -> str | None:
    """Return the first ISO-style date (YYYY-MM-DD) in the content, if any."""
    match = re.search(r"\b(\d{4}-\d{2}-\d{2})\b", doc.page_content)
    return match.group(1) if match else None


def extract_topic_from_doc(doc: Document) -> str | None:
    """Naive topic guess: the first non-empty line of the content."""
    lines = (line.strip() for line in doc.page_content.splitlines())
    return next((line for line in lines if line), None)
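

if __name__ == "__main__":
    # Smoke test with a synthetic ruling; the content and metadata below are
    # made up purely to exercise the pipeline end to end.
    logging.basicConfig(level=logging.INFO)
    sample = Document(
        page_content="Case No. 2023/17\n2023-07-14\nSection 1\nIncome tax exemption ruling...",
        metadata={"source": "sample.txt"},
    )
    for chunk in process_documents([sample]):
        print(chunk.metadata)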