# src/document_processor/processor.py
import logging
import re  # used by the metadata/splitting sketches below

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

from config.settings import CHUNK_SIZE, CHUNK_OVERLAP

logger = logging.getLogger(__name__)


def split_documents(docs: list[Document]) -> list[Document]:
    """
    Split loaded documents into smaller chunks.

    Args:
        docs: A list of LangChain Document objects.

    Returns:
        A list of LangChain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of
    # financial documents, e.g. splitting by sections or articles, or semantic
    # chunking based on document structure, rather than plain character counts.
    # Ensure metadata is carried over or enriched during splitting. A hedged
    # sketch of a section-aware splitter follows after this function.
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = splitter.split_documents(docs)
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks
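

# A minimal sketch of the section-aware splitting the TODO above asks for,
# assuming rulings use headings like "Section 12" or "Article 3". The heading
# pattern is a guess about the corpus layout, not a confirmed format.
_SECTION_RE = re.compile(r"(?m)^(?=(?:Section|Article)\s+\d+\b)")


def split_by_sections(docs: list[Document]) -> list[Document]:
    """Illustrative alternative to split_documents(): one chunk per section."""
    chunks: list[Document] = []
    for doc in docs:
        for part in _SECTION_RE.split(doc.page_content):
            if part.strip():
                # Carry the parent document's metadata onto every section chunk.
                chunks.append(Document(page_content=part.strip(), metadata=dict(doc.metadata)))
    return chunks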


def extract_metadata(doc: Document) -> dict:
    """
    Extract relevant metadata from a document.

    Args:
        doc: A LangChain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction for government rulings. Parse
    # the document content, or use pre-extracted information, to capture:
    #   - Date of ruling
    #   - Relevant law or statute references
    #   - Topic(s) of the ruling
    #   - Case number or identifier
    #   - Source file path (already present in doc.metadata)
    #   - Any other relevant identifiers or classifications.
    # This metadata is CRITICAL for accurate filtering and retrieval. Hedged
    # sketches for the law-reference and case-number fields follow this function.
    # ------------------------------------
    metadata = doc.metadata.copy()
    # Illustrative enrichment using the sketch helpers defined at the bottom of
    # this module; swap these for ministry-specific parsers once available.
    try:
        metadata["ruling_date"] = parse_date_from_doc(doc)
    except Exception as e:
        logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
        metadata["ruling_date"] = None  # Or a sensible default
    metadata["topic"] = extract_topic_from_doc(doc)
    return metadata
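

# Hedged sketches for two of the fields listed in the TODO above. The citation
# formats assumed here ("Act No. 12 of 2019", "Case No. 2021/45") are
# illustrative guesses; adapt the regexes to the ministry's real citation style.
def extract_law_references(doc: Document) -> list[str]:
    """Return statute-style references found in the content (assumed format)."""
    return re.findall(r"\bAct\s+No\.\s*\d+\s+of\s+\d{4}\b", doc.page_content)


def extract_case_number(doc: Document) -> str | None:
    """Return the first case identifier found in the content (assumed format)."""
    match = re.search(r"\bCase\s+No\.\s*(\d{4}/\d+)\b", doc.page_content)
    return match.group(1) if match else None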


def process_documents(docs: list[Document]) -> list[Document]:
    """
    Process a list of raw documents by splitting and extracting metadata.

    Args:
        docs: A list of raw LangChain Document objects.

    Returns:
        A list of processed LangChain Document chunks with enriched metadata.
    """
chunks = split_documents(docs)
processed_chunks = []
for chunk in chunks:
# Extract/enrich metadata for each chunk
chunk.metadata = extract_metadata(chunk)
processed_chunks.append(chunk)
logger.info(f"Processed {len(chunks)} chunks with metadata.")
return processed_chunks


# Sketch implementations of the helpers referenced in extract_metadata();
# both patterns are assumptions about the corpus, not confirmed formats.
def parse_date_from_doc(doc: Document) -> str | None:
    """Return the first ISO-style date (YYYY-MM-DD) in the content, if any."""
    match = re.search(r"\b(\d{4}-\d{2}-\d{2})\b", doc.page_content)
    return match.group(1) if match else None


def extract_topic_from_doc(doc: Document) -> str | None:
    """Naive topic guess: the first non-empty line of the content."""
    lines = (line.strip() for line in doc.page_content.splitlines())
    return next((line for line in lines if line), None)
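

if __name__ == "__main__":
    # Smoke test with a synthetic ruling; the content and metadata below are
    # made up purely to exercise the pipeline end to end.
    logging.basicConfig(level=logging.INFO)
    sample = Document(
        page_content="Case No. 2023/17\n2023-07-14\nSection 1\nIncome tax exemption ruling...",
        metadata={"source": "sample.txt"},
    )
    for chunk in process_documents([sample]):
        print(chunk.metadata)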