# src/document_processor/processor.py

from langchain_text_splitters import RecursiveCharacterTextSplitter  # cite: embed_pipeline.py
from langchain.schema import Document  # cite: embed_pipeline.py
from config.settings import CHUNK_SIZE, CHUNK_OVERLAP
import logging
import os

logger = logging.getLogger(__name__)


def split_documents(docs: list[Document]) -> list[Document]:
    """
    Splits loaded documents into smaller chunks.

    Args:
        docs: A list of LangChain Document objects.

    Returns:
        A list of LangChain Document objects representing the chunks.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement a splitting strategy that understands the structure of financial
    # documents. This might involve splitting by sections, articles, or semantic chunking
    # based on document structure, rather than character count alone (see the illustrative
    # sketch after this function).
    # Ensure metadata is carried over or enriched during splitting.
    # ------------------------------------
    splitter = RecursiveCharacterTextSplitter(  # cite: embed_pipeline.py
        chunk_size=CHUNK_SIZE,  # cite: embed_pipeline.py
        chunk_overlap=CHUNK_OVERLAP,  # cite: embed_pipeline.py
    )
    chunks = splitter.split_documents(docs)  # cite: embed_pipeline.py
    logger.info(f"Split {len(docs)} documents into {len(chunks)} chunks.")
    return chunks
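
# --- Illustrative sketch (assumption, not the project's implemented strategy) ---
# One way to approach the TODO above: hand RecursiveCharacterTextSplitter a list of
# high-priority separators that match section/article headings, so chunks prefer to
# break on structural boundaries before falling back to blank lines and characters.
# The heading patterns below are assumptions about how rulings are formatted and
# would need to be adapted to the real documents.
def split_documents_by_structure(docs: list[Document]) -> list[Document]:
    """Sketch of a structure-aware splitter; the separator patterns are illustrative."""
    structural_splitter = RecursiveCharacterTextSplitter(
        separators=[
            r"\nSection \d+",  # assumed section heading pattern
            r"\nArticle \d+",  # assumed article heading pattern
            "\n\n",
            "\n",
            " ",
            "",
        ],
        is_separator_regex=True,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = structural_splitter.split_documents(docs)
    logger.info(f"Structure-aware split produced {len(chunks)} chunks from {len(docs)} documents.")
    return chunks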
def extract_metadata(doc: Document) -> dict:
    """
    Extracts relevant metadata from a document.

    Args:
        doc: A LangChain Document object.

    Returns:
        A dictionary of extracted metadata.
    """
    # --- Financial Ministry Adaptation ---
    # TODO: Implement robust metadata extraction logic specifically for government rulings.
    # This should parse the document content or use pre-extracted information to get:
    #   - Date of ruling
    #   - Relevant law or statute references
    #   - Topic(s) of the ruling
    #   - Case number or identifier
    #   - Source file path (already present in the loader's metadata)
    #   - Any other relevant identifiers or classifications.
    # This metadata is CRITICAL for accurate filtering and retrieval.
    # A hedged sketch of a possible date parser appears at the end of this module.
    # ------------------------------------
    metadata = doc.metadata.copy()

    # Example: placeholder for parsing the ruling date from content or filename.
    # try:
    #     metadata['ruling_date'] = parse_date_from_doc(doc)
    # except Exception as e:
    #     logger.warning(f"Could not extract date for {metadata.get('source', 'unknown')}: {e}")
    #     metadata['ruling_date'] = None  # Or a default value

    # Example: placeholder for extracting the topic from content.
    # metadata['topic'] = extract_topic_from_doc(doc)

    return metadata
def process_documents(docs: list[Document]) -> list[Document]:
    """
    Processes a list of raw documents by splitting and extracting metadata.

    Args:
        docs: A list of raw LangChain Document objects.

    Returns:
        A list of processed LangChain Document chunks with enriched metadata.
    """
    chunks = split_documents(docs)
    processed_chunks = []
    for chunk in chunks:
        # Extract/enrich metadata for each chunk.
        chunk.metadata = extract_metadata(chunk)
        processed_chunks.append(chunk)
    logger.info(f"Processed {len(chunks)} chunks with metadata.")
    return processed_chunks
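
# --- Usage sketch (assumes a loader that is not shown in this module) ---
# `load_documents` is a hypothetical helper used only to illustrate the call order;
# substitute whatever loader the project actually uses to produce Document objects.
#
#     from document_processor.loader import load_documents  # hypothetical import
#
#     raw_docs = load_documents("data/rulings/")   # hypothetical path
#     chunks = process_documents(raw_docs)
#     # `chunks` are now ready for the embedding / indexing step.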
# Placeholder functions for metadata extraction (to be implemented).

def parse_date_from_doc(doc: Document):
    """Placeholder for date extraction logic."""
    pass


def extract_topic_from_doc(doc: Document):
    """Placeholder for topic extraction logic."""
    pass