| | """ |
| | Storage module for saving and loading processed chunks |
| | """ |
| |
|
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

from .models import DocumentChunk, ProcessingStats
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
class ChunkStorage:
    """Persists document chunks as JSON and writes a human-readable summary."""

    def __init__(self, output_file: Path):
        """
        Initialize storage.

        Args:
            output_file: Path to the output JSON file. Parent directories
                are created eagerly if they do not already exist.
        """
        self.output_file = output_file
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

    def save_chunks(
        self,
        chunks: List[DocumentChunk],
        stats: Optional[ProcessingStats] = None
    ) -> None:
        """
        Save chunks to the JSON output file and write a companion summary.

        The JSON payload has a ``metadata`` section (chunk count, format
        version, and optionally processing stats) and a ``chunks`` list of
        serialized chunk dicts.

        Args:
            chunks: List of DocumentChunk objects to persist.
            stats: Optional processing statistics; embedded under
                ``metadata.processing_stats`` when provided.
        """
        # Lazy %-style args: interpolation only happens if the record is emitted.
        logger.info("Saving %d chunks to %s", len(chunks), self.output_file)

        output = {
            'metadata': {
                'total_chunks': len(chunks),
                'version': '1.0',
            },
            'chunks': [chunk.to_dict() for chunk in chunks],
        }

        if stats:
            output['metadata']['processing_stats'] = stats.to_dict()

        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        logger.info("Successfully saved chunks to %s", self.output_file)

        # Also emit a plain-text summary next to the JSON for quick inspection.
        self._save_summary(chunks, stats)

    def _save_summary(
        self,
        chunks: List[DocumentChunk],
        stats: Optional[ProcessingStats] = None
    ) -> None:
        """
        Write a human-readable summary (stats plus the first 5 chunks) to
        ``chunks_summary.txt`` in the output file's directory.

        Args:
            chunks: Chunks that were just saved.
            stats: Optional processing statistics to include in the header.
        """
        summary_file = self.output_file.parent / "chunks_summary.txt"

        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("DOCUMENT CHUNKS SUMMARY\n")
            f.write("=" * 80 + "\n\n")

            if stats:
                f.write(f"Total Documents Processed: {stats.total_documents}\n")
                f.write(f"Total Chunks Created: {stats.total_chunks}\n")
                f.write(f"Total Words: {stats.total_words}\n")
                f.write(f"Average Chunk Size: {stats.avg_chunk_size:.1f} words\n")
                f.write(f"Processing Time: {stats.processing_time_seconds:.2f} seconds\n")
                f.write("\nDocuments:\n")
                for doc in stats.documents_processed:
                    f.write(f"  - {doc}\n")
                f.write("\n")

            f.write("-" * 80 + "\n")
            f.write("SAMPLE CHUNKS (First 5)\n")
            f.write("-" * 80 + "\n\n")

            for i, chunk in enumerate(chunks[:5], 1):
                f.write(f"Chunk {i}: {chunk.chunk_id}\n")
                f.write(f"Source: {chunk.metadata.source_file}\n")
                f.write(f"Section: {chunk.metadata.article_section or 'N/A'}\n")
                f.write(f"Words: {chunk.metadata.word_count}\n")
                # Truncated preview so the summary stays compact.
                f.write(f"Preview: {chunk.text[:200]}...\n")
                f.write("\n" + "-" * 80 + "\n\n")

        logger.info("Summary saved to %s", summary_file)

    def load_chunks(self) -> List[DocumentChunk]:
        """
        Load chunks from the JSON output file.

        Returns:
            List of DocumentChunk objects reconstructed from the file's
            ``chunks`` entry.

        Raises:
            FileNotFoundError: If the output file does not exist.
        """
        logger.info("Loading chunks from %s", self.output_file)

        if not self.output_file.exists():
            raise FileNotFoundError(f"Chunks file not found: {self.output_file}")

        with open(self.output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        chunks = [DocumentChunk.from_dict(chunk_data) for chunk_data in data['chunks']]

        logger.info("Loaded %d chunks", len(chunks))

        return chunks

    def validate_chunks(self, chunks: List[DocumentChunk]) -> bool:
        """
        Validate chunks before saving.

        Args:
            chunks: List of chunks to validate.

        Returns:
            True if all chunks are valid.

        Raises:
            ValueError: If the list is empty, or any chunk has empty text,
                a missing ID, or a zero word count.
        """
        if not chunks:
            raise ValueError("No chunks to save")

        for i, chunk in enumerate(chunks):
            if not chunk.text or not chunk.text.strip():
                raise ValueError(f"Chunk {i} has empty text")

            if not chunk.chunk_id:
                raise ValueError(f"Chunk {i} has no ID")

            if chunk.metadata.word_count == 0:
                raise ValueError(f"Chunk {i} has zero word count")

        logger.info("Validated %d chunks successfully", len(chunks))
        return True
| |
|