"""Data loading utilities for chunks and JSON files."""
import json
from pathlib import Path
from typing import List, Dict, Any

from langchain.docstore.document import Document
| def load_json(filepath: Path | str) -> List[Dict[str, Any]]: | |
| """ | |
| Load JSON data from file. | |
| Args: | |
| filepath: Path to JSON file | |
| Returns: | |
| List of dictionaries containing the JSON data | |
| """ | |
| filepath = Path(filepath) | |
| if not filepath.exists(): | |
| raise FileNotFoundError(f"JSON file not found: {filepath}") | |
| with open(filepath, 'r', encoding='utf-8') as f: | |
| data = json.load(f) | |
| return data | |
| def open_file(filepath: Path | str) -> str: | |
| """ | |
| Open and read a text file. | |
| Args: | |
| filepath: Path to text file | |
| Returns: | |
| File contents as string | |
| """ | |
| filepath = Path(filepath) | |
| if not filepath.exists(): | |
| raise FileNotFoundError(f"File not found: {filepath}") | |
| with open(filepath, 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| return content | |
def load_chunks(chunks_file: Path | str | None = None) -> List[Dict[str, Any]]:
    """
    Load document chunks from a JSON file.

    Args:
        chunks_file: Path to the chunks JSON file. If None, the default
            path ``reports/docling_chunks.json`` is used.

    Returns:
        List of chunk dictionaries.

    Raises:
        FileNotFoundError: If the chunks file does not exist (via load_json).
    """
    # Fix: the annotation previously read `Path | str = None`, which is an
    # invalid implicit Optional — None must appear in the union (PEP 484).
    if chunks_file is None:
        chunks_file = Path("reports/docling_chunks.json")
    return load_json(chunks_file)
def chunks_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
    """
    Convert chunk dictionaries to LangChain Document objects.

    Missing ``content`` falls back to an empty string and missing
    ``metadata`` to an empty dict.

    Args:
        chunks: List of chunk dictionaries.

    Returns:
        List of Document objects, one per chunk, in input order.
    """
    return [
        Document(
            page_content=chunk.get("content", ""),
            metadata=chunk.get("metadata", {}),
        )
        for chunk in chunks
    ]
def validate_chunks(chunks: List[Dict[str, Any]]) -> bool:
    """
    Check that every chunk carries the required structure.

    A valid chunk has a ``content`` key and a ``metadata`` key whose value
    is a dict containing at least a ``filename`` entry.

    Args:
        chunks: List of chunk dictionaries to inspect.

    Returns:
        True when all chunks are valid.

    Raises:
        ValueError: On the first chunk that violates the structure.
    """
    for i, chunk in enumerate(chunks):
        # Top-level keys, checked in a fixed order for stable error messages.
        for field in ("content", "metadata"):
            if field not in chunk:
                raise ValueError(f"Chunk {i} missing required field: {field}")
        meta = chunk["metadata"]
        if not isinstance(meta, dict):
            raise ValueError(f"Chunk {i} metadata must be a dictionary")
        if "filename" not in meta:
            raise ValueError(f"Chunk {i} metadata missing 'filename' field")
    return True