"""
Data models for embeddings and vector storage.

This module defines Pydantic models for embedding metadata.
"""
|
|
from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
class EmbeddingMetadata(BaseModel):
    """Metadata stored with each embedding in ChromaDB.

    Every field is a plain ``str`` or ``int``. The page list is carried as
    a JSON-encoded string (``page_numbers``) rather than a native list, and
    the ingestion timestamp as an ISO-formatted string.
    NOTE(review): presumably this is because the vector store only accepts
    scalar metadata values -- confirm against the storage layer.

    All fields are required except ``parent_id``, which defaults to the
    empty string. Integer fields are constrained to be non-negative.
    """

    # Pydantic v2 style configuration. The ``json_schema_extra`` key is only
    # understood by Pydantic v2, where a nested ``class Config`` is deprecated
    # and emits a deprecation warning at class-creation time; ``model_config``
    # is the supported replacement and yields the same JSON-schema example.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "chunk_id": "uuid-string",
                "document_id": "doc-uuid-string",
                "parent_id": "parent-uuid-string",
                "filename": "sample.pdf",
                "file_hash": "abc123...",
                "page_numbers": "[1, 2]",
                "chunk_index": 0,
                "chunk_type": "child",
                "token_count": 800,
                "start_char": 0,
                "end_char": 1000,
                "ingestion_date": "2024-01-01T12:00:00",
            },
        },
    )

    # --- Identity -----------------------------------------------------------
    chunk_id: str = Field(..., description="Chunk UUID as string")
    document_id: str = Field(..., description="Document UUID as string")
    # The only optional field: an empty string stands in for "no parent".
    parent_id: str = Field(
        default="",
        description="Parent chunk UUID (empty for parent chunks)",
    )

    # --- Source file --------------------------------------------------------
    filename: str = Field(..., description="Original PDF filename")
    file_hash: str = Field(..., description="SHA256 hash of source file")
    # Stored as a JSON string, not a list; callers must decode it themselves.
    page_numbers: str = Field(..., description="JSON-encoded list of page numbers")

    # --- Position and size within the document ------------------------------
    # ``ge=0`` makes validation reject negative values for each of these.
    chunk_index: int = Field(..., ge=0, description="Position in document")
    # Free-form string: the model does not restrict it to the two documented
    # values.
    chunk_type: str = Field(..., description="'parent' or 'child'")
    token_count: int = Field(..., ge=0, description="Number of tokens")
    # NOTE: no cross-field check is performed that end_char >= start_char.
    start_char: int = Field(..., ge=0, description="Start position in document")
    end_char: int = Field(..., ge=0, description="End position in document")

    # --- Bookkeeping --------------------------------------------------------
    # Kept as a string; the format is not validated by this model.
    ingestion_date: str = Field(..., description="ISO datetime of ingestion")
|
|