Initial commit: SPARKNET framework
This view is limited to 50 files because it contains too many changes.
- .dockerignore +90 -0
- .streamlit/config.toml +14 -0
- CHANGELOG.md +232 -0
- Dockerfile +109 -0
- IMPLEMENTATION_REPORT.md +474 -0
- api/auth.py +320 -0
- api/routes/documents.py +553 -0
- api/routes/rag.py +415 -0
- api/schemas.py +302 -0
- config/document.yaml +147 -0
- config/rag.yaml +141 -0
- configs/rag.yaml +201 -0
- demo/README.md +185 -0
- demo/app.py +944 -0
- demo/llm_providers.py +339 -0
- demo/pages/1_🔬_Live_Processing.py +714 -0
- demo/pages/2_💬_Interactive_RAG.py +844 -0
- demo/pages/3_📊_Document_Comparison.py +528 -0
- demo/pages/4_🎯_Evidence_Viewer.py +529 -0
- demo/pages/5_📄_Document_Viewer.py +565 -0
- demo/rag_config.py +396 -0
- demo/requirements.txt +19 -0
- demo/state_manager.py +833 -0
- docker-compose.dev.yml +66 -0
- docker-compose.yml +163 -0
- docs/CLOUD_ARCHITECTURE.md +392 -0
- docs/DOCUMENT_INTELLIGENCE.md +470 -0
- docs/SPARKNET_Progress_Report.py +1432 -0
- examples/document_agent.py +240 -0
- examples/document_intelligence_demo.py +314 -0
- examples/document_processing.py +133 -0
- examples/document_rag_end_to_end.py +359 -0
- examples/rag_pipeline.py +192 -0
- nginx/nginx.conf +254 -0
- run_demo.py +110 -0
- run_demo.sh +52 -0
- scripts to get ideas from/ides.txt +151 -0
- src/agents/document_agent.py +661 -0
- src/cli/__init__.py +9 -0
- src/cli/docint.py +681 -0
- src/cli/document.py +322 -0
- src/cli/main.py +110 -0
- src/cli/rag.py +314 -0
- src/document/__init__.py +75 -0
- src/document/chunking/__init__.py +19 -0
- src/document/chunking/chunker.py +944 -0
- src/document/grounding/__init__.py +21 -0
- src/document/grounding/evidence.py +365 -0
- src/document/io/__init__.py +28 -0
- src/document/io/cache.py +268 -0
.dockerignore
ADDED
@@ -0,0 +1,90 @@
# Git
.git
.gitignore
.gitattributes

# Python
__pycache__
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
.mypy_cache/
.pytest_cache/
.coverage
htmlcov/

# Virtual environments
venv/
ENV/
env/
.venv/
sparknet/

# IDE
.idea/
.vscode/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Logs
*.log
logs/

# Local data (will be mounted as volumes)
data/vectorstore/
data/embedding_cache/
uploads/
outputs/

# Tests
tests/
.pytest_cache/

# Documentation
docs/
*.md
!README.md

# Notebooks
*.ipynb
.ipynb_checkpoints/

# Backup files
.backup/
*.bak

# Screenshots
screenshots/

# Development files
*.env.local
*.env.development
*.env.test

# Large files
*.pdf
*.pptx
*.docx
Dataset/
presentation/
.streamlit/config.toml
ADDED
@@ -0,0 +1,14 @@
[server]
headless = true
port = 8501
enableCORS = false
maxUploadSize = 50

[theme]
primaryColor = "#4ECDC4"
backgroundColor = "#0e1117"
secondaryBackgroundColor = "#1a1a2e"
textColor = "#ffffff"

[browser]
gatherUsageStats = false
CHANGELOG.md
ADDED
@@ -0,0 +1,232 @@
# SPARKNET Changelog

All notable changes to the SPARKNET project are documented in this file.

## [1.2.0] - 2026-01-20

### Added (Phase 1B Continuation)

#### Table Extraction Preservation (FG-002) - HIGH PRIORITY
- **Enhanced SemanticChunker** (`src/document/chunking/chunker.py`)
  - Table structure reconstruction from OCR regions
  - Markdown table generation with proper formatting
  - Header row detection using heuristics
  - Structured data storage in `extra.table_structure`
  - Cell positions preserved for evidence highlighting
  - Searchable text includes header context for better embedding
  - Configurable row/column thresholds

- **ChunkerConfig enhancements** (see the usage sketch below)
  - `preserve_table_structure` - Enable markdown conversion
  - `table_row_threshold` - Y-coordinate grouping threshold
  - `table_col_threshold` - X-coordinate clustering threshold
  - `detect_table_headers` - Automatic header detection

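A minimal usage sketch for the options above, assuming `ChunkerConfig` accepts them as keyword arguments; the actual constructor in `src/document/chunking/chunker.py` may differ:

```python
# Hypothetical sketch - the option names come from the changelog entries above,
# but the exact ChunkerConfig/SemanticChunker signatures are assumptions.
from src.document.chunking.chunker import ChunkerConfig, SemanticChunker

config = ChunkerConfig(
    preserve_table_structure=True,  # emit markdown tables instead of raw OCR text
    table_row_threshold=8.0,        # max Y-distance for cells grouped into one row
    table_col_threshold=15.0,       # max X-distance for cells clustered into one column
    detect_table_headers=True,      # treat the first detected row as the header
)
chunker = SemanticChunker(config)
```
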
#### Nginx Configuration (TG-005)
- **Nginx Reverse Proxy** (`nginx/nginx.conf`)
  - Production-ready reverse proxy configuration
  - Rate limiting (30 req/s API, 5 req/s uploads)
  - WebSocket support for Streamlit
  - SSE support for RAG streaming
  - Gzip compression
  - Security headers (XSS, CSRF protection)
  - SSL/TLS configuration (commented, ready for production)
  - Connection limits and timeout tuning

#### Integration Tests (TG-006)
- **API Integration Tests** (`tests/integration/test_api_v2.py`)
  - TestClient-based testing without a running server (see the sketch below)
  - Health/status endpoint tests
  - Authentication flow tests
  - Document upload/process/index workflow
  - RAG query and search tests
  - Error handling verification
  - Concurrency tests
  - Performance benchmarks (marked slow)

- **Table Chunker Unit Tests** (`tests/unit/test_table_chunker.py`)
  - Table structure reconstruction tests
  - Markdown generation tests
  - Header detection tests
  - Column detection tests
  - Edge case handling

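As a rough illustration of the TestClient approach (not the actual contents of `tests/integration/test_api_v2.py`, which are outside this 50-file view), FastAPI's `TestClient` exercises the app in-process, so no server needs to be running:

```python
# Illustrative sketch only - the real suite lives in tests/integration/test_api_v2.py.
from fastapi.testclient import TestClient

from api.main import app

client = TestClient(app)


def test_health_endpoint():
    # Route path taken from the API quick reference later in this changelog.
    response = client.get("/api/health")
    assert response.status_code == 200
```
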
#### Cross-Module State Synchronization (Phase 1B)
- **Enhanced State Manager** (`demo/state_manager.py`)
  - Event system with a pub/sub pattern (see the sketch below)
  - `EventType` enum for type-safe events
  - Evidence highlighting synchronization
  - Page/chunk selection sync across modules
  - RAG query/response sharing
  - Module-specific state storage
  - Sync version tracking for change detection
  - Helper components: `render_evidence_panel()`, `render_document_selector()`

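A minimal sketch of the pub/sub idea; only `EventType` is named in this summary, so the `subscribe`/`publish` method names and event members below are assumptions:

```python
# Hypothetical sketch of the event system in demo/state_manager.py.
from collections import defaultdict
from enum import Enum, auto
from typing import Callable


class EventType(Enum):
    EVIDENCE_HIGHLIGHTED = auto()  # illustrative members
    PAGE_SELECTED = auto()


class EventBus:
    def __init__(self) -> None:
        self._subscribers: dict[EventType, list[Callable]] = defaultdict(list)

    def subscribe(self, event: EventType, handler: Callable) -> None:
        self._subscribers[event].append(handler)

    def publish(self, event: EventType, payload: dict) -> None:
        # Every module that subscribed to this event receives the payload.
        for handler in self._subscribers[event]:
            handler(payload)


bus = EventBus()
bus.subscribe(EventType.PAGE_SELECTED, lambda p: print(f"page -> {p['page']}"))
bus.publish(EventType.PAGE_SELECTED, {"page": 3})
```
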
---

## [1.1.0] - 2026-01-20

### Added

#### REST API (Phase 1B - TG-003)
- **Document API** (`api/routes/documents.py`)
  - `POST /api/documents/upload` - Upload and process documents
  - `GET /api/documents` - List all documents with filtering
  - `GET /api/documents/{doc_id}` - Get document by ID
  - `GET /api/documents/{doc_id}/detail` - Get detailed document info
  - `GET /api/documents/{doc_id}/chunks` - Get document chunks
  - `POST /api/documents/{doc_id}/process` - Trigger processing
  - `POST /api/documents/{doc_id}/index` - Index to RAG
  - `POST /api/documents/batch-index` - Batch index multiple documents
  - `DELETE /api/documents/{doc_id}` - Delete a document

- **RAG API** (`api/routes/rag.py`)
  - `POST /api/rag/query` - Execute RAG query with 5-agent pipeline
  - `POST /api/rag/query/stream` - Stream RAG response (SSE)
  - `POST /api/rag/search` - Semantic search without synthesis
  - `GET /api/rag/store/status` - Get vector store status
  - `DELETE /api/rag/store/collection/{name}` - Clear collection
  - `GET /api/rag/cache/stats` - Get cache statistics
  - `DELETE /api/rag/cache` - Clear query cache

- **API Schemas** (`api/schemas.py`)
  - Request/response models for all endpoints
  - Document, Query, Search, Citation schemas
  - Pydantic validation with comprehensive field definitions

#### Authentication (Phase 1C - TG-002)
- **JWT Authentication** (`api/auth.py`) - see the client-side sketch below
  - OAuth2 password bearer scheme
  - `POST /api/auth/token` - Get access token
  - `POST /api/auth/register` - Register new user
  - `GET /api/auth/me` - Get current user info
  - `GET /api/auth/users` - List users (admin only)
  - `DELETE /api/auth/users/{username}` - Delete user (admin only)
  - Password hashing with bcrypt
  - Default admin user creation on startup

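A client-side sketch of the token flow using the endpoints above, assuming the API runs on localhost:8000 and the default admin account created at startup:

```python
# Illustrative client for the OAuth2 password flow described above.
import requests

BASE = "http://localhost:8000"

# 1. Obtain a bearer token (the password flow sends form-encoded data).
resp = requests.post(
    f"{BASE}/api/auth/token",
    data={"username": "admin", "password": "admin123"},
)
token = resp.json()["access_token"]

# 2. Call an authenticated endpoint with the Authorization header.
me = requests.get(f"{BASE}/api/auth/me", headers={"Authorization": f"Bearer {token}"})
print(me.json())
```
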
#### Extended Document Support (Phase 1B - FG-001)
- Added support for new document formats in document processing:
  - **Word (.docx)** - Full text and table extraction
  - **Excel (.xlsx, .xls)** - Multi-sheet extraction
  - **PowerPoint (.pptx)** - Slide-by-slide text extraction
  - **Text (.txt)** - Plain text processing
  - **Markdown (.md)** - Markdown file support

#### Caching (Phase 1B - TG-004)
- **Cache Manager** (`src/utils/cache_manager.py`) - see the sketch below
  - Redis-based caching with in-memory fallback
  - `QueryCache` - Cache RAG query results (1 hour TTL)
  - `EmbeddingCache` - Cache embeddings (24 hour TTL)
  - `@cached` decorator for function-level caching
  - Automatic cache cleanup and size limits

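A rough sketch of what the function-level `@cached` decorator could look like; `src/utils/cache_manager.py` is outside this 50-file view, so the signature and the in-memory store below are assumptions standing in for the Redis-backed implementation:

```python
# Hypothetical sketch of a TTL-based @cached decorator (in-memory fallback only).
import functools
import time


def cached(ttl_seconds: int = 3600):
    def decorator(fn):
        store = {}  # Redis would replace this simple dict

        @functools.wraps(fn)
        def wrapper(*args):
            now = time.monotonic()
            if args in store:
                value, expires_at = store[args]
                if now < expires_at:
                    return value  # cache hit, still fresh
            value = fn(*args)
            store[args] = (value, now + ttl_seconds)
            return value

        return wrapper

    return decorator


@cached(ttl_seconds=3600)  # mirrors the 1-hour query-result TTL noted above
def answer_query(query: str) -> str:
    return f"answer for: {query}"
```
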
#### Docker Containerization (Phase 1C - TG-007)
- **Dockerfile** - Multi-stage build
  - Production stage with optimized image
  - Development stage with hot reload
  - Health checks and proper dependencies

- **docker-compose.yml** - Full stack deployment
  - SPARKNET API service
  - Streamlit Demo service
  - Ollama LLM service with GPU support
  - ChromaDB vector store
  - Redis cache
  - Optional Nginx reverse proxy

- **docker-compose.dev.yml** - Development configuration
  - Volume mounts for code changes
  - Hot reload enabled
  - Connects to host Ollama

- **.dockerignore** - Optimized build context

### Changed

#### API Main (`api/main.py`)
- Enhanced lifespan initialization with graceful degradation
- Added RAG component initialization
- Improved health check with component status
- New `/api/status` endpoint for comprehensive system status
- Better error handling allowing partial functionality

### Technical Details

#### New Files Created
```
api/
├── auth.py              # Authentication module
├── schemas.py           # Pydantic models
└── routes/
    ├── documents.py     # Document endpoints
    └── rag.py           # RAG endpoints

src/utils/
└── cache_manager.py     # Redis/memory caching

docker/
├── Dockerfile              # Multi-stage build
├── docker-compose.yml      # Production stack
├── docker-compose.dev.yml  # Development stack
└── .dockerignore           # Build optimization
```

#### Dependencies Added
- `python-jose[cryptography]` - JWT tokens
- `passlib[bcrypt]` - Password hashing
- `python-multipart` - Form data handling
- `redis` - Redis client (optional)
- `python-docx` - Word document support
- `openpyxl` - Excel support
- `python-pptx` - PowerPoint support

#### Configuration
- `SPARKNET_SECRET_KEY` - JWT secret (environment variable)
- `REDIS_URL` - Redis connection string
- `OLLAMA_HOST` - Ollama server URL
- `CHROMA_HOST` / `CHROMA_PORT` - ChromaDB connection

### API Quick Reference

```bash
# Health check
curl http://localhost:8000/api/health

# Upload document
curl -X POST -F "file=@document.pdf" http://localhost:8000/api/documents/upload

# Query RAG
curl -X POST http://localhost:8000/api/rag/query \
  -H "Content-Type: application/json" \
  -d '{"query": "What are the main findings?"}'

# Get token
curl -X POST http://localhost:8000/api/auth/token \
  -d "username=admin&password=admin123"
```

### Docker Quick Start

```bash
# Production deployment
docker-compose up -d

# Development with hot reload
docker-compose -f docker-compose.dev.yml up

# Pull Ollama models
docker exec sparknet-ollama ollama pull llama3.2:latest
docker exec sparknet-ollama ollama pull mxbai-embed-large:latest
```

---

## [1.0.0] - 2026-01-19

### Initial Release
- Multi-Agent RAG Pipeline (5 agents)
- Document Processing Pipeline (OCR, Layout, Chunking)
- Streamlit Demo Application (5 modules)
- ChromaDB Vector Store
- Ollama LLM Integration
Dockerfile
ADDED
@@ -0,0 +1,109 @@
# SPARKNET Dockerfile
# Multi-stage build for optimized production image

# ============== Build Stage ==============
FROM python:3.11-slim as builder

WORKDIR /app

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for caching
COPY requirements.txt .
COPY api/requirements.txt ./api_requirements.txt

# Create virtual environment and install dependencies
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir -r api_requirements.txt

# ============== Production Stage ==============
FROM python:3.11-slim as production

LABEL maintainer="SPARKNET Team"
LABEL description="SPARKNET: Multi-Agentic Document Intelligence Platform"
LABEL version="1.0.0"

WORKDIR /app

# Install runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    # PDF processing
    poppler-utils \
    libpoppler-cpp-dev \
    # Image processing
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    # OCR support
    tesseract-ocr \
    tesseract-ocr-eng \
    # Utilities
    curl \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set Python environment
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app

# Copy application code
COPY src/ ./src/
COPY api/ ./api/
COPY config/ ./config/
COPY demo/ ./demo/

# Create necessary directories
RUN mkdir -p /app/data/vectorstore \
    /app/data/embedding_cache \
    /app/uploads/documents \
    /app/uploads/patents \
    /app/outputs \
    /app/logs

# Set permissions
RUN chmod -R 755 /app

# Expose ports
# 8000 - FastAPI
# 4000 - Streamlit
EXPOSE 8000 4000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8000/api/health || exit 1

# Default command - run FastAPI
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"]

# ============== Development Stage ==============
FROM production as development

# Install development dependencies
RUN pip install --no-cache-dir \
    pytest \
    pytest-asyncio \
    pytest-cov \
    black \
    flake8 \
    mypy \
    ipython \
    jupyter

# Development command with hot reload
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
IMPLEMENTATION_REPORT.md
ADDED
@@ -0,0 +1,474 @@
# SPARKNET Implementation Report
## Agentic Document Intelligence Platform

**Report Date:** January 2025
**Version:** 0.1.0

---

## Executive Summary

SPARKNET is an enterprise-grade **Agentic Document Intelligence Platform** that follows FAANG best practices for:
- **Modular Architecture**: Clean separation of concerns with well-defined interfaces
- **Local-First Privacy**: All processing happens locally via Ollama
- **Evidence Grounding**: Every extraction includes verifiable source references
- **Production-Ready**: Type-safe, tested, configurable, and scalable

---

## 1. What Has Been Implemented

### 1.1 Core Subsystems

| Subsystem | Location | Status | Description |
|-----------|----------|--------|-------------|
| **Document Intelligence** | `src/document_intelligence/` | Complete | Vision-first document understanding |
| **Legacy Document Pipeline** | `src/document/` | Complete | OCR, layout, chunking pipeline |
| **RAG Subsystem** | `src/rag/` | Complete | Vector search with grounded retrieval |
| **Multi-Agent System** | `src/agents/` | Complete | ReAct-style agents with tools |
| **LLM Integration** | `src/llm/` | Complete | Ollama client with routing |
| **CLI** | `src/cli/` | Complete | Full command-line interface |
| **API** | `api/` | Complete | FastAPI REST endpoints |
| **Demo UI** | `demo/` | Complete | Streamlit dashboard |

### 1.2 Document Intelligence Module (`src/document_intelligence/`)

**Architecture (FAANG-inspired: Google DocAI pattern):**

```
src/document_intelligence/
├── chunks/                # Core data models (BoundingBox, DocumentChunk, TableChunk)
│   ├── models.py          # Pydantic models with full type safety
│   └── __init__.py
├── io/                    # Document loading with caching
│   ├── base.py            # Abstract interfaces
│   ├── pdf.py             # PyMuPDF-based PDF loading
│   ├── image.py           # PIL image loading
│   └── cache.py           # LRU page caching
├── models/                # ML model interfaces
│   ├── base.py            # BaseModel, BatchableModel
│   ├── ocr.py             # OCRModel interface
│   ├── layout.py          # LayoutModel interface
│   ├── table.py           # TableModel interface
│   └── vlm.py             # VisionLanguageModel interface
├── parsing/               # Document parsing pipeline
│   ├── parser.py          # DocumentParser orchestrator
│   └── chunking.py        # SemanticChunker
├── grounding/             # Visual evidence
│   ├── evidence.py        # EvidenceBuilder, EvidenceTracker
│   └── crops.py           # Image cropping utilities
├── extraction/            # Field extraction
│   ├── schema.py          # ExtractionSchema, FieldSpec
│   ├── extractor.py       # FieldExtractor
│   └── validator.py       # ExtractionValidator
├── tools/                 # Agent tools
│   ├── document_tools.py  # ParseDocumentTool, ExtractFieldsTool, etc.
│   └── rag_tools.py       # IndexDocumentTool, RetrieveChunksTool, RAGAnswerTool
└── agent_adapter.py       # EnhancedDocumentAgent integration
```

**Key Features:**
- **Zero-Shot Capability**: Works across document formats without training
- **Schema-Driven Extraction**: Define fields using JSON Schema or Pydantic (see the sketch below)
- **Abstention Policy**: Never guesses - abstains when confidence is low
- **Visual Grounding**: Every extraction includes page, bbox, snippet, confidence

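A minimal sketch of the schema-driven idea using the `ExtractionSchema`/`FieldSpec` names from the tree above; `extraction/schema.py` is not shown in this view, so the constructor arguments are assumptions:

```python
# Hypothetical sketch - the class names are real per the module tree above,
# but the field arguments are illustrative.
from src.document_intelligence.extraction.schema import ExtractionSchema, FieldSpec

invoice_schema = ExtractionSchema(
    name="invoice",
    fields=[
        FieldSpec(name="invoice_number", type="string", required=True),
        FieldSpec(name="total_amount", type="number", required=True),
        # Optional fields give the extractor room to abstain instead of guessing.
        FieldSpec(name="due_date", type="date", required=False),
    ],
)
```
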
### 1.3 RAG Subsystem (`src/rag/`)

**Architecture (FAANG-inspired: Meta FAISS + Google Vertex AI pattern):**

```
src/rag/
├── store.py           # VectorStore interface + ChromaVectorStore
├── embeddings.py      # OllamaEmbedding + OpenAIEmbedding (feature-flagged)
├── indexer.py         # DocumentIndexer for chunked documents
├── retriever.py       # DocumentRetriever with evidence support
├── generator.py       # GroundedGenerator with citations
├── docint_bridge.py   # Bridge to document_intelligence subsystem
└── __init__.py        # Clean exports
```

**Key Features:**
- **Local-First Embeddings**: Ollama `nomic-embed-text` by default
- **Cloud Opt-In**: OpenAI embeddings disabled by default, feature-flagged
- **Metadata Filtering**: Filter by document_id, chunk_type, page_range (see the sketch below)
- **Citation Generation**: Answers include `[1]`, `[2]` references
- **Confidence-Based Abstention**: Returns "I don't know" when uncertain

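A rough usage sketch built from the class names above; `indexer.py` and `retriever.py` are not part of this 50-file view, so the method names, filter format, and result attributes are assumptions:

```python
# Hypothetical sketch - DocumentIndexer/DocumentRetriever are real module
# names from the tree above; everything else is illustrative.
from src.rag.indexer import DocumentIndexer
from src.rag.retriever import DocumentRetriever

indexer = DocumentIndexer()
indexer.index("Dataset/IBM N_A.pdf")

retriever = DocumentRetriever()
# Metadata filtering narrows the search to a single document, as described above.
hits = retriever.retrieve("payment terms", top_k=5, filters={"document_id": "ibm-na"})
for hit in hits:
    print(hit.text[:80], hit.metadata.get("page"))
```
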
### 1.4 Multi-Agent System (`src/agents/`)

**Agents Implemented:**

| Agent | Purpose | Model |
|-------|---------|-------|
| `ExecutorAgent` | Task execution with tools | llama3.1:8b |
| `DocumentAgent` | ReAct-style document analysis | llama3.1:8b |
| `PlannerAgent` | Task decomposition | mistral |
| `CriticAgent` | Output validation | phi3 |
| `MemoryAgent` | Context management | llama3.2 |
| `VisionOCRAgent` | Vision-based OCR | llava (optional) |

### 1.5 CLI Commands

```bash
# Document Intelligence
sparknet docint parse document.pdf -o result.json
sparknet docint extract invoice.pdf --preset invoice
sparknet docint ask document.pdf "What is the total?"
sparknet docint classify document.pdf

# RAG Operations
sparknet docint index document.pdf                # Index into vector store
sparknet docint index-stats                       # Show index statistics
sparknet docint retrieve "payment terms" -k 10    # Semantic search
sparknet docint ask doc.pdf "question" --use-rag  # RAG-powered Q&A

# Legacy Document Commands
sparknet document parse invoice.pdf
sparknet document extract contract.pdf -f "party_name"
sparknet rag index *.pdf --collection my_docs
sparknet rag search "query" --top 10
```

---

## 2. How to Execute SPARKNET

### 2.1 Prerequisites

```bash
# 1. System Requirements
# - Python 3.10+
# - NVIDIA GPU with CUDA 12.0+ (optional but recommended)
# - 16GB+ RAM
# - 50GB+ disk space

# 2. Install Ollama (if not installed)
curl -fsSL https://ollama.com/install.sh | sh

# 3. Start Ollama server
ollama serve
```

### 2.2 Installation

```bash
cd /home/mhamdan/SPARKNET

# Option A: Use existing virtual environment
source sparknet/bin/activate

# Option B: Create new environment
python3 -m venv sparknet
source sparknet/bin/activate

# Install dependencies
pip install -r requirements.txt
pip install -r demo/requirements.txt

# Install SPARKNET in development mode
pip install -e .
```

### 2.3 Download Required Models

```bash
# Embedding model (required for RAG)
ollama pull nomic-embed-text:latest

# LLM models (at least one required)
ollama pull llama3.2:latest   # Fast, 2GB
ollama pull llama3.1:8b       # General purpose, 5GB
ollama pull mistral:latest    # Good reasoning, 4GB

# Optional: Larger models for complex tasks
ollama pull qwen2.5:14b       # Complex reasoning, 9GB
```

### 2.4 Running the Demo UI

**Method 1: Using the launcher script**
```bash
cd /home/mhamdan/SPARKNET
./run_demo.sh 8501
```

**Method 2: Direct Streamlit command**
```bash
cd /home/mhamdan/SPARKNET
source sparknet/bin/activate
streamlit run demo/app.py --server.port 8501
```

**Method 3: Bind to specific IP (for remote access)**
```bash
streamlit run demo/app.py \
  --server.address 172.24.50.21 \
  --server.port 8501 \
  --server.headless true
```

**Access at:** http://172.24.50.21:8501 or http://localhost:8501

### 2.5 Running the API Server

```bash
cd /home/mhamdan/SPARKNET
source sparknet/bin/activate
uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload
```

**API Endpoints:**
- `GET /health` - Health check
- `POST /api/documents/parse` - Parse document
- `POST /api/documents/extract` - Extract fields
- `POST /api/rag/index` - Index document
- `POST /api/rag/query` - Query RAG

### 2.6 Running Examples

```bash
cd /home/mhamdan/SPARKNET
source sparknet/bin/activate

# Document Intelligence Demo
python examples/document_intelligence_demo.py

# RAG End-to-End Pipeline
python examples/document_rag_end_to_end.py

# Simple Agent Task
python examples/simple_task.py

# Document Agent
python examples/document_agent.py
```

### 2.7 Running Tests

```bash
cd /home/mhamdan/SPARKNET
source sparknet/bin/activate

# Run all tests
pytest tests/ -v

# Run specific test suites
pytest tests/unit/test_document_intelligence.py -v
pytest tests/unit/test_rag_integration.py -v

# Run with coverage
pytest tests/ --cov=src --cov-report=html
```

---

## 3. Configuration

### 3.1 RAG Configuration (`configs/rag.yaml`)

```yaml
vector_store:
  type: chroma
  chroma:
    persist_directory: "./.sparknet/chroma_db"
    collection_name: "sparknet_documents"
    distance_metric: cosine

embeddings:
  provider: ollama   # Local-first
  ollama:
    model: nomic-embed-text
    base_url: "http://localhost:11434"
  openai:
    enabled: false   # Disabled by default

generator:
  provider: ollama
  ollama:
    model: llama3.2
  abstain_on_low_confidence: true
  abstain_threshold: 0.3
```

### 3.2 Document Configuration (`config/document.yaml`)

```yaml
ocr:
  engine: paddleocr   # or tesseract
  languages: ["en"]
  confidence_threshold: 0.5

layout:
  enabled: true
  reading_order: true

chunking:
  min_chunk_chars: 10
  max_chunk_chars: 4000
  target_chunk_chars: 500
```

---

## 4. FAANG Best Practices Applied

### 4.1 Google-Inspired Patterns
- **DocAI Architecture**: Modular vision-first document understanding
- **Structured Output**: Schema-driven extraction with validation
- **Abstention Policy**: Never hallucinate, return "I don't know"

### 4.2 Meta-Inspired Patterns
- **FAISS Integration**: Fast similarity search (optional alongside ChromaDB)
- **RAG Pipeline**: Retrieve-then-generate with citations

### 4.3 Amazon-Inspired Patterns
- **Textract-like API**: Structured field extraction with confidence scores
- **Evidence Grounding**: Every output traceable to source

### 4.4 Microsoft-Inspired Patterns
- **Form Recognizer Pattern**: Pre-built schemas for invoices, contracts
- **Confidence Thresholds**: Configurable abstention levels

### 4.5 Apple-Inspired Patterns
- **Privacy-First**: All processing local by default
- **Opt-In Cloud**: OpenAI and cloud services disabled by default

---

## 5. Quick Start Commands

```bash
# === SETUP ===
cd /home/mhamdan/SPARKNET
source sparknet/bin/activate
ollama serve &   # Start in background

# === DEMO UI ===
streamlit run demo/app.py --server.port 8501

# === CLI USAGE ===
# Parse a document
python -m src.cli.main docint parse Dataset/IBM*.pdf -o result.json

# Index for RAG
python -m src.cli.main docint index Dataset/*.pdf

# Ask questions with RAG
python -m src.cli.main docint ask Dataset/IBM*.pdf "What is this document about?" --use-rag

# === PYTHON API ===
python -c "
from src.document_intelligence import DocumentParser
parser = DocumentParser()
result = parser.parse('Dataset/IBM N_A.pdf')
print(f'Parsed {len(result.chunks)} chunks')
"

# === RUN TESTS ===
pytest tests/unit/ -v
```

---

## 6. Troubleshooting

### Issue: Ollama not running
```bash
# Check status
curl http://localhost:11434/api/tags

# Start Ollama
ollama serve

# If port in use
pkill ollama && ollama serve
```

### Issue: Missing models
```bash
ollama list                    # See installed models
ollama pull nomic-embed-text   # Install embedding model
ollama pull llama3.2           # Install LLM
```

### Issue: ChromaDB errors
```bash
# Reset vector store
rm -rf .sparknet/chroma_db
```

### Issue: Import errors
```bash
# Ensure in correct directory
cd /home/mhamdan/SPARKNET

# Ensure venv activated
source sparknet/bin/activate

# Reinstall
pip install -e .
```

---

## 7. Architecture Diagram

```
┌─────────────────────────────────────────────────────────────────┐
│                        SPARKNET Platform                        │
├─────────────────────────────────────────────────────────────────┤
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐              │
│  │  Streamlit  │  │   FastAPI   │  │     CLI     │  Interfaces  │
│  │    Demo     │  │     API     │  │  Commands   │              │
│  └──────┬──────┘  └──────┬──────┘  └──────┬──────┘              │
├─────────┴────────────────┴────────────────┴─────────────────────┤
│                                                                 │
│  ┌──────────────────────────────────────────────────────────┐  │
│  │                       Agent Layer                        │  │
│  │  ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐  │  │
│  │  │ Document │  │ Executor │  │ Planner  │  │  Critic  │  │  │
│  │  │  Agent   │  │  Agent   │  │  Agent   │  │  Agent   │  │  │
│  │  └────┬─────┘  └────┬─────┘  └────┬─────┘  └────┬─────┘  │  │
│  └───────┴────────────┴────────────┴────────────┴───────────┘  │
│                                                                 │
│  ┌────────────────────┐  ┌─────────────────────────────────┐   │
│  │   Document Intel   │  │          RAG Subsystem          │   │
│  │ ┌───────┐ ┌──────┐ │  │ ┌─────────┐ ┌─────────────────┐ │   │
│  │ │Parser │ │Extract│ │  │ │Indexer  │ │   Retriever     │ │   │
│  │ └───────┘ └──────┘ │  │ └─────────┘ └─────────────────┘ │   │
│  │ ┌───────┐ ┌──────┐ │  │ ┌─────────┐ ┌─────────────────┐ │   │
│  │ │Ground │ │Valid │ │  │ │Embedder │ │   Generator     │ │   │
│  │ └───────┘ └──────┘ │  │ └─────────┘ └─────────────────┘ │   │
│  └────────────────────┘  └─────────────────────────────────┘   │
│                                                                 │
│  ┌─────────────────────────────────────────────────────────┐   │
│  │                     Infrastructure                      │   │
│  │  ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐ │   │
│  │  │  Ollama  │  │ ChromaDB │  │   GPU    │  │  Cache   │ │   │
│  │  │  Client  │  │  Store   │  │ Manager  │  │  Layer   │ │   │
│  │  └──────────┘  └──────────┘  └──────────┘  └──────────┘ │   │
│  └─────────────────────────────────────────────────────────┘   │
└─────────────────────────────────────────────────────────────────┘
```

---

## 8. Files Modified/Created in Recent Session

| File | Action | Description |
|------|--------|-------------|
| `src/rag/docint_bridge.py` | Created | Bridge between document_intelligence and RAG |
| `src/document_intelligence/tools/rag_tools.py` | Created | RAG tools for agents |
| `src/document_intelligence/tools/__init__.py` | Modified | Added RAG tool exports |
| `src/document_intelligence/tools/document_tools.py` | Modified | Enhanced AnswerQuestionTool with RAG |
| `src/cli/docint.py` | Modified | Added index, retrieve, delete-index commands |
| `src/rag/__init__.py` | Modified | Added bridge exports |
| `configs/rag.yaml` | Created | RAG configuration file |
| `tests/unit/test_rag_integration.py` | Created | RAG integration tests |
| `examples/document_rag_end_to_end.py` | Created | End-to-end RAG example |

---

**Report Complete**

For questions or issues, refer to the troubleshooting section above or check the test files for usage examples.
api/auth.py
ADDED
@@ -0,0 +1,320 @@
"""
SPARKNET Authentication Module
JWT-based authentication with OAuth2 support.
"""

from fastapi import Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
from jose import JWTError, jwt
from passlib.context import CryptContext
from pydantic import BaseModel
from datetime import datetime, timedelta
from typing import Optional, List
from pathlib import Path
import os
import json
import uuid

# Configuration (use environment variables in production)
SECRET_KEY = os.getenv("SPARKNET_SECRET_KEY", "sparknet-super-secret-key-change-in-production")
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 30

# Password hashing
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")

# OAuth2 scheme
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="api/auth/token", auto_error=False)

# Simple file-based user store (replace with database in production)
USERS_FILE = Path(__file__).parent.parent / "data" / "users.json"
USERS_FILE.parent.mkdir(parents=True, exist_ok=True)


class User(BaseModel):
    """User model."""
    user_id: str
    username: str
    email: str
    hashed_password: str
    is_active: bool = True
    is_admin: bool = False
    scopes: List[str] = []
    created_at: Optional[datetime] = None

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat() if v else None
        }


class UserInDB(User):
    """User model with password hash."""
    pass


class TokenData(BaseModel):
    """JWT token payload."""
    username: Optional[str] = None
    user_id: Optional[str] = None
    scopes: List[str] = []


def _load_users() -> dict:
    """Load users from file."""
    if USERS_FILE.exists():
        try:
            with open(USERS_FILE) as f:
                data = json.load(f)
            return {u["username"]: User(**u) for u in data}
        except Exception:
            pass
    return {}


def _save_users(users: dict):
    """Save users to file."""
    with open(USERS_FILE, "w") as f:
        json.dump([u.dict() for u in users.values()], f, default=str, indent=2)


def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Verify a password against its hash."""
    return pwd_context.verify(plain_password, hashed_password)


def get_password_hash(password: str) -> str:
    """Hash a password."""
    return pwd_context.hash(password)


def get_user(username: str) -> Optional[UserInDB]:
    """Get a user by username."""
    users = _load_users()
    if username in users:
        return UserInDB(**users[username].dict())
    return None


def authenticate_user(username: str, password: str) -> Optional[UserInDB]:
    """Authenticate a user."""
    user = get_user(username)
    if not user:
        return None
    if not verify_password(password, user.hashed_password):
        return None
    return user


def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
    """Create a JWT access token."""
    to_encode = data.copy()
    if expires_delta:
        expire = datetime.utcnow() + expires_delta
    else:
        expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    to_encode.update({"exp": expire})
    encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
    return encoded_jwt


async def get_current_user(token: str = Depends(oauth2_scheme)) -> Optional[UserInDB]:
    """Get the current user from JWT token."""
    if not token:
        return None

    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        username: str = payload.get("sub")
        if username is None:
            return None
        token_data = TokenData(
            username=username,
            user_id=payload.get("user_id"),
            scopes=payload.get("scopes", [])
        )
    except JWTError:
        return None

    user = get_user(token_data.username)
    return user


async def get_current_active_user(
    current_user: Optional[UserInDB] = Depends(get_current_user)
) -> Optional[UserInDB]:
    """Get current active user (authentication optional)."""
    if current_user and not current_user.is_active:
        return None
    return current_user


async def require_auth(
    current_user: Optional[UserInDB] = Depends(get_current_user)
) -> UserInDB:
    """Require authentication (raises exception if not authenticated)."""
    credentials_exception = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )
    if not current_user:
        raise credentials_exception
    if not current_user.is_active:
        raise HTTPException(status_code=400, detail="Inactive user")
    return current_user


async def require_admin(
    current_user: UserInDB = Depends(require_auth)
) -> UserInDB:
    """Require admin privileges."""
    if not current_user.is_admin:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Admin privileges required"
        )
    return current_user


def create_user(username: str, email: str, password: str, is_admin: bool = False) -> User:
    """Create a new user."""
    users = _load_users()

    if username in users:
        raise ValueError(f"User {username} already exists")

    user = User(
        user_id=str(uuid.uuid4()),
        username=username,
        email=email,
        hashed_password=get_password_hash(password),
        is_active=True,
        is_admin=is_admin,
        scopes=["read", "write"] if not is_admin else ["read", "write", "admin"],
        created_at=datetime.now()
    )

    users[username] = user
    _save_users(users)
    return user


def delete_user(username: str) -> bool:
    """Delete a user."""
    users = _load_users()
    if username in users:
        del users[username]
        _save_users(users)
        return True
    return False


# Initialize default admin user if none exists
def init_default_admin():
    """Create default admin user if no users exist."""
    users = _load_users()
    if not users:
        try:
            create_user(
                username="admin",
                email="admin@sparknet.local",
                password="admin123",  # Change in production!
                is_admin=True
            )
            print("Default admin user created: admin / admin123")
        except Exception as e:
            print(f"Could not create default admin: {e}")


# Auth routes
from fastapi import APIRouter

auth_router = APIRouter()


@auth_router.post("/token")
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
    """OAuth2 compatible token login."""
    user = authenticate_user(form_data.username, form_data.password)
    if not user:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )
    access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    access_token = create_access_token(
        data={
            "sub": user.username,
            "user_id": user.user_id,
            "scopes": user.scopes
        },
        expires_delta=access_token_expires
    )
    return {
        "access_token": access_token,
        "token_type": "bearer",
        "expires_in": ACCESS_TOKEN_EXPIRE_MINUTES * 60
    }


@auth_router.post("/register")
async def register_user(
    username: str,
    email: str,
    password: str,
):
    """Register a new user."""
    try:
        user = create_user(username, email, password)
        return {
            "user_id": user.user_id,
            "username": user.username,
            "email": user.email,
            "message": "User created successfully"
        }
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))


@auth_router.get("/me")
async def read_users_me(current_user: UserInDB = Depends(require_auth)):
    """Get current user information."""
    return {
        "user_id": current_user.user_id,
        "username": current_user.username,
        "email": current_user.email,
        "is_active": current_user.is_active,
        "is_admin": current_user.is_admin,
        "scopes": current_user.scopes
    }


@auth_router.get("/users")
async def list_users(current_user: UserInDB = Depends(require_admin)):
    """List all users (admin only)."""
    users = _load_users()
    return [
        {
            "user_id": u.user_id,
            "username": u.username,
            "email": u.email,
            "is_active": u.is_active,
            "is_admin": u.is_admin
        }
        for u in users.values()
    ]


@auth_router.delete("/users/{username}")
async def delete_user_endpoint(
    username: str,
    current_user: UserInDB = Depends(require_admin)
):
    """Delete a user (admin only)."""
    if username == current_user.username:
        raise HTTPException(status_code=400, detail="Cannot delete yourself")
    if delete_user(username):
        return {"status": "deleted", "username": username}
    raise HTTPException(status_code=404, detail=f"User not found: {username}")
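Usage note: a brief sketch of how routes elsewhere in the API can consume the dependencies defined above; the endpoint shown is hypothetical, not one of the routes in this commit:

```python
# Hypothetical consumer of api/auth.py - require_auth raises 401 before the
# handler body runs if no valid bearer token is presented.
from fastapi import APIRouter, Depends

from api.auth import UserInDB, require_auth

router = APIRouter()


@router.get("/example/protected")
async def protected_example(current_user: UserInDB = Depends(require_auth)):
    return {"hello": current_user.username}
```
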
api/routes/documents.py
ADDED
@@ -0,0 +1,553 @@
"""
SPARKNET Document API Routes
Endpoints for document upload, processing, and management.
"""

from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Depends, BackgroundTasks
from fastapi.responses import StreamingResponse
from typing import List, Optional
from pathlib import Path
from datetime import datetime
import hashlib
import shutil
import uuid
import io
import sys

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from api.schemas import (
    DocumentUploadResponse, DocumentResponse, DocumentMetadata,
    DocumentDetailResponse, ChunksResponse, ChunkInfo,
    OCRRegionInfo, LayoutRegionInfo, DocumentStatus,
    IndexRequest, IndexResponse, BatchIndexRequest, BatchIndexResponse
)
from loguru import logger

router = APIRouter()

# In-memory document store (replace with database in production)
_documents = {}
_processing_tasks = {}

# Supported file types
SUPPORTED_EXTENSIONS = {
    '.pdf': 'application/pdf',
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.tiff': 'image/tiff',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    '.txt': 'text/plain',
    '.md': 'text/markdown',
}

UPLOAD_DIR = PROJECT_ROOT / "uploads" / "documents"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)


def generate_doc_id(filename: str, content: bytes) -> str:
    """Generate unique document ID from filename and content hash."""
    content_hash = hashlib.md5(content[:4096]).hexdigest()[:8]
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return f"doc_{timestamp}_{content_hash}"

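# Aside (illustrative, not part of this module): IDs come out as
# doc_<YYYYmmddHHMMSS>_<8 hex chars>, e.g. "doc_20250101120000_9a4c1f2b".
# Note that `filename` is accepted but unused, and only the first 4 KB of
# content are hashed, so two files sharing a 4 KB prefix uploaded within
# the same second would collide on the same ID.
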
async def process_document_task(doc_id: str, file_path: Path, file_type: str):
    """Background task to process a document."""
    try:
        logger.info(f"Processing document: {doc_id}")
        _documents[doc_id]["status"] = DocumentStatus.PROCESSING

        # Try to use actual document processor
        try:
            from src.document.pipeline.processor import DocumentProcessor, PipelineConfig

            config = PipelineConfig(
                ocr_enabled=True,
                layout_enabled=True,
                chunking_enabled=True,
            )
            processor = DocumentProcessor(config)
            result = processor.process(str(file_path))

            # Extract data from result
            chunks = []
            for i, chunk in enumerate(getattr(result, 'chunks', [])):
                chunks.append({
                    "chunk_id": f"{doc_id}_chunk_{i}",
                    "doc_id": doc_id,
                    "text": getattr(chunk, 'text', str(chunk)),
                    "chunk_type": getattr(chunk, 'chunk_type', 'text'),
                    "page_num": getattr(chunk, 'page', 0),
                    "confidence": getattr(chunk, 'confidence', 1.0),
                    "bbox": getattr(chunk, 'bbox', None),
                })

            _documents[doc_id].update({
                "status": DocumentStatus.COMPLETED,
                "raw_text": getattr(result, 'raw_text', ''),
                "chunks": chunks,
                "page_count": getattr(result, 'page_count', 1),
                "ocr_regions": getattr(result, 'ocr_regions', []),
                "layout_regions": getattr(result, 'layout_regions', []),
                "processing_time": getattr(result, 'processing_time', 0.0),
                "updated_at": datetime.now(),
            })

            logger.success(f"Document {doc_id} processed successfully: {len(chunks)} chunks")

        except Exception as proc_error:
            logger.warning(f"Full processor unavailable: {proc_error}, using fallback")
            # Fallback: simple text extraction
            raw_text = ""
            page_count = 1  # Default so file types with no extractor branch below (e.g. images) don't hit a NameError later

            if file_type in ['.pdf']:
                try:
                    import fitz
                    doc = fitz.open(str(file_path))
                    for page in doc:
                        raw_text += page.get_text() + "\n"
                    page_count = len(doc)
                    doc.close()
                except Exception as e:
                    logger.error(f"PDF extraction failed: {e}")
                    page_count = 1

            elif file_type in ['.txt', '.md']:
                raw_text = file_path.read_text(errors='ignore')
                page_count = 1

            elif file_type == '.docx':
                try:
                    from docx import Document
                    doc = Document(str(file_path))
                    raw_text = "\n".join([p.text for p in doc.paragraphs])
                    page_count = max(1, len(raw_text) // 3000)
                except Exception as e:
                    logger.error(f"DOCX extraction failed: {e}")
                    page_count = 1

            elif file_type == '.xlsx':
                try:
                    import pandas as pd
                    df_dict = pd.read_excel(str(file_path), sheet_name=None)
                    for sheet_name, df in df_dict.items():
                        raw_text += f"\n=== Sheet: {sheet_name} ===\n"
                        raw_text += df.to_string() + "\n"
                    page_count = len(df_dict)
                except Exception as e:
                    logger.error(f"XLSX extraction failed: {e}")
                    page_count = 1

            elif file_type == '.pptx':
                try:
                    from pptx import Presentation
                    prs = Presentation(str(file_path))
                    for i, slide in enumerate(prs.slides):
                        raw_text += f"\n=== Slide {i+1} ===\n"
                        for shape in slide.shapes:
                            if hasattr(shape, "text"):
                                raw_text += shape.text + "\n"
                    page_count = len(prs.slides)
                except Exception as e:
                    logger.error(f"PPTX extraction failed: {e}")
                    page_count = 1

            # Create simple chunks
            chunks = []
            chunk_size = 1000
            text_chunks = [raw_text[i:i+chunk_size] for i in range(0, len(raw_text), chunk_size - 100)]
            for i, text in enumerate(text_chunks):
                if text.strip():
                    chunks.append({
                        "chunk_id": f"{doc_id}_chunk_{i}",
                        "doc_id": doc_id,
                        "text": text.strip(),
                        "chunk_type": "text",
                        "page_num": min(i * chunk_size // 3000 + 1, page_count),
                        "confidence": 1.0,
                        "bbox": None,
                    })

            _documents[doc_id].update({
                "status": DocumentStatus.COMPLETED,
                "raw_text": raw_text,
                "chunks": chunks,
                "page_count": page_count,
                "ocr_regions": [],
                "layout_regions": [],
                "processing_time": 0.0,
                "updated_at": datetime.now(),
            })

            logger.info(f"Document {doc_id} processed with fallback: {len(chunks)} chunks")

    except Exception as e:
        logger.error(f"Document processing failed for {doc_id}: {e}")
        _documents[doc_id]["status"] = DocumentStatus.ERROR
        _documents[doc_id]["error"] = str(e)

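# Aside (illustrative, not part of this module): the fallback chunker above
# slides a 1000-character window with a 900-character step, so consecutive
# chunks overlap by 100 characters. For example:
#     text = "x" * 2500
#     [len(text[i:i + 1000]) for i in range(0, len(text), 900)]  # [1000, 1000, 700]
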
@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    auto_process: bool = Query(True, description="Automatically process after upload"),
    auto_index: bool = Query(False, description="Automatically index to RAG after processing"),
):
    """
    Upload a document for processing.

    Supported formats: PDF, PNG, JPG, DOCX, XLSX, PPTX, TXT, MD
    """
    # Validate file extension
    file_ext = Path(file.filename).suffix.lower()
    if file_ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file_ext}. Supported: {list(SUPPORTED_EXTENSIONS.keys())}"
        )

    # Read file content
    content = await file.read()
    if len(content) == 0:
        raise HTTPException(status_code=400, detail="Empty file uploaded")

    # Generate document ID
    doc_id = generate_doc_id(file.filename, content)

    # Save file
    file_path = UPLOAD_DIR / f"{doc_id}{file_ext}"
    with open(file_path, "wb") as f:
        f.write(content)

    # Create document record
    _documents[doc_id] = {
        "doc_id": doc_id,
        "filename": file.filename,
        "file_type": file_ext,
        "file_path": str(file_path),
        "status": DocumentStatus.PENDING,
        "raw_text": "",
        "chunks": [],
        "page_count": 0,
        "ocr_regions": [],
        "layout_regions": [],
        "indexed": False,
        "indexed_chunks": 0,
        "processing_time": None,
        "created_at": datetime.now(),
        "updated_at": None,
        "auto_index": auto_index,
    }

    # Start processing in background
    if auto_process:
        background_tasks.add_task(process_document_task, doc_id, file_path, file_ext)
        status = DocumentStatus.PROCESSING
        message = "Document uploaded and processing started"
    else:
        status = DocumentStatus.PENDING
        message = "Document uploaded successfully. Call /process to begin processing."

    _documents[doc_id]["status"] = status

    return DocumentUploadResponse(
        doc_id=doc_id,
        filename=file.filename,
        status=status,
        message=message,
        created_at=_documents[doc_id]["created_at"]
    )


@router.get("", response_model=List[DocumentMetadata])
async def list_documents(
    status: Optional[DocumentStatus] = Query(None, description="Filter by status"),
    indexed: Optional[bool] = Query(None, description="Filter by indexed status"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """List all documents with optional filtering."""
    docs = list(_documents.values())

    # Apply filters
    if status:
        docs = [d for d in docs if d["status"] == status]
    if indexed is not None:
        docs = [d for d in docs if d.get("indexed", False) == indexed]

    # Apply pagination
    docs = docs[offset:offset + limit]

    return [
        DocumentMetadata(
            doc_id=d["doc_id"],
            filename=d["filename"],
            file_type=d["file_type"],
            page_count=d.get("page_count", 0),
            chunk_count=len(d.get("chunks", [])),
            text_length=len(d.get("raw_text", "")),
            status=d["status"],
            indexed=d.get("indexed", False),
            indexed_chunks=d.get("indexed_chunks", 0),
            processing_time=d.get("processing_time"),
            created_at=d["created_at"],
            updated_at=d.get("updated_at"),
        )
        for d in docs
    ]


@router.get("/{doc_id}", response_model=DocumentResponse)
async def get_document(
    doc_id: str,
    include_text: bool = Query(False, description="Include full raw text"),
):
    """Get document by ID."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]

    return DocumentResponse(
        doc_id=d["doc_id"],
        filename=d["filename"],
        file_type=d["file_type"],
        status=d["status"],
        metadata=DocumentMetadata(
            doc_id=d["doc_id"],
            filename=d["filename"],
            file_type=d["file_type"],
            page_count=d.get("page_count", 0),
            chunk_count=len(d.get("chunks", [])),
            text_length=len(d.get("raw_text", "")),
            status=d["status"],
            indexed=d.get("indexed", False),
            indexed_chunks=d.get("indexed_chunks", 0),
            processing_time=d.get("processing_time"),
            created_at=d["created_at"],
            updated_at=d.get("updated_at"),
        ),
        raw_text=d.get("raw_text") if include_text else None,
        preview=d.get("raw_text", "")[:500] if d.get("raw_text") else None,
    )


@router.get("/{doc_id}/detail", response_model=DocumentDetailResponse)
async def get_document_detail(doc_id: str):
    """Get detailed document information including chunks and regions."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]

    return DocumentDetailResponse(
        doc_id=d["doc_id"],
        filename=d["filename"],
        status=d["status"],
        metadata=DocumentMetadata(
            doc_id=d["doc_id"],
            filename=d["filename"],
            file_type=d["file_type"],
            page_count=d.get("page_count", 0),
            chunk_count=len(d.get("chunks", [])),
            text_length=len(d.get("raw_text", "")),
            status=d["status"],
            indexed=d.get("indexed", False),
            indexed_chunks=d.get("indexed_chunks", 0),
            processing_time=d.get("processing_time"),
            created_at=d["created_at"],
            updated_at=d.get("updated_at"),
        ),
        chunks=[ChunkInfo(**c) for c in d.get("chunks", [])],
        ocr_regions=[OCRRegionInfo(**r) for r in d.get("ocr_regions", []) if isinstance(r, dict)],
        layout_regions=[LayoutRegionInfo(**r) for r in d.get("layout_regions", []) if isinstance(r, dict)],
    )


@router.get("/{doc_id}/chunks", response_model=ChunksResponse)
async def get_document_chunks(
    doc_id: str,
    page: Optional[int] = Query(None, description="Filter by page number"),
    chunk_type: Optional[str] = Query(None, description="Filter by chunk type"),
):
    """Get all chunks for a document."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]
    chunks = d.get("chunks", [])

    # Apply filters
    if page is not None:
        chunks = [c for c in chunks if c.get("page_num") == page]
    if chunk_type:
        chunks = [c for c in chunks if c.get("chunk_type") == chunk_type]

    return ChunksResponse(
        doc_id=doc_id,
        total_chunks=len(chunks),
        chunks=[ChunkInfo(**c) for c in chunks],
    )


@router.post("/{doc_id}/process")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    force: bool = Query(False, description="Force reprocessing"),
):
    """Trigger document processing."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]

    if d["status"] == DocumentStatus.PROCESSING:
        raise HTTPException(status_code=400, detail="Document is already being processed")

    if d["status"] == DocumentStatus.COMPLETED and not force:
        raise HTTPException(
            status_code=400,
            detail="Document already processed. Use force=true to reprocess."
        )

    file_path = Path(d["file_path"])
    if not file_path.exists():
        raise HTTPException(status_code=404, detail="Document file not found")

    background_tasks.add_task(process_document_task, doc_id, file_path, d["file_type"])
    _documents[doc_id]["status"] = DocumentStatus.PROCESSING

    return {"doc_id": doc_id, "status": "processing", "message": "Processing started"}


@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]

    # Delete file
    file_path = Path(d["file_path"])
    if file_path.exists():
        file_path.unlink()

    # Remove from store
    del _documents[doc_id]

    return {"doc_id": doc_id, "status": "deleted", "message": "Document deleted successfully"}


@router.post("/{doc_id}/index", response_model=IndexResponse)
async def index_document(doc_id: str, force_reindex: bool = Query(False)):
    """Index a document to the RAG vector store."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]

    if d["status"] != DocumentStatus.COMPLETED:
        raise HTTPException(
            status_code=400,
            detail=f"Document not ready for indexing. Current status: {d['status']}"
        )

    if d.get("indexed") and not force_reindex:
        return IndexResponse(
            doc_id=doc_id,
            status="already_indexed",
            chunks_indexed=d.get("indexed_chunks", 0),
            message="Document already indexed. Use force_reindex=true to reindex."
        )

    try:
        # Try to use actual indexer
        from src.rag.indexer import DocumentIndexer
        from src.rag.embeddings import get_embedding_model
        from src.rag.store import get_vector_store

        embeddings = get_embedding_model()
        store = get_vector_store()
        indexer = DocumentIndexer(embeddings, store)

        # Index chunks
        chunks_to_index = d.get("chunks", [])
        indexed_count = 0

        for chunk in chunks_to_index:
            try:
                indexer.index_chunk(
                    text=chunk["text"],
                    document_id=doc_id,
                    chunk_id=chunk["chunk_id"],
                    metadata={
                        "filename": d["filename"],
                        "page_num": chunk.get("page_num"),
                        "chunk_type": chunk.get("chunk_type", "text"),
                    }
                )
                indexed_count += 1
            except Exception as e:
                logger.warning(f"Failed to index chunk {chunk['chunk_id']}: {e}")

        _documents[doc_id]["indexed"] = True
        _documents[doc_id]["indexed_chunks"] = indexed_count
        _documents[doc_id]["status"] = DocumentStatus.INDEXED

        return IndexResponse(
            doc_id=doc_id,
            status="indexed",
            chunks_indexed=indexed_count,
            message=f"Successfully indexed {indexed_count} chunks"
        )

    except Exception as e:
        logger.error(f"Indexing failed for {doc_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Indexing failed: {str(e)}")


@router.post("/batch-index", response_model=BatchIndexResponse)
async def batch_index_documents(request: BatchIndexRequest):
    """Batch index multiple documents."""
    results = []
    successful = 0
    failed = 0

    for doc_id in request.doc_ids:
        try:
            result = await index_document(doc_id, request.force_reindex)
            results.append(result)
            if result.status in ["indexed", "already_indexed"]:
                successful += 1
            else:
                failed += 1
        except HTTPException as e:
            results.append(IndexResponse(
                doc_id=doc_id,
                status="error",
                chunks_indexed=0,
                message=e.detail
            ))
            failed += 1

    return BatchIndexResponse(
        total_requested=len(request.doc_ids),
        successful=successful,
        failed=failed,
        results=results
    )


# Export document store for other modules
def get_document_store():
    """Get the in-memory document store."""
    return _documents
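
Taken together, the endpoints above support a simple upload, poll, index loop. A minimal client sketch follows; it is illustrative only, and the host and the /documents mount prefix are assumptions about how the router is wired into the app.

import time
import requests

BASE = "http://localhost:8000"  # assumed host; prefix depends on app wiring

# Upload, wait for the background task to finish, then index to the vector store.
with open("report.pdf", "rb") as fh:
    doc = requests.post(f"{BASE}/documents/upload", files={"file": fh}).json()

doc_id = doc["doc_id"]
while requests.get(f"{BASE}/documents/{doc_id}").json()["status"] == "processing":
    time.sleep(1)

print(requests.post(f"{BASE}/documents/{doc_id}/index").json()["message"])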

api/routes/rag.py
ADDED
@@ -0,0 +1,415 @@
"""
SPARKNET RAG API Routes
Endpoints for RAG queries, search, and indexing management.
"""

from fastapi import APIRouter, HTTPException, Query, Depends
from fastapi.responses import StreamingResponse
from typing import List, Optional
from pathlib import Path
from datetime import datetime
import time
import json
import sys
import asyncio

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from api.schemas import (
    QueryRequest, RAGResponse, Citation, QueryPlan, QueryIntentType,
    SearchRequest, SearchResponse, SearchResult,
    StoreStatus, CollectionInfo
)
from loguru import logger

router = APIRouter()

# Simple in-memory cache for query results
_query_cache = {}
CACHE_TTL_SECONDS = 3600  # 1 hour


def get_cache_key(query: str, doc_ids: Optional[List[str]]) -> str:
    """Generate cache key for query."""
    import hashlib
    doc_str = ",".join(sorted(doc_ids)) if doc_ids else "all"
    content = f"{query}:{doc_str}"
    return hashlib.md5(content.encode()).hexdigest()


def get_cached_response(cache_key: str) -> Optional[RAGResponse]:
    """Get cached response if valid."""
    if cache_key in _query_cache:
        cached = _query_cache[cache_key]
        if time.time() - cached["timestamp"] < CACHE_TTL_SECONDS:
            response = cached["response"]
            response.from_cache = True
            return response
        else:
            del _query_cache[cache_key]
    return None


def cache_response(cache_key: str, response: RAGResponse):
    """Cache a query response."""
    _query_cache[cache_key] = {
        "response": response,
        "timestamp": time.time()
    }
    # Limit cache size
    if len(_query_cache) > 1000:
        oldest_key = min(_query_cache, key=lambda k: _query_cache[k]["timestamp"])
        del _query_cache[oldest_key]

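# Aside (illustrative, not part of this module): the cache key depends only on
# the query text and the *sorted* doc-id filter, so
# get_cache_key("q", ["b", "a"]) == get_cache_key("q", ["a", "b"]), and an
# unfiltered query hashes with the sentinel "all". Note the key ignores top_k
# and answer_format, so two requests differing only in those fields can serve
# each other's cached answers.
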
def _get_rag_system():
    """Get or initialize the RAG system."""
    try:
        from src.rag.agentic.orchestrator import AgenticRAG, RAGConfig

        config = RAGConfig(
            model_name="llama3.2:latest",
            max_revision_attempts=2,
            retrieval_top_k=10,
            final_top_k=5,
            min_confidence=0.5,
        )
        return AgenticRAG(config)
    except Exception as e:
        logger.error(f"Failed to initialize RAG system: {e}")
        return None


@router.post("/query", response_model=RAGResponse)
async def query_documents(request: QueryRequest):
    """
    Execute a RAG query across indexed documents.

    The query goes through the 5-agent pipeline:
    1. QueryPlanner - Intent classification and query decomposition
    2. Retriever - Hybrid dense+sparse search
    3. Reranker - Cross-encoder reranking with MMR
    4. Synthesizer - Answer generation with citations
    5. Critic - Hallucination detection and validation
    """
    start_time = time.time()

    # Check cache if enabled
    if request.use_cache:
        cache_key = get_cache_key(request.query, request.doc_ids)
        cached = get_cached_response(cache_key)
        if cached:
            cached.latency_ms = (time.time() - start_time) * 1000
            return cached

    try:
        # Initialize RAG system
        rag = _get_rag_system()
        if not rag:
            raise HTTPException(status_code=503, detail="RAG system not available")

        # Build filters
        filters = {}
        if request.doc_ids:
            filters["document_id"] = {"$in": request.doc_ids}

        # Execute query
        logger.info(f"Executing RAG query: {request.query[:50]}...")

        result = rag.query(
            query=request.query,
            filters=filters if filters else None,
            top_k=request.top_k,
        )

        # Build response
        citations = []
        for i, source in enumerate(result.get("sources", [])):
            citations.append(Citation(
                citation_id=i + 1,
                doc_id=source.get("document_id", "unknown"),
                document_name=source.get("filename", source.get("document_id", "unknown")),
                chunk_id=source.get("chunk_id", f"chunk_{i}"),
                chunk_text=source.get("text", "")[:300],
                page_num=source.get("page_num"),
                relevance_score=source.get("relevance_score", source.get("score", 0.0)),
                bbox=source.get("bbox"),
            ))

        # Query plan info
        query_plan = None
        if "plan" in result:
            plan = result["plan"]
            query_plan = QueryPlan(
                intent=QueryIntentType(plan.get("intent", "factoid").lower()),
                sub_queries=plan.get("sub_queries", []),
                keywords=plan.get("keywords", []),
                strategy=plan.get("strategy", "hybrid"),
            )

        response = RAGResponse(
            query=request.query,
            answer=result.get("answer", "I could not find an answer to your question."),
            confidence=result.get("confidence", 0.0),
            citations=citations,
            source_count=len(citations),
            query_plan=query_plan,
            from_cache=False,
            validation=result.get("validation"),
            latency_ms=(time.time() - start_time) * 1000,
            revision_count=result.get("revision_count", 0),
        )

        # Cache successful responses
        if request.use_cache and response.confidence >= request.min_confidence:
            cache_key = get_cache_key(request.query, request.doc_ids)
            cache_response(cache_key, response)

        return response

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"RAG query failed: {e}")
        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")

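# Aside (illustrative, not part of this module): {"$in": [...]} is a
# Mongo-style membership filter; whether it is honored depends on the
# metadata-filter dialect of the store backing src.rag.store (Chroma, for
# instance, accepts $in inside "where" clauses). The orchestrator is assumed
# to pass it through unchanged.
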
@router.post("/query/stream")
async def query_documents_stream(request: QueryRequest):
    """
    Stream RAG response for real-time updates.

    Returns Server-Sent Events (SSE) with partial responses.
    """
    async def generate():
        try:
            # Initialize RAG system
            rag = _get_rag_system()
            if not rag:
                yield f"data: {json.dumps({'error': 'RAG system not available'})}\n\n"
                return

            # Send planning stage
            yield f"data: {json.dumps({'stage': 'planning', 'message': 'Analyzing query...'})}\n\n"
            await asyncio.sleep(0.1)

            # Build filters
            filters = {}
            if request.doc_ids:
                filters["document_id"] = {"$in": request.doc_ids}

            # Send retrieval stage
            yield f"data: {json.dumps({'stage': 'retrieving', 'message': 'Searching documents...'})}\n\n"

            # Execute query (in chunks if streaming supported)
            result = rag.query(
                query=request.query,
                filters=filters if filters else None,
                top_k=request.top_k,
            )

            # Send sources
            yield f"data: {json.dumps({'stage': 'sources', 'count': len(result.get('sources', []))})}\n\n"

            # Send synthesis stage
            yield f"data: {json.dumps({'stage': 'synthesizing', 'message': 'Generating answer...'})}\n\n"

            # Stream answer in chunks
            answer = result.get("answer", "")
            chunk_size = 50
            for i in range(0, len(answer), chunk_size):
                chunk = answer[i:i+chunk_size]
                yield f"data: {json.dumps({'stage': 'answer', 'chunk': chunk})}\n\n"
                await asyncio.sleep(0.02)

            # Send final result
            citations = []
            for i, source in enumerate(result.get("sources", [])):
                citations.append({
                    "citation_id": i + 1,
                    "doc_id": source.get("document_id", "unknown"),
                    "chunk_text": source.get("text", "")[:200],
                    "relevance_score": source.get("score", 0.0),
                })

            final = {
                "stage": "complete",
                "confidence": result.get("confidence", 0.0),
                "citations": citations,
                "validation": result.get("validation"),
            }
            yield f"data: {json.dumps(final)}\n\n"

        except Exception as e:
            logger.error(f"Streaming query failed: {e}")
            yield f"data: {json.dumps({'error': str(e)})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
    )


@router.post("/search", response_model=SearchResponse)
async def search_documents(request: SearchRequest):
    """
    Semantic search across indexed documents.

    Returns matching chunks without answer synthesis.
    """
    start_time = time.time()

    try:
        from src.rag.store import get_vector_store
        from src.rag.embeddings import get_embedding_model

        store = get_vector_store()
        embeddings = get_embedding_model()

        # Generate query embedding
        query_embedding = embeddings.embed_query(request.query)

        # Build filter
        where_filter = None
        if request.doc_ids:
            where_filter = {"document_id": {"$in": request.doc_ids}}

        # Search
        results = store.similarity_search_with_score(
            query_embedding=query_embedding,
            k=request.top_k,
            where=where_filter,
        )

        # Filter by minimum score
        search_results = []
        for doc, score in results:
            if score >= request.min_score:
                search_results.append(SearchResult(
                    chunk_id=doc.metadata.get("chunk_id", "unknown"),
                    doc_id=doc.metadata.get("document_id", "unknown"),
                    document_name=doc.metadata.get("filename", "unknown"),
                    text=doc.page_content,
                    score=score,
                    page_num=doc.metadata.get("page_num"),
                    chunk_type=doc.metadata.get("chunk_type", "text"),
                ))

        return SearchResponse(
            query=request.query,
            total_results=len(search_results),
            results=search_results,
            latency_ms=(time.time() - start_time) * 1000,
        )

    except Exception as e:
        logger.error(f"Search failed: {e}")
        # Fallback: return empty results
        return SearchResponse(
            query=request.query,
            total_results=0,
            results=[],
            latency_ms=(time.time() - start_time) * 1000,
        )


@router.get("/store/status", response_model=StoreStatus)
async def get_store_status():
    """Get vector store status and statistics."""
    try:
        from src.rag.store import get_vector_store

        store = get_vector_store()

        # Get collection info
        collection = store._collection
        count = collection.count()

        # Get unique documents
        all_metadata = collection.get(include=["metadatas"])
        doc_ids = set()
        for meta in all_metadata.get("metadatas", []):
            if meta and "document_id" in meta:
                doc_ids.add(meta["document_id"])

        collections = [CollectionInfo(
            name=store.collection_name,
            document_count=len(doc_ids),
            chunk_count=count,
            embedding_dimension=store.embedding_dimension if hasattr(store, 'embedding_dimension') else 1024,
        )]

        return StoreStatus(
            status="healthy",
            collections=collections,
            total_documents=len(doc_ids),
            total_chunks=count,
        )

    except Exception as e:
        logger.error(f"Store status check failed: {e}")
        return StoreStatus(
            status="error",
            collections=[],
            total_documents=0,
            total_chunks=0,
        )


@router.delete("/store/collection/{collection_name}")
async def clear_collection(collection_name: str, confirm: bool = Query(False)):
    """Clear a vector store collection (dangerous operation)."""
    if not confirm:
        raise HTTPException(
            status_code=400,
            detail="This operation will delete all data. Set confirm=true to proceed."
        )

    try:
        from src.rag.store import get_vector_store

        store = get_vector_store()
        if store.collection_name != collection_name:
            raise HTTPException(status_code=404, detail=f"Collection not found: {collection_name}")

        # Clear collection
        store._collection.delete(where={})

        return {"status": "cleared", "collection": collection_name, "message": "Collection cleared successfully"}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Collection clear failed: {e}")
        raise HTTPException(status_code=500, detail=f"Clear failed: {str(e)}")


@router.get("/cache/stats")
async def get_cache_stats():
    """Get query cache statistics."""
    current_time = time.time()
    valid_entries = sum(
        1 for v in _query_cache.values()
        if current_time - v["timestamp"] < CACHE_TTL_SECONDS
    )

    return {
        "total_entries": len(_query_cache),
        "valid_entries": valid_entries,
        "expired_entries": len(_query_cache) - valid_entries,
        "ttl_seconds": CACHE_TTL_SECONDS,
    }


@router.delete("/cache")
async def clear_cache():
    """Clear the query cache."""
    count = len(_query_cache)
    _query_cache.clear()
    return {"status": "cleared", "entries_removed": count}
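
End-to-end, the two query endpoints can be exercised as below. This is an illustrative sketch; the host and the /rag mount prefix are assumptions about how the router is mounted.

import json
import requests

BASE = "http://localhost:8000/rag"  # assumed mount point for this router

# Blocking query through the full pipeline.
body = requests.post(f"{BASE}/query", json={"query": "Summarize section 3", "top_k": 5}).json()
print(body["answer"], body["confidence"], len(body["citations"]))

# Streaming variant: consume the SSE events emitted by /query/stream.
with requests.post(f"{BASE}/query/stream", json={"query": "Summarize section 3"}, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line.startswith("data: "):
            event = json.loads(line[6:])
            if event.get("stage") == "answer":
                print(event["chunk"], end="", flush=True)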

api/schemas.py
ADDED
@@ -0,0 +1,302 @@
"""
SPARKNET API Schemas
Pydantic models for request/response validation.
"""

from pydantic import BaseModel, Field, ConfigDict
from typing import List, Dict, Any, Optional
from datetime import datetime
from enum import Enum


# ==================== Enums ====================

class DocumentStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    INDEXED = "indexed"
    ERROR = "error"


class QueryIntentType(str, Enum):
    FACTOID = "factoid"
    COMPARISON = "comparison"
    AGGREGATION = "aggregation"
    CAUSAL = "causal"
    PROCEDURAL = "procedural"
    DEFINITION = "definition"
    LIST = "list"
    MULTI_HOP = "multi_hop"


class AnswerFormat(str, Enum):
    PROSE = "prose"
    BULLET_POINTS = "bullet_points"
    TABLE = "table"
    STEP_BY_STEP = "step_by_step"


# ==================== Document Schemas ====================

class DocumentUploadResponse(BaseModel):
    """Response after uploading a document."""
    model_config = ConfigDict(from_attributes=True)

    doc_id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    status: DocumentStatus = Field(..., description="Document status")
    message: str = Field(..., description="Status message")
    created_at: datetime = Field(default_factory=datetime.now)


class DocumentMetadata(BaseModel):
    """Document metadata information."""
    model_config = ConfigDict(from_attributes=True)

    doc_id: str
    filename: str
    file_type: str
    page_count: int = 0
    chunk_count: int = 0
    text_length: int = 0
    status: DocumentStatus
    indexed: bool = False
    indexed_chunks: int = 0
    processing_time: Optional[float] = None
    created_at: datetime
    updated_at: Optional[datetime] = None


class DocumentResponse(BaseModel):
    """Full document response with metadata."""
    model_config = ConfigDict(from_attributes=True)

    doc_id: str
    filename: str
    file_type: str
    status: DocumentStatus
    metadata: DocumentMetadata
    raw_text: Optional[str] = Field(None, description="Full extracted text (if requested)")
    preview: Optional[str] = Field(None, description="Text preview (first 500 chars)")


class ChunkInfo(BaseModel):
    """Information about a document chunk."""
    model_config = ConfigDict(from_attributes=True)

    chunk_id: str
    doc_id: str
    text: str
    chunk_type: str = "text"
    page_num: Optional[int] = None
    confidence: float = 1.0
    bbox: Optional[Dict[str, float]] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)


class ChunksResponse(BaseModel):
    """Response containing document chunks."""
    doc_id: str
    total_chunks: int
    chunks: List[ChunkInfo]


class OCRRegionInfo(BaseModel):
    """OCR region information."""
    region_id: str
    text: str
    confidence: float
    page_num: int
    bbox: Dict[str, float]


class LayoutRegionInfo(BaseModel):
    """Layout region information."""
    region_id: str
    region_type: str
    confidence: float
    page_num: int
    bbox: Dict[str, float]


class DocumentDetailResponse(BaseModel):
    """Detailed document response with all extracted data."""
    doc_id: str
    filename: str
    status: DocumentStatus
    metadata: DocumentMetadata
    chunks: List[ChunkInfo]
    ocr_regions: List[OCRRegionInfo] = Field(default_factory=list)
    layout_regions: List[LayoutRegionInfo] = Field(default_factory=list)


# ==================== RAG Query Schemas ====================

class QueryRequest(BaseModel):
    """RAG query request."""
    query: str = Field(..., min_length=1, max_length=2000, description="Query text")
    doc_ids: Optional[List[str]] = Field(None, description="Filter by document IDs")
    top_k: int = Field(5, ge=1, le=20, description="Number of chunks to retrieve")
    answer_format: AnswerFormat = Field(AnswerFormat.PROSE, description="Desired answer format")
    include_sources: bool = Field(True, description="Include source citations")
    min_confidence: float = Field(0.5, ge=0.0, le=1.0, description="Minimum confidence threshold")
    use_cache: bool = Field(True, description="Use cached results if available")


class Citation(BaseModel):
    """Citation/source reference."""
    citation_id: int = Field(..., description="Citation number [1], [2], etc.")
    doc_id: str
    document_name: str
    chunk_id: str
    chunk_text: str
    page_num: Optional[int] = None
    relevance_score: float
    bbox: Optional[Dict[str, float]] = None


class QueryPlan(BaseModel):
    """Query planning information."""
    intent: QueryIntentType
    sub_queries: List[str] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    strategy: str = "hybrid"


class RAGResponse(BaseModel):
    """Complete RAG response."""
    query: str
    answer: str
    confidence: float = Field(..., ge=0.0, le=1.0)
    citations: List[Citation] = Field(default_factory=list)
    source_count: int = 0
    query_plan: Optional[QueryPlan] = None
    from_cache: bool = False
    validation: Optional[Dict[str, Any]] = None
    latency_ms: Optional[float] = None
    revision_count: int = 0


class SearchRequest(BaseModel):
    """Semantic search request."""
    query: str = Field(..., min_length=1, max_length=1000)
    doc_ids: Optional[List[str]] = None
    top_k: int = Field(10, ge=1, le=50)
    min_score: float = Field(0.0, ge=0.0, le=1.0)


class SearchResult(BaseModel):
    """Single search result."""
    chunk_id: str
    doc_id: str
    document_name: str
    text: str
    score: float
    page_num: Optional[int] = None
    chunk_type: str = "text"


class SearchResponse(BaseModel):
    """Search response with results."""
    query: str
    total_results: int
    results: List[SearchResult]
    latency_ms: float


# ==================== Indexing Schemas ====================

class IndexRequest(BaseModel):
    """Request to index a document."""
    doc_id: str = Field(..., description="Document ID to index")
    force_reindex: bool = Field(False, description="Force reindexing if already indexed")


class IndexResponse(BaseModel):
    """Indexing response."""
    doc_id: str
    status: str
    chunks_indexed: int
    message: str


class BatchIndexRequest(BaseModel):
    """Batch indexing request."""
    doc_ids: List[str]
    force_reindex: bool = False


class BatchIndexResponse(BaseModel):
    """Batch indexing response."""
    total_requested: int
    successful: int
    failed: int
    results: List[IndexResponse]


# ==================== System Schemas ====================

class HealthResponse(BaseModel):
    """Health check response."""
    status: str = Field(..., description="healthy, degraded, or unhealthy")
    version: str
    components: Dict[str, bool]


class SystemStatus(BaseModel):
    """Detailed system status."""
    status: str
    version: str
    uptime_seconds: float
    components: Dict[str, bool]
    statistics: Dict[str, Any]
    models: Dict[str, str]


class CollectionInfo(BaseModel):
    """Vector store collection information."""
    name: str
    document_count: int
    chunk_count: int
    embedding_dimension: int


class StoreStatus(BaseModel):
    """Vector store status."""
    status: str
    collections: List[CollectionInfo]
    total_documents: int
    total_chunks: int


# ==================== Authentication Schemas ====================

class UserCreate(BaseModel):
    """User creation request."""
    username: str = Field(..., min_length=3, max_length=50)
    email: str
    password: str = Field(..., min_length=8)


class UserResponse(BaseModel):
    """User response (no password)."""
    user_id: str
    username: str
    email: str
    is_active: bool = True
    created_at: datetime


class Token(BaseModel):
    """JWT token response."""
    access_token: str
    token_type: str = "bearer"
    expires_in: int


class TokenData(BaseModel):
    """Token payload data."""
    username: Optional[str] = None
    user_id: Optional[str] = None
    scopes: List[str] = Field(default_factory=list)
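
A quick sketch of how these request models behave at the boundary (illustrative, not part of the commit; assumes api.schemas is importable):

from pydantic import ValidationError
from api.schemas import QueryRequest

# Defaults and constraints declared above are enforced at construction time.
req = QueryRequest(query="compare revenue across quarters")
print(req.top_k, req.answer_format.value)  # 5 prose

try:
    QueryRequest(query="", top_k=99)  # violates min_length=1 and le=20
except ValidationError as exc:
    print(len(exc.errors()))  # 2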

config/document.yaml
ADDED
@@ -0,0 +1,147 @@
# SPARKNET Document Processing Configuration
# ===========================================

# OCR Configuration
ocr:
  # Engine selection: "paddleocr" (default) or "tesseract"
  engine: paddleocr

  # PaddleOCR settings
  paddleocr:
    lang: en
    use_gpu: false
    det_db_thresh: 0.3
    det_db_box_thresh: 0.5
    rec_algorithm: CRNN
    show_log: false

  # Tesseract settings
  tesseract:
    lang: eng
    config: "--psm 3"  # Page segmentation mode
    oem: 3  # OCR Engine mode (LSTM)

  # Preprocessing
  preprocessing:
    deskew: true
    denoise: false
    contrast_enhance: false

# Layout Detection Configuration
layout:
  # Detection method: "rule_based" (default) or "model_based"
  method: rule_based

  # Rule-based settings
  rule_based:
    merge_threshold: 20  # Pixels to merge nearby regions
    column_detection: true
    min_region_area: 100

  # Confidence thresholds
  thresholds:
    text: 0.5
    title: 0.7
    table: 0.6
    figure: 0.6
    list: 0.5

# Reading Order Configuration
reading_order:
  # Reconstruction method: "rule_based" (default)
  method: rule_based

  # Column detection
  column_gap_threshold: 50  # Minimum gap between columns
  reading_direction: ltr  # Left-to-right

  # Line grouping
  line_height_tolerance: 0.5

# Chunking Configuration
chunking:
  # Chunk size limits
  target_size: 512  # Target tokens per chunk
  max_size: 1024  # Maximum tokens per chunk
  min_size: 50  # Minimum tokens per chunk

  # Overlap for context
  overlap_size: 50  # Tokens to overlap between chunks

  # Semantic chunking
  semantic_boundaries: true
  respect_paragraphs: true
  respect_sections: true

# Grounding/Evidence Configuration
grounding:
  # Image cropping for evidence
  include_images: true
  crop_padding: 10  # Pixels around regions
  max_image_size: 512
  image_format: PNG  # PNG or JPEG
  image_quality: 85  # JPEG quality

  # Snippet settings
  max_snippet_length: 200
  include_context: true

# Pipeline Configuration
pipeline:
  # PDF rendering
  render_dpi: 300

  # Caching
  enable_caching: true
  cache_directory: ./data/cache

  # Processing options
  parallel_pages: false
  max_pages: null  # Limit pages (null for all)

  # Output options
  include_ocr_regions: true
  include_layout_regions: true
  generate_full_text: true

# Validation Configuration
validation:
  # Critic settings
  critic:
    confidence_threshold: 0.7
    evidence_required: true
    strict_mode: false
    max_fields_per_request: 10

  # Verifier settings
  verifier:
    fuzzy_match: true
    case_sensitive: false
    min_match_ratio: 0.6
    strong_threshold: 0.9
    moderate_threshold: 0.7
    weak_threshold: 0.5

| 125 |
+
# LLM Configuration for DocumentAgent
|
| 126 |
+
agent:
|
| 127 |
+
# Ollama settings
|
| 128 |
+
ollama_base_url: http://localhost:11434
|
| 129 |
+
default_model: llama3.2:3b
|
| 130 |
+
|
| 131 |
+
# Model routing by complexity
|
| 132 |
+
model_routing:
|
| 133 |
+
simple: llama3.2:1b
|
| 134 |
+
standard: llama3.2:3b
|
| 135 |
+
complex: llama3.1:8b
|
| 136 |
+
analysis: llama3.1:70b # For heavy analysis (optional)
|
| 137 |
+
|
| 138 |
+
# Agent behavior
|
| 139 |
+
max_iterations: 10
|
| 140 |
+
temperature: 0.1
|
| 141 |
+
timeout: 120 # Seconds
|
| 142 |
+
|
| 143 |
+
# Logging Configuration
|
| 144 |
+
logging:
|
| 145 |
+
level: INFO # DEBUG, INFO, WARNING, ERROR
|
| 146 |
+
format: "{time} | {level} | {message}"
|
| 147 |
+
file: null # Log file path (null for stderr only)
|
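As a rough illustration of how this file might be consumed, the sketch below loads it with PyYAML and resolves a model from the `agent.model_routing` table. The `pick_model` helper is hypothetical (not part of SPARKNET's actual API), and the snippet assumes it is run from the project root:

```python
import yaml  # PyYAML


def pick_model(config_path: str, complexity: str) -> str:
    """Resolve an Ollama model name from agent.model_routing,
    falling back to agent.default_model for unknown tiers."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    agent = cfg["agent"]
    return agent["model_routing"].get(complexity, agent["default_model"])


print(pick_model("config/document.yaml", "complex"))  # llama3.1:8b
print(pick_model("config/document.yaml", "unknown"))  # llama3.2:3b (fallback)
```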
config/rag.yaml
ADDED
@@ -0,0 +1,141 @@
# SPARKNET RAG Configuration
# ===========================

# Vector Store Configuration
vector_store:
  # Store type: "chromadb" (default)
  type: chromadb

  # ChromaDB settings
  chromadb:
    persist_directory: ./data/vectorstore
    collection_name: sparknet_documents
    anonymized_telemetry: false

  # Search settings
  default_top_k: 5
  similarity_threshold: 0.7

# Embedding Configuration
embeddings:
  # Adapter type: "ollama" (default) or "openai"
  adapter_type: ollama

  # Ollama settings (local, default)
  ollama:
    base_url: http://localhost:11434
    model: nomic-embed-text  # Options: nomic-embed-text, mxbai-embed-large, all-minilm

  # OpenAI settings (optional, feature-flagged)
  openai:
    enabled: false
    model: text-embedding-3-small  # Options: text-embedding-3-small, text-embedding-3-large
    # api_key: ${OPENAI_API_KEY}  # Use env var

  # Common settings
  batch_size: 32
  timeout: 60

  # Caching
  enable_cache: true
  cache_directory: ./data/embedding_cache

# Indexer Configuration
indexer:
  # Batch processing
  batch_size: 32

  # Metadata to index
  include_bbox: true
  include_page: true
  include_chunk_type: true

  # Filtering
  skip_empty_chunks: true
  min_chunk_length: 10

# Retriever Configuration
retriever:
  # Search parameters
  default_top_k: 5
  similarity_threshold: 0.7
  max_results: 20

  # Reranking (future)
  enable_reranking: false
  rerank_top_k: 10

  # Evidence settings
  include_evidence: true
  evidence_snippet_length: 200

# Generator Configuration
generator:
  # LLM provider: "ollama" (default) or "openai"
  llm_provider: ollama

  # Ollama settings
  ollama:
    base_url: http://localhost:11434
    model: llama3.2:3b  # Options: llama3.2:3b, llama3.1:8b, mistral

  # OpenAI settings (optional)
  openai:
    model: gpt-4o-mini  # Options: gpt-4o-mini, gpt-4o
    # api_key: ${OPENAI_API_KEY}  # Use env var

  # Generation settings
  temperature: 0.1
  max_tokens: 1024
  timeout: 120

  # Citation settings
  require_citations: true
  citation_format: "[{index}]"

  # Abstention settings
  abstain_on_low_confidence: true
  confidence_threshold: 0.6

# Query Processing
query:
  # Query expansion
  expand_queries: false
  max_expansions: 3

  # Hybrid search (future)
  enable_hybrid: false
  keyword_weight: 0.3
  semantic_weight: 0.7

# Metadata Filtering
filters:
  # Supported filter types
  supported:
    - document_id
    - chunk_type
    - page
    - confidence_min

  # Default filters (applied to all queries)
  defaults: {}

# Performance Settings
performance:
  # Connection pooling
  max_connections: 10

  # Timeouts
  embedding_timeout: 60
  search_timeout: 30
  generation_timeout: 120

  # Caching
  query_cache_enabled: true
  query_cache_ttl: 3600  # Seconds

# Logging
logging:
  level: INFO
  include_queries: false  # Log user queries (privacy consideration)
  include_latency: true
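The generator's citation and abstention settings translate into a small amount of glue code at answer time. The following is a sketch only, under the assumption that generated answers carry a confidence score; `render_answer` and its defaults (which mirror `citation_format` and `confidence_threshold` above) are invented for illustration:

```python
def render_answer(answer: str, confidence: float, sources: list,
                  citation_format: str = "[{index}]",
                  confidence_threshold: float = 0.6) -> str:
    """Apply the abstain_on_low_confidence policy, then append citations."""
    if confidence < confidence_threshold:
        return "I don't have enough evidence in the indexed documents to answer."
    citations = " ".join(citation_format.format(index=i + 1)
                         for i in range(len(sources)))
    return f"{answer} {citations}"


print(render_answer("The pledge covers Covered Patents.", 0.82,
                    ["Apple p.1", "Apple p.2"]))  # answer + "[1] [2]"
```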
configs/rag.yaml
ADDED
@@ -0,0 +1,201 @@
# RAG (Retrieval-Augmented Generation) Configuration
# SPARKNET Document Intelligence Integration

# =============================================================================
# Vector Store Settings
# =============================================================================
vector_store:
  # Store type: "chroma" (default) or "memory" (for testing)
  type: chroma

  # ChromaDB settings
  chroma:
    # Persistence directory for vector store
    persist_directory: "./.sparknet/chroma_db"

    # Collection name for document chunks
    collection_name: "sparknet_documents"

    # Distance metric: "cosine" (default), "l2", or "ip"
    distance_metric: cosine

    # Anonymized telemetry (set to false to disable)
    anonymized_telemetry: false

# =============================================================================
# Embedding Settings
# =============================================================================
embeddings:
  # Provider: "ollama" (default, local) or "openai" (cloud, requires API key)
  provider: ollama

  # Ollama settings (local, privacy-preserving)
  ollama:
    # Model name for embeddings
    # Recommended: nomic-embed-text (768 dims) or mxbai-embed-large (1024 dims)
    model: nomic-embed-text

    # Ollama server URL
    base_url: "http://localhost:11434"

    # Request timeout in seconds
    timeout: 30

  # OpenAI settings (cloud, disabled by default)
  openai:
    # IMPORTANT: OpenAI is disabled by default for privacy
    # Set to true only if you explicitly need cloud embeddings
    enabled: false

    # Model name (if enabled)
    model: text-embedding-3-small

    # API key (from environment variable OPENAI_API_KEY)
    # Never store API keys in config files
    api_key_env: OPENAI_API_KEY

  # Caching settings
  cache:
    # Enable embedding cache for faster re-processing
    enabled: true

    # Maximum cache entries
    max_entries: 10000

# =============================================================================
# Indexer Settings
# =============================================================================
indexer:
  # Batch size for embedding generation
  batch_size: 32

  # Include bounding box metadata
  include_bbox: true

  # Include page numbers
  include_page: true

  # Include chunk type labels
  include_chunk_type: true

  # Skip empty chunks
  skip_empty_chunks: true

  # Minimum chunk text length (characters)
  min_chunk_length: 10

# =============================================================================
# Retriever Settings
# =============================================================================
retriever:
  # Default number of results to return
  default_top_k: 5

  # Maximum results to return
  max_results: 20

  # Minimum similarity score (0.0 - 1.0)
  # Chunks below this threshold are filtered out
  similarity_threshold: 0.5

  # Enable result reranking (experimental)
  enable_reranking: false

  # Number of results to rerank
  rerank_top_k: 10

  # Include evidence references in results
  include_evidence: true

  # Maximum snippet length in evidence
  evidence_snippet_length: 200

# =============================================================================
# Generator Settings (Answer Generation)
# =============================================================================
generator:
  # LLM provider for answer generation: "ollama" (default) or "openai"
  provider: ollama

  # Ollama settings (local)
  ollama:
    # Model for answer generation
    # Recommended: llama3.2, mistral, or phi3
    model: llama3.2

    # Ollama server URL
    base_url: "http://localhost:11434"

    # Request timeout in seconds
    timeout: 60

    # Generation parameters
    temperature: 0.1
    max_tokens: 1024

  # OpenAI settings (cloud, disabled by default)
  openai:
    enabled: false
    model: gpt-4o-mini
    api_key_env: OPENAI_API_KEY
    temperature: 0.1
    max_tokens: 1024

  # Confidence settings
  min_confidence: 0.5

  # Abstention policy
  # When true, the system will refuse to answer if confidence is too low
  abstain_on_low_confidence: true
  abstain_threshold: 0.3

  # Maximum context length for LLM
  max_context_length: 8000

  # Require citations in answers
  require_citations: true

# =============================================================================
# Document Intelligence Integration
# =============================================================================
document_intelligence:
  # Parser settings
  parser:
    render_dpi: 200
    max_pages: null  # null = no limit

  # Extraction settings
  extraction:
    min_field_confidence: 0.5
    abstain_on_low_confidence: true

  # Grounding settings
  grounding:
    enable_crops: true
    crop_output_dir: "./.sparknet/crops"

# =============================================================================
# Performance Settings
# =============================================================================
performance:
  # Number of parallel workers for batch processing
  num_workers: 4

  # Chunk processing batch size
  chunk_batch_size: 100

  # Enable async processing where supported
  async_enabled: true

# =============================================================================
# Logging Settings
# =============================================================================
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR
  level: INFO

  # Log RAG queries and results
  log_queries: false

  # Log embedding operations
  log_embeddings: false
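This variant of the RAG config keeps secrets out of the file via the `api_key_env` indirection: the config names an environment variable, never the key itself. A minimal sketch of resolving it, with the `resolve_openai_key` helper invented for illustration and cwd assumed to be the project root:

```python
import os
from typing import Optional

import yaml  # PyYAML


def resolve_openai_key(config_path: str = "configs/rag.yaml") -> Optional[str]:
    """Return the OpenAI key named by embeddings.openai.api_key_env,
    or None when the cloud path is feature-flagged off."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    openai_cfg = cfg["embeddings"]["openai"]
    if not openai_cfg.get("enabled", False):
        return None  # disabled by default for privacy
    return os.environ.get(openai_cfg["api_key_env"])
```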
demo/README.md
ADDED
@@ -0,0 +1,185 @@
# SPARKNET Demo Application

An interactive Streamlit demo showcasing SPARKNET's document intelligence capabilities.

## Features

- **📄 Document Processing**: Upload and process documents with OCR
- **🔍 Field Extraction**: Extract structured data with evidence grounding
- **💬 RAG Q&A**: Interactive question answering with citations
- **🏷️ Classification**: Automatic document type detection
- **📊 Analytics**: Processing statistics and insights
- **🔬 Live Processing**: Real-time pipeline visualization
- **📊 Document Comparison**: Compare multiple documents

## Quick Start

### 1. Install Dependencies

```bash
# From project root
pip install -r demo/requirements.txt

# Or install all SPARKNET dependencies
pip install -r requirements.txt
```

### 2. Start Ollama (Optional, for live processing)

```bash
ollama serve

# Pull required models
ollama pull llama3.2:3b
ollama pull nomic-embed-text
```

### 3. Run the Demo

```bash
# From project root
streamlit run demo/app.py

# Or with a custom port
streamlit run demo/app.py --server.port 8501
```

### 4. Open in Browser

Navigate to http://localhost:8501

## Demo Pages

| Page | Description |
|------|-------------|
| **Home** | Overview and feature cards |
| **Document Processing** | Upload/select documents for OCR processing |
| **Field Extraction** | Extract structured fields with evidence |
| **RAG Q&A** | Ask questions about indexed documents |
| **Classification** | Classify document types |
| **Analytics** | View processing statistics |
| **Live Processing** | Watch the pipeline in real time |
| **Interactive RAG** | Chat-style document Q&A |
| **Document Comparison** | Compare documents side by side |

## Sample Documents

The demo uses patent pledge documents from the `Dataset/` folder:

- Apple 11.11.2011.pdf
- IBM 11.01.2005.pdf
- Google 08.02.2012.pdf
- And more...

## Screenshots

### Home Page
```
┌─────────────────────────────────────────┐
│              🔥 SPARKNET                │
│ Agentic Document Intelligence Platform  │
├─────────────────────────────────────────┤
│  [Doc Processing] [Extraction] [RAG]    │
│                                         │
│  Feature cards with gradients...        │
└─────────────────────────────────────────┘
```

### RAG Q&A
```
┌─────────────────────────────────────────┐
│ 💬 Ask a question...                    │
├─────────────────────────────────────────┤
│ User: What patents are covered?         │
│                                         │
│ Assistant: Based on the documents...    │
│   [📚 View Sources]                     │
│   [1] Apple - Page 1: "..."             │
│   [2] IBM - Page 2: "..."               │
└─────────────────────────────────────────┘
```

## Configuration

### Environment Variables

```bash
# Ollama URL (default: http://localhost:11434)
export OLLAMA_BASE_URL=http://localhost:11434

# ChromaDB path (default: ./data/vectorstore)
export CHROMA_PERSIST_DIR=./data/vectorstore
```

### Streamlit Config

Create `.streamlit/config.toml`:

```toml
[theme]
primaryColor = "#FF6B6B"
backgroundColor = "#FFFFFF"
secondaryBackgroundColor = "#F0F2F6"
textColor = "#262730"

[server]
maxUploadSize = 50
```

## Development

### Adding New Pages

1. Create a new file in `demo/pages/`:
```
demo/pages/4_🆕_New_Feature.py
```

2. Follow the naming convention: `{order}_{emoji}_{name}.py`

3. Import project modules:
```python
import sys
from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
```

### Customizing Styles

Edit the CSS in `app.py`:

```python
st.markdown("""
<style>
.main-header { ... }
.evidence-box { ... }
</style>
""", unsafe_allow_html=True)
```

## Troubleshooting

### "ModuleNotFoundError: No module named 'src'"

Make sure you're running from the project root:
```bash
cd /path/to/SPARKNET
streamlit run demo/app.py
```

### Ollama Not Connected

1. Check if Ollama is running: `curl http://localhost:11434/api/tags`
2. Start Ollama: `ollama serve`

### ChromaDB Errors

Install ChromaDB:
```bash
pip install chromadb
```

## License

Part of the SPARKNET project. See main LICENSE file.
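For scripted setups, the Troubleshooting checks above can be expressed programmatically. A minimal sketch, assuming httpx is installed (as `demo/app.py` below also uses it); the `preflight` helper is invented here, not part of the demo:

```python
import httpx


def preflight(base_url: str = "http://localhost:11434") -> dict:
    """Report whether the demo's optional backends are reachable."""
    status = {"ollama": False, "chromadb": False}
    try:
        # Same endpoint as the curl check in the README.
        status["ollama"] = httpx.get(f"{base_url}/api/tags",
                                     timeout=2.0).status_code == 200
    except httpx.HTTPError:
        pass
    try:
        import chromadb  # noqa: F401
        status["chromadb"] = True
    except ImportError:
        pass
    return status


print(preflight())
```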
demo/app.py
ADDED
@@ -0,0 +1,944 @@
"""
SPARKNET Demo Application

A Streamlit-based demo showcasing:
- Document Processing Pipeline
- Field Extraction with Evidence
- RAG Search and Q&A
- Document Classification
- Evidence Visualization
"""

import streamlit as st
import sys
import os
from pathlib import Path
import json
import time
from datetime import datetime

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Page configuration
st.set_page_config(
    page_title="SPARKNET Document Intelligence",
    page_icon="🔥",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS
st.markdown("""
<style>
.main-header {
    font-size: 2.5rem;
    font-weight: bold;
    background: linear-gradient(90deg, #FF6B6B, #4ECDC4);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin-bottom: 0.5rem;
}
.sub-header {
    color: #666;
    font-size: 1.1rem;
    margin-bottom: 2rem;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 10px;
    padding: 1rem;
    color: white;
}
.evidence-box {
    background-color: #f0f7ff;
    border-left: 4px solid #4ECDC4;
    padding: 1rem;
    margin: 0.5rem 0;
    border-radius: 0 8px 8px 0;
}
.chunk-card {
    background-color: #fafafa;
    border: 1px solid #e0e0e0;
    border-radius: 8px;
    padding: 1rem;
    margin: 0.5rem 0;
}
.confidence-high { color: #22c55e; font-weight: bold; }
.confidence-medium { color: #eab308; font-weight: bold; }
.confidence-low { color: #ef4444; font-weight: bold; }
.stTabs [data-baseweb="tab-list"] {
    gap: 8px;
}
.stTabs [data-baseweb="tab"] {
    padding: 10px 20px;
    background-color: #f0f2f6;
    border-radius: 8px;
}
</style>
""", unsafe_allow_html=True)


def get_sample_documents():
    """Get list of sample documents from Dataset folder."""
    dataset_path = PROJECT_ROOT / "Dataset"
    if dataset_path.exists():
        return sorted([f.name for f in dataset_path.glob("*.pdf")])
    return []


def format_confidence(confidence: float) -> str:
    """Format confidence with color coding."""
    if confidence >= 0.8:
        return f'<span class="confidence-high">{confidence:.1%}</span>'
    elif confidence >= 0.6:
        return f'<span class="confidence-medium">{confidence:.1%}</span>'
    else:
        return f'<span class="confidence-low">{confidence:.1%}</span>'


def render_header():
    """Render the main header."""
    col1, col2 = st.columns([3, 1])
    with col1:
        st.markdown('<div class="main-header">🔥 SPARKNET</div>', unsafe_allow_html=True)
        st.markdown('<div class="sub-header">Agentic Document Intelligence Platform</div>', unsafe_allow_html=True)
    with col2:
        st.image("https://img.shields.io/badge/version-0.1.0-blue", width=100)


def render_sidebar():
    """Render the sidebar with navigation."""
    with st.sidebar:
        st.markdown("## Navigation")

        page = st.radio(
            "Select Feature",
            [
                "🏠 Home",
                "📄 Document Processing",
                "🔍 Field Extraction",
                "💬 RAG Q&A",
                "🏷️ Classification",
                "📊 Analytics",
            ],
            label_visibility="collapsed",
        )

        st.markdown("---")
        st.markdown("### System Status")

        # Check component status
        ollama_status = check_ollama_status()
        st.markdown(f"**Ollama:** {'🟢 Online' if ollama_status else '🔴 Offline'}")

        chromadb_status = check_chromadb_status()
        st.markdown(f"**ChromaDB:** {'🟢 Ready' if chromadb_status else '🔴 Not initialized'}")

        st.markdown("---")
        st.markdown("### Sample Documents")
        docs = get_sample_documents()
        st.markdown(f"**Available:** {len(docs)} PDFs")

    return page


def check_ollama_status():
    """Check if Ollama is running."""
    try:
        import httpx
        with httpx.Client(timeout=2.0) as client:
            resp = client.get("http://localhost:11434/api/tags")
            return resp.status_code == 200
    except Exception:
        return False


def check_chromadb_status():
    """Check if ChromaDB is available."""
    try:
        import chromadb
        return True
    except Exception:
        return False


def render_home_page():
    """Render the home page."""
    st.markdown("## Welcome to SPARKNET")

    st.markdown("""
    SPARKNET is an enterprise-grade **Agentic Document Intelligence Platform** that combines:

    - **📄 Document Processing**: OCR with PaddleOCR/Tesseract, layout detection, semantic chunking
    - **🔍 RAG Subsystem**: Vector search with ChromaDB, grounded retrieval with citations
    - **🤖 Multi-Agent System**: ReAct-style agents with tool use and validation
    - **🏠 Local-First**: Privacy-preserving inference via Ollama
    - **📎 Evidence Grounding**: Every extraction includes bbox, page, chunk_id references
    """)

    st.markdown("---")

    # Feature cards
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.markdown("""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
            <h3>📄</h3>
            <h4>Document Processing</h4>
            <p style="font-size: 0.9rem;">OCR, Layout Detection, Chunking</p>
        </div>
        """, unsafe_allow_html=True)

    with col2:
        st.markdown("""
        <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
                    border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
            <h3>🔍</h3>
            <h4>Field Extraction</h4>
            <p style="font-size: 0.9rem;">Structured Data with Evidence</p>
        </div>
        """, unsafe_allow_html=True)

    with col3:
        st.markdown("""
        <div style="background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
                    border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
            <h3>💬</h3>
            <h4>RAG Q&A</h4>
            <p style="font-size: 0.9rem;">Grounded Answers with Citations</p>
        </div>
        """, unsafe_allow_html=True)

    with col4:
        st.markdown("""
        <div style="background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
                    border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
            <h3>🏷️</h3>
            <h4>Classification</h4>
            <p style="font-size: 0.9rem;">Document Type Detection</p>
        </div>
        """, unsafe_allow_html=True)

    st.markdown("---")

    # Quick start
    st.markdown("### Quick Start")

    with st.expander("📚 How to Use This Demo", expanded=True):
        st.markdown("""
        1. **Document Processing**: Upload or select a PDF to process with OCR
        2. **Field Extraction**: Define fields to extract with evidence grounding
        3. **RAG Q&A**: Ask questions about indexed documents
        4. **Classification**: Automatically classify document types

        **Sample Documents**: The demo includes real patent documents from major tech companies.
        """)

    # Sample documents preview
    st.markdown("### Available Sample Documents")
    docs = get_sample_documents()

    if docs:
        cols = st.columns(4)
        for i, doc in enumerate(docs[:8]):
            with cols[i % 4]:
                company = doc.split()[0] if doc else "Unknown"
                st.markdown(f"""
                <div style="background: #f8f9fa; border-radius: 8px; padding: 0.8rem;
                            margin: 0.3rem 0; border: 1px solid #e0e0e0;">
                    <strong>📄 {company}</strong>
                    <br><small style="color: #666;">{doc[:30]}...</small>
                </div>
                """, unsafe_allow_html=True)


def render_document_processing_page():
    """Render the document processing page."""
    st.markdown("## 📄 Document Processing Pipeline")

    st.markdown("""
    Process documents through our intelligent pipeline:
    **OCR → Layout Detection → Reading Order → Semantic Chunking → Grounding**
    """)

    # Document selection
    col1, col2 = st.columns([2, 1])

    with col1:
        upload_option = st.radio(
            "Document Source",
            ["Select from samples", "Upload new document"],
            horizontal=True,
        )

        if upload_option == "Select from samples":
            docs = get_sample_documents()
            if docs:
                selected_doc = st.selectbox("Select a document", docs)
                doc_path = PROJECT_ROOT / "Dataset" / selected_doc
            else:
                st.warning("No sample documents found")
                doc_path = None
        else:
            uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
            if uploaded_file:
                # Save temporarily
                temp_path = PROJECT_ROOT / "data" / "temp" / uploaded_file.name
                temp_path.parent.mkdir(parents=True, exist_ok=True)
                with open(temp_path, "wb") as f:
                    f.write(uploaded_file.read())
                doc_path = temp_path
            else:
                doc_path = None

    with col2:
        st.markdown("### Processing Options")
        ocr_engine = st.selectbox("OCR Engine", ["paddleocr", "tesseract"])
        max_pages = st.slider("Max Pages", 1, 20, 5)
        render_dpi = st.selectbox("Render DPI", [150, 200, 300], index=2)

    st.markdown("---")

    if doc_path and st.button("🚀 Process Document", type="primary"):
        process_document_demo(doc_path, ocr_engine, max_pages, render_dpi)


def process_document_demo(doc_path, ocr_engine, max_pages, render_dpi):
    """Demo document processing."""

    progress_bar = st.progress(0)
    status_text = st.empty()

    # Simulate processing stages
    stages = [
        ("Loading document...", 0.1),
        ("Running OCR extraction...", 0.3),
        ("Detecting layout regions...", 0.5),
        ("Reconstructing reading order...", 0.7),
        ("Creating semantic chunks...", 0.9),
        ("Finalizing...", 1.0),
    ]

    for stage_text, progress in stages:
        status_text.text(stage_text)
        progress_bar.progress(progress)
        time.sleep(0.5)

    status_text.text("✅ Processing complete!")

    # Try actual processing
    try:
        from src.document.pipeline import process_document, PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine=ocr_engine),
            render_dpi=render_dpi,
            max_pages=max_pages,
        )

        with st.spinner("Running actual document processing..."):
            result = process_document(str(doc_path), config=config)

        # Display results
        render_processing_results(result)

    except Exception as e:
        st.warning(f"Live processing unavailable: {e}")
        st.info("Showing demo results instead...")
        render_demo_processing_results(str(doc_path))


def render_processing_results(result):
    """Render actual processing results."""

    # Metrics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Pages", result.metadata.num_pages)
    with col2:
        st.metric("Chunks", result.metadata.total_chunks)
    with col3:
        st.metric("Characters", f"{result.metadata.total_characters:,}")
    with col4:
        conf = result.metadata.ocr_confidence_avg or 0
        st.metric("OCR Confidence", f"{conf:.1%}")

    st.markdown("---")

    # Tabs for different views
    tab1, tab2, tab3 = st.tabs(["📝 Extracted Text", "📦 Chunks", "🗺️ Layout"])

    with tab1:
        st.markdown("### Full Extracted Text")
        st.text_area(
            "Document Text",
            result.full_text[:5000] + "..." if len(result.full_text) > 5000 else result.full_text,
            height=400,
        )

    with tab2:
        st.markdown("### Document Chunks")
        for i, chunk in enumerate(result.chunks[:10]):
            with st.expander(f"Chunk {i+1}: {chunk.chunk_type.value} (Page {chunk.page + 1})"):
                st.markdown(f"**ID:** `{chunk.chunk_id}`")
                st.markdown(f"**Confidence:** {format_confidence(chunk.confidence)}", unsafe_allow_html=True)
                st.markdown(f"**BBox:** ({chunk.bbox.x_min:.0f}, {chunk.bbox.y_min:.0f}) → ({chunk.bbox.x_max:.0f}, {chunk.bbox.y_max:.0f})")
                st.markdown("**Text:**")
                st.text(chunk.text[:500])

    with tab3:
        st.markdown("### Layout Regions")
        if result.layout_regions:
            layout_data = []
            for r in result.layout_regions:
                layout_data.append({
                    "Type": r.layout_type.value,
                    "Page": r.page + 1,
                    "Confidence": f"{r.confidence:.1%}",
                    "Position": f"({r.bbox.x_min:.0f}, {r.bbox.y_min:.0f})",
                })
            st.dataframe(layout_data, width='stretch')
        else:
            st.info("No layout regions detected")


def render_demo_processing_results(doc_path):
    """Render demo processing results when actual processing unavailable."""

    doc_name = Path(doc_path).name

    # Simulated metrics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Pages", 12)
    with col2:
        st.metric("Chunks", 47)
    with col3:
        st.metric("Characters", "15,234")
    with col4:
        st.metric("OCR Confidence", "94.2%")

    st.markdown("---")

    # Demo chunks
    demo_chunks = [
        {
            "type": "title",
            "page": 1,
            "confidence": 0.98,
            "text": f"PATENT PLEDGE - {doc_name.split()[0]}",
            "bbox": "(100, 50) → (700, 100)",
        },
        {
            "type": "text",
            "page": 1,
            "confidence": 0.95,
            "text": "This Patent Pledge is made by the undersigned company to promote innovation and reduce patent-related barriers...",
            "bbox": "(100, 150) → (700, 300)",
        },
        {
            "type": "text",
            "page": 1,
            "confidence": 0.92,
            "text": "The company hereby pledges not to assert any patent claims against any party making, using, or selling products...",
            "bbox": "(100, 320) → (700, 500)",
        },
    ]

    tab1, tab2 = st.tabs(["📝 Extracted Text", "📦 Chunks"])

    with tab1:
        st.markdown("### Full Extracted Text")
        demo_text = f"""
PATENT PLEDGE - {doc_name.split()[0]}

This Patent Pledge is made by the undersigned company to promote innovation
and reduce patent-related barriers in the technology industry.

DEFINITIONS:
1. "Covered Patents" means all patents and patent applications owned by
   the Pledgor that cover fundamental technologies.
2. "Open Source Software" means software distributed under licenses
   approved by the Open Source Initiative.

PLEDGE:
The company hereby pledges not to assert any Covered Patents against
any party making, using, selling, or distributing Open Source Software.

This pledge is irrevocable and shall remain in effect for the life
of all Covered Patents.

[Document continues with legal terms and conditions...]
"""
        st.text_area("Document Text", demo_text, height=400)

    with tab2:
        st.markdown("### Document Chunks")
        for i, chunk in enumerate(demo_chunks):
            with st.expander(f"Chunk {i+1}: {chunk['type']} (Page {chunk['page']})"):
                st.markdown(f"**Confidence:** {format_confidence(chunk['confidence'])}", unsafe_allow_html=True)
                st.markdown(f"**BBox:** {chunk['bbox']}")
                st.markdown("**Text:**")
                st.text(chunk["text"])


def render_extraction_page():
    """Render the field extraction page."""
    st.markdown("## 🔍 Field Extraction with Evidence")

    st.markdown("""
    Extract structured fields from documents with **evidence grounding**.
    Every extracted value includes its source location (page, bbox, chunk_id).
    """)

    col1, col2 = st.columns([2, 1])

    with col1:
        # Document selection
        docs = get_sample_documents()
        if docs:
            selected_doc = st.selectbox("Select Document", docs, key="extract_doc")

            st.markdown("### Fields to Extract")

            # Predefined schemas
            schema_type = st.selectbox(
                "Extraction Schema",
                ["Patent/Legal Document", "Invoice", "Contract", "Custom"],
            )

            if schema_type == "Patent/Legal Document":
                default_fields = ["document_title", "company_name", "effective_date", "key_terms", "parties_involved"]
            elif schema_type == "Invoice":
                default_fields = ["invoice_number", "date", "total_amount", "vendor_name", "line_items"]
            elif schema_type == "Contract":
                default_fields = ["contract_title", "parties", "effective_date", "term_length", "key_obligations"]
            else:
                default_fields = ["field_1", "field_2"]

            fields = st.multiselect(
                "Select fields to extract",
                default_fields,
                default=default_fields[:3],
            )

    with col2:
        st.markdown("### Extraction Options")
        validate = st.checkbox("Validate with Critic", value=True)
        include_evidence = st.checkbox("Include Evidence", value=True)
        confidence_threshold = st.slider("Min Confidence", 0.0, 1.0, 0.7)

    st.markdown("---")

    if fields and st.button("🔍 Extract Fields", type="primary"):
        extract_fields_demo(selected_doc, fields, validate, include_evidence)


def extract_fields_demo(doc_name, fields, validate, include_evidence):
    """Demo field extraction."""

    with st.spinner("Extracting fields..."):
        time.sleep(1.5)

    st.success("✅ Extraction complete!")

    # Demo results
    company = doc_name.split()[0] if doc_name else "Company"

    demo_extractions = {
        "document_title": {
            "value": f"{company} Patent Non-Assertion Pledge",
            "confidence": 0.96,
            "page": 1,
            "evidence": f"Found in header: '{company} Patent Non-Assertion Pledge' at position (100, 50)",
        },
        "company_name": {
            "value": company,
            "confidence": 0.98,
            "page": 1,
            "evidence": f"Identified as pledgor: '{company}' mentioned 15 times throughout document",
        },
        "effective_date": {
            "value": doc_name.split()[-1].replace(".pdf", "") if len(doc_name.split()) > 1 else "N/A",
            "confidence": 0.85,
            "page": 1,
            "evidence": "Date found in document header",
        },
        "key_terms": {
            "value": "Patent pledge, Open source, Non-assertion, Royalty-free",
            "confidence": 0.89,
            "page": 2,
            "evidence": "Key terms identified from definitions section",
        },
        "parties_involved": {
            "value": f"{company}, Open Source Community",
            "confidence": 0.82,
            "page": 1,
            "evidence": "Parties identified from pledge declaration",
        },
    }

    # Display results
    st.markdown("### Extracted Fields")

    for field in fields:
        if field in demo_extractions:
            data = demo_extractions[field]

            col1, col2 = st.columns([3, 1])

            with col1:
                st.markdown(f"""
                <div class="chunk-card">
                    <strong>{field.replace('_', ' ').title()}</strong>
                    <p style="font-size: 1.2rem; margin: 0.5rem 0;">{data['value']}</p>
                </div>
                """, unsafe_allow_html=True)

            with col2:
                st.markdown(f"**Confidence:** {format_confidence(data['confidence'])}", unsafe_allow_html=True)
                st.markdown(f"**Page:** {data['page']}")

            if include_evidence:
                st.markdown(f"""
                <div class="evidence-box">
                    📎 <strong>Evidence:</strong> {data['evidence']}
                </div>
                """, unsafe_allow_html=True)

            st.markdown("")

    # Validation results
    if validate:
        st.markdown("---")
        st.markdown("### Validation Results")

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Fields Validated", len(fields))
        with col2:
            st.metric("Valid", len(fields) - 1)
        with col3:
            st.metric("Uncertain", 1)

        st.info("💡 Critic validation: All fields have supporting evidence in the document.")


def render_rag_page():
    """Render the RAG Q&A page."""
    st.markdown("## 💬 RAG Question Answering")

    st.markdown("""
    Ask questions about indexed documents. Answers include **citations** pointing to
    the exact source chunks with page numbers and text snippets.
    """)

    # Index status
    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown("### Ask a Question")

        # Preset questions
        preset_questions = [
            "What is the main purpose of this document?",
            "What patents are covered by this pledge?",
            "What are the key terms and definitions?",
            "Who are the parties involved?",
            "What are the conditions for the pledge?",
        ]

        question_mode = st.radio(
            "Question Mode",
            ["Select preset", "Custom question"],
            horizontal=True,
        )

        if question_mode == "Select preset":
            question = st.selectbox("Select a question", preset_questions)
        else:
            question = st.text_input("Enter your question")

        col_a, col_b = st.columns(2)
        with col_a:
            top_k = st.slider("Number of sources", 1, 10, 5)
        with col_b:
            show_confidence = st.checkbox("Show confidence scores", value=True)

    with col2:
        st.markdown("### Index Status")
        st.markdown("""
        - **Documents indexed:** 3
        - **Total chunks:** 147
        - **Embedding model:** nomic-embed-text
        - **Vector dimension:** 768
        """)

    st.markdown("---")

    if question and st.button("🔍 Get Answer", type="primary"):
        rag_query_demo(question, top_k, show_confidence)


def rag_query_demo(question, top_k, show_confidence):
    """Demo RAG query."""

    with st.spinner("Searching documents and generating answer..."):
        time.sleep(1.5)

    # Demo answer based on question
    demo_answers = {
        "purpose": {
            "answer": "The main purpose of this document is to establish a **Patent Non-Assertion Pledge** where the company commits not to assert certain patent claims against parties using, making, or distributing Open Source Software. This pledge aims to promote innovation and reduce patent-related barriers in the technology industry.",
            "confidence": 0.92,
            "citations": [
                {"index": 1, "page": 1, "snippet": "This Patent Pledge is made to promote innovation and reduce patent-related barriers...", "confidence": 0.95},
                {"index": 2, "page": 1, "snippet": "The company hereby pledges not to assert any patent claims against any party...", "confidence": 0.91},
            ],
        },
        "patents": {
            "answer": "The pledge covers **all patents and patent applications** owned by the Pledgor that relate to fundamental technologies used in Open Source Software. Specifically, these are referred to as 'Covered Patents' in the document, defined as patents that cover essential features or functionalities.",
            "confidence": 0.88,
            "citations": [
                {"index": 1, "page": 2, "snippet": "'Covered Patents' means all patents and patent applications owned by the Pledgor...", "confidence": 0.93},
                {"index": 2, "page": 2, "snippet": "Patents covering fundamental technologies essential to Open Source implementations...", "confidence": 0.85},
            ],
        },
        "default": {
            "answer": "Based on the available documents, this appears to be a **Patent Pledge** document from a major technology company. The document establishes terms for patent non-assertion related to Open Source Software, with specific definitions and conditions outlined in the legal text.",
            "confidence": 0.75,
            "citations": [
                {"index": 1, "page": 1, "snippet": "Patent Pledge document establishing non-assertion terms...", "confidence": 0.80},
            ],
        },
    }

    # Select answer based on question keywords
    if "purpose" in question.lower() or "main" in question.lower():
        result = demo_answers["purpose"]
    elif "patent" in question.lower() and "cover" in question.lower():
        result = demo_answers["patents"]
    else:
        result = demo_answers["default"]

    # Display answer
    st.markdown("### Answer")

    st.markdown(f"""
    <div style="background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
                border-radius: 12px; padding: 1.5rem; margin: 1rem 0;">
        {result['answer']}
    </div>
    """, unsafe_allow_html=True)

    if show_confidence:
        st.markdown(f"**Overall Confidence:** {format_confidence(result['confidence'])}", unsafe_allow_html=True)

    # Citations
    st.markdown("### 📚 Citations")

    for citation in result["citations"][:top_k]:
        st.markdown(f"""
        <div class="evidence-box">
            <strong>[{citation['index']}] Page {citation['page']}</strong>
            {f' - Confidence: {citation["confidence"]:.0%}' if show_confidence else ''}
            <br>
            <em>"{citation['snippet']}"</em>
        </div>
        """, unsafe_allow_html=True)
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
def render_classification_page():
|
| 759 |
+
"""Render the classification page."""
|
| 760 |
+
st.markdown("## 🏷️ Document Classification")
|
| 761 |
+
|
| 762 |
+
st.markdown("""
|
| 763 |
+
Automatically classify documents into predefined categories with confidence scores
|
| 764 |
+
and reasoning explanations.
|
| 765 |
+
""")
|
| 766 |
+
|
| 767 |
+
docs = get_sample_documents()
|
| 768 |
+
|
| 769 |
+
col1, col2 = st.columns([2, 1])
|
| 770 |
+
|
| 771 |
+
with col1:
|
| 772 |
+
if docs:
|
| 773 |
+
selected_doc = st.selectbox("Select Document to Classify", docs, key="classify_doc")
|
| 774 |
+
|
| 775 |
+
st.markdown("### Document Categories")
|
| 776 |
+
categories = [
|
| 777 |
+
"📜 Legal/Patent Document",
|
| 778 |
+
"📑 Contract/Agreement",
|
| 779 |
+
"📊 Financial Report",
|
| 780 |
+
"📋 Technical Specification",
|
| 781 |
+
"📄 General Business Document",
|
| 782 |
+
]
|
| 783 |
+
st.markdown("\n".join([f"- {cat}" for cat in categories]))
|
| 784 |
+
|
| 785 |
+
with col2:
|
| 786 |
+
st.markdown("### Classification Options")
|
| 787 |
+
detailed_reasoning = st.checkbox("Show detailed reasoning", value=True)
|
| 788 |
+
multi_label = st.checkbox("Allow multiple categories", value=False)
|
| 789 |
+
|
| 790 |
+
st.markdown("---")
|
| 791 |
+
|
| 792 |
+
if st.button("🏷️ Classify Document", type="primary"):
|
| 793 |
+
classify_document_demo(selected_doc, detailed_reasoning)
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
def classify_document_demo(doc_name, detailed_reasoning):
|
| 797 |
+
"""Demo document classification."""
|
| 798 |
+
|
| 799 |
+
with st.spinner("Analyzing document..."):
|
| 800 |
+
time.sleep(1.0)
|
| 801 |
+
|
| 802 |
+
st.success("✅ Classification complete!")
|
| 803 |
+
|
| 804 |
+
# Demo classification results
|
| 805 |
+
col1, col2 = st.columns([2, 1])
|
| 806 |
+
|
| 807 |
+
with col1:
|
| 808 |
+
st.markdown("### Primary Classification")
|
| 809 |
+
st.markdown("""
|
| 810 |
+
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 811 |
+
border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
|
| 812 |
+
<h2 style="margin: 0;">📜 Legal/Patent Document</h2>
|
| 813 |
+
<p style="font-size: 1.2rem; margin: 0.5rem 0;">Patent Non-Assertion Pledge</p>
|
| 814 |
+
</div>
|
| 815 |
+
""", unsafe_allow_html=True)
|
| 816 |
+
|
| 817 |
+
with col2:
|
| 818 |
+
st.markdown("### Confidence Scores")
|
| 819 |
+
st.markdown(f"**Legal/Patent:** {format_confidence(0.94)}", unsafe_allow_html=True)
|
| 820 |
+
st.markdown(f"**Contract:** {format_confidence(0.72)}", unsafe_allow_html=True)
|
| 821 |
+
st.markdown(f"**Technical:** {format_confidence(0.15)}", unsafe_allow_html=True)
|
| 822 |
+
st.markdown(f"**Financial:** {format_confidence(0.08)}", unsafe_allow_html=True)
|
| 823 |
+
|
| 824 |
+
if detailed_reasoning:
|
| 825 |
+
st.markdown("---")
|
| 826 |
+
st.markdown("### Classification Reasoning")
|
| 827 |
+
|
| 828 |
+
st.markdown("""
|
| 829 |
+
<div class="evidence-box">
|
| 830 |
+
<strong>Why Legal/Patent Document?</strong>
|
| 831 |
+
<ul>
|
| 832 |
+
<li>Contains legal terminology: "pledge", "assert", "patent claims", "royalty-free"</li>
|
| 833 |
+
<li>Structured as a formal legal declaration</li>
|
| 834 |
+
<li>References specific patent-related definitions</li>
|
| 835 |
+
<li>Contains commitment/obligation language</li>
|
| 836 |
+
</ul>
|
| 837 |
+
</div>
|
| 838 |
+
""", unsafe_allow_html=True)
|
| 839 |
+
|
| 840 |
+
st.markdown("""
|
| 841 |
+
<div class="chunk-card">
|
| 842 |
+
<strong>Key Indicators Found:</strong>
|
| 843 |
+
<br>
|
| 844 |
+
• "Patent Pledge" - Document title indicator (weight: 0.35)<br>
|
| 845 |
+
• "hereby pledges" - Legal commitment language (weight: 0.25)<br>
|
| 846 |
+
• "Covered Patents" - Patent-specific terminology (weight: 0.20)<br>
|
| 847 |
+
• "Open Source Software" - Tech/IP context (weight: 0.15)
|
| 848 |
+
</div>
|
| 849 |
+
""", unsafe_allow_html=True)
|
| 850 |
+
|
| 851 |
+
|
| 852 |
+
def render_analytics_page():
|
| 853 |
+
"""Render the analytics page."""
|
| 854 |
+
st.markdown("## 📊 Processing Analytics")
|
| 855 |
+
|
| 856 |
+
st.markdown("View statistics and insights about document processing.")
|
| 857 |
+
|
| 858 |
+
# Summary metrics
|
| 859 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 860 |
+
|
| 861 |
+
with col1:
|
| 862 |
+
st.metric("Documents Processed", 24, delta="+3 today")
|
| 863 |
+
with col2:
|
| 864 |
+
st.metric("Total Chunks", 1247, delta="+156")
|
| 865 |
+
with col3:
|
| 866 |
+
st.metric("Avg. Confidence", "91.3%", delta="+2.1%")
|
| 867 |
+
with col4:
|
| 868 |
+
st.metric("Questions Answered", 89, delta="+12")
|
| 869 |
+
|
| 870 |
+
st.markdown("---")
|
| 871 |
+
|
| 872 |
+
# Charts
|
| 873 |
+
col1, col2 = st.columns(2)
|
| 874 |
+
|
| 875 |
+
with col1:
|
| 876 |
+
st.markdown("### Document Types Processed")
|
| 877 |
+
import pandas as pd
|
| 878 |
+
|
| 879 |
+
chart_data = pd.DataFrame({
|
| 880 |
+
"Type": ["Patent/Legal", "Contract", "Technical", "Financial", "Other"],
|
| 881 |
+
"Count": [12, 5, 4, 2, 1],
|
| 882 |
+
})
|
| 883 |
+
st.bar_chart(chart_data.set_index("Type"))
|
| 884 |
+
|
| 885 |
+
with col2:
|
| 886 |
+
st.markdown("### Processing Performance")
|
| 887 |
+
perf_data = pd.DataFrame({
|
| 888 |
+
"Stage": ["OCR", "Layout", "Chunking", "Indexing", "Retrieval"],
|
| 889 |
+
"Avg Time (s)": [2.3, 0.8, 0.5, 1.2, 0.3],
|
| 890 |
+
})
|
| 891 |
+
st.bar_chart(perf_data.set_index("Stage"))
|
| 892 |
+
|
| 893 |
+
st.markdown("---")
|
| 894 |
+
|
| 895 |
+
# Recent activity
|
| 896 |
+
st.markdown("### Recent Activity")
|
| 897 |
+
|
| 898 |
+
activities = [
|
| 899 |
+
{"time": "2 min ago", "action": "Processed", "document": "IBM N_A.pdf", "chunks": 42},
|
| 900 |
+
{"time": "15 min ago", "action": "Indexed", "document": "Apple 11.11.2011.pdf", "chunks": 67},
|
| 901 |
+
{"time": "1 hour ago", "action": "Queried", "document": "RAG Collection", "chunks": 5},
|
| 902 |
+
{"time": "2 hours ago", "action": "Classified", "document": "Google 08.02.2012.pdf", "chunks": 0},
|
| 903 |
+
]
|
| 904 |
+
|
| 905 |
+
for activity in activities:
|
| 906 |
+
st.markdown(f"""
|
| 907 |
+
<div class="chunk-card">
|
| 908 |
+
<strong>{activity['time']}</strong> - {activity['action']} <em>{activity['document']}</em>
|
| 909 |
+
{f" ({activity['chunks']} chunks)" if activity['chunks'] > 0 else ""}
|
| 910 |
+
</div>
|
| 911 |
+
""", unsafe_allow_html=True)
|
| 912 |
+
|
| 913 |
+
|
| 914 |
+
def main():
|
| 915 |
+
"""Main application."""
|
| 916 |
+
render_header()
|
| 917 |
+
page = render_sidebar()
|
| 918 |
+
|
| 919 |
+
# Route to appropriate page
|
| 920 |
+
if page == "🏠 Home":
|
| 921 |
+
render_home_page()
|
| 922 |
+
elif page == "📄 Document Processing":
|
| 923 |
+
render_document_processing_page()
|
| 924 |
+
elif page == "🔍 Field Extraction":
|
| 925 |
+
render_extraction_page()
|
| 926 |
+
elif page == "💬 RAG Q&A":
|
| 927 |
+
render_rag_page()
|
| 928 |
+
elif page == "🏷️ Classification":
|
| 929 |
+
render_classification_page()
|
| 930 |
+
elif page == "📊 Analytics":
|
| 931 |
+
render_analytics_page()
|
| 932 |
+
|
| 933 |
+
# Footer
|
| 934 |
+
st.markdown("---")
|
| 935 |
+
st.markdown(
|
| 936 |
+
"<div style='text-align: center; color: #666;'>"
|
| 937 |
+
"🔥 SPARKNET Document Intelligence Platform | Built with Streamlit"
|
| 938 |
+
"</div>",
|
| 939 |
+
unsafe_allow_html=True,
|
| 940 |
+
)
|
| 941 |
+
|
| 942 |
+
|
| 943 |
+
if __name__ == "__main__":
|
| 944 |
+
main()
|
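The `main()` dispatcher above routes pages through an if/elif chain keyed on the sidebar label. A table-driven variant (a minimal sketch, not part of the file above; it assumes the same `render_*` functions and sidebar labels) keeps the mapping in one dict:

# Hypothetical alternative to the if/elif routing in main(); illustration only.
PAGES = {
    "🏠 Home": render_home_page,
    "📄 Document Processing": render_document_processing_page,
    "🔍 Field Extraction": render_extraction_page,
    "💬 RAG Q&A": render_rag_page,
    "🏷️ Classification": render_classification_page,
    "📊 Analytics": render_analytics_page,
}

def main():
    """Main application (dict-dispatch variant of the routing above)."""
    render_header()
    page = render_sidebar()
    # Look up the renderer for the selected sidebar label; fall back to Home.
    PAGES.get(page, render_home_page)()

Adding a page then means adding one dict entry instead of another elif branch.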
demo/llm_providers.py
ADDED
@@ -0,0 +1,339 @@
"""
Free LLM Providers for SPARKNET

Supports multiple free-tier LLM providers:
1. HuggingFace Inference API (free, no payment required)
2. Groq (free tier - very fast)
3. Google Gemini (free tier)
4. Local/Offline mode (simulated responses)
"""

import os
import requests
from typing import Optional, Tuple, List
from dataclasses import dataclass
from loguru import logger


@dataclass
class LLMResponse:
    text: str
    model: str
    provider: str
    success: bool
    error: Optional[str] = None


class HuggingFaceProvider:
    """
    HuggingFace Inference API - FREE tier available.

    Models that work well on free tier:
    - microsoft/DialoGPT-medium
    - google/flan-t5-base
    - mistralai/Mistral-7B-Instruct-v0.2 (may need Pro for heavy use)
    - HuggingFaceH4/zephyr-7b-beta
    """

    API_URL = "https://api-inference.huggingface.co/models/"

    # Free-tier friendly models
    MODELS = {
        "chat": "HuggingFaceH4/zephyr-7b-beta",
        "chat_small": "microsoft/DialoGPT-medium",
        "instruct": "google/flan-t5-large",
        "embed": "sentence-transformers/all-MiniLM-L6-v2",
    }

    def __init__(self, api_token: Optional[str] = None):
        """
        Initialize HuggingFace provider.

        Args:
            api_token: HF token (optional but recommended for higher rate limits).
                Get a free token at: https://huggingface.co/settings/tokens
        """
        self.api_token = api_token or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
        self.headers = {}
        if self.api_token:
            self.headers["Authorization"] = f"Bearer {self.api_token}"

    def generate(self, prompt: str, model: Optional[str] = None, max_tokens: int = 500) -> LLMResponse:
        """Generate text using the HuggingFace Inference API."""
        model = model or self.MODELS["chat"]
        url = f"{self.API_URL}{model}"

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "temperature": 0.7,
                "do_sample": True,
                "return_full_text": False,
            }
        }

        try:
            response = requests.post(url, headers=self.headers, json=payload, timeout=60)

            if response.status_code == 503:
                # Model is loading
                return LLMResponse(
                    text="Model is loading, please try again in a moment...",
                    model=model,
                    provider="huggingface",
                    success=False,
                    error="Model loading"
                )

            response.raise_for_status()
            result = response.json()

            if isinstance(result, list) and len(result) > 0:
                text = result[0].get("generated_text", "")
            else:
                text = str(result)

            return LLMResponse(
                text=text,
                model=model,
                provider="huggingface",
                success=True
            )

        except Exception as e:
            logger.error(f"HuggingFace API error: {e}")
            return LLMResponse(
                text="",
                model=model,
                provider="huggingface",
                success=False,
                error=str(e)
            )

    def embed(self, texts: List[str], model: Optional[str] = None) -> Tuple[List[List[float]], Optional[str]]:
        """Generate embeddings using HuggingFace."""
        model = model or self.MODELS["embed"]
        url = f"{self.API_URL}{model}"

        payload = {
            "inputs": texts,
            "options": {"wait_for_model": True}
        }

        try:
            response = requests.post(url, headers=self.headers, json=payload, timeout=60)
            response.raise_for_status()
            embeddings = response.json()
            return embeddings, None
        except Exception as e:
            logger.error(f"HuggingFace embed error: {e}")
            return [], str(e)


class GroqProvider:
    """
    Groq - FREE tier with very fast inference.

    Free tier includes:
    - 14,400 requests/day for smaller models
    - Very fast inference (fastest available)

    Get a free API key at: https://console.groq.com/keys
    """

    API_URL = "https://api.groq.com/openai/v1/chat/completions"

    MODELS = {
        "fast": "llama-3.1-8b-instant",      # Fastest
        "smart": "llama-3.3-70b-versatile",  # Best quality
        "small": "gemma2-9b-it",             # Good balance
    }

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
        if not self.api_key:
            logger.warning("No Groq API key found. Get a free key at: https://console.groq.com/keys")

    def generate(self, prompt: str, model: Optional[str] = None, max_tokens: int = 500) -> LLMResponse:
        """Generate text using the Groq API."""
        if not self.api_key:
            return LLMResponse(
                text="",
                model="",
                provider="groq",
                success=False,
                error="No Groq API key configured"
            )

        model = model or self.MODELS["fast"]

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "temperature": 0.7,
        }

        try:
            response = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            result = response.json()

            text = result["choices"][0]["message"]["content"]

            return LLMResponse(
                text=text,
                model=model,
                provider="groq",
                success=True
            )

        except Exception as e:
            logger.error(f"Groq API error: {e}")
            return LLMResponse(
                text="",
                model=model,
                provider="groq",
                success=False,
                error=str(e)
            )


class OfflineProvider:
    """
    Offline/Demo mode - no API required.

    Provides simulated responses for demonstration purposes.
    """

    def __init__(self):
        pass

    def generate(self, prompt: str, context: str = "", **kwargs) -> LLMResponse:
        """Generate a simulated response based on context."""

        # Extract key information from context if provided
        if context:
            # Simple extractive response
            sentences = context.split('.')
            relevant = [s.strip() for s in sentences if len(s.strip()) > 20][:3]

            if relevant:
                response = f"Based on the documents, {relevant[0].lower()}."
                if len(relevant) > 1:
                    response += f" Additionally, {relevant[1].lower()}."
            else:
                response = "Based on the available documents, I found relevant information but cannot generate a detailed response in offline mode."
        else:
            response = "I'm running in offline demo mode. To get AI-powered responses, please configure a free LLM provider (HuggingFace or Groq)."

        return LLMResponse(
            text=response,
            model="offline",
            provider="offline",
            success=True
        )

    def embed(self, texts: List[str]) -> Tuple[List[List[float]], Optional[str]]:
        """Generate simple bag-of-words style embeddings for demo."""
        import hashlib

        embeddings = []
        for text in texts:
            # Create deterministic pseudo-embeddings based on text hash
            hash_bytes = hashlib.sha256(text.encode()).digest()
            # Convert to 384-dim vector (same as MiniLM)
            embedding = [((b % 200) - 100) / 100.0 for b in hash_bytes * 12][:384]
            embeddings.append(embedding)

        return embeddings, None


class UnifiedLLMProvider:
    """
    Unified interface for all LLM providers.

    Automatically selects the best available provider.
    """

    def __init__(self):
        self.providers = {}
        self.active_provider = None
        self.active_embed_provider = None

        # Try to initialize providers in order of preference
        self._init_providers()

    def _init_providers(self):
        """Initialize available providers."""

        # Check for Groq (fastest, generous free tier)
        groq_key = os.environ.get("GROQ_API_KEY")
        if groq_key:
            self.providers["groq"] = GroqProvider(groq_key)
            self.active_provider = "groq"
            logger.info("Using Groq provider (free tier)")

        # Check for HuggingFace (always available, even without token)
        hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
        self.providers["huggingface"] = HuggingFaceProvider(hf_token)
        if not self.active_provider:
            self.active_provider = "huggingface"
            logger.info("Using HuggingFace provider")

        # HuggingFace for embeddings (always free)
        self.active_embed_provider = "huggingface"

        # Offline fallback
        self.providers["offline"] = OfflineProvider()

        logger.info(f"LLM Provider: {self.active_provider}, Embed Provider: {self.active_embed_provider}")

    def generate(self, prompt: str, **kwargs) -> LLMResponse:
        """Generate text using the best available provider."""
        provider = self.providers.get(self.active_provider)

        if provider:
            response = provider.generate(prompt, **kwargs)
            if response.success:
                return response

        # Fallback to offline
        return self.providers["offline"].generate(prompt, **kwargs)

    def embed(self, texts: List[str]) -> Tuple[List[List[float]], Optional[str]]:
        """Generate embeddings using the best available provider."""
        if self.active_embed_provider == "huggingface":
            embeddings, error = self.providers["huggingface"].embed(texts)
            if not error:
                return embeddings, None

        # Fallback to offline embeddings
        return self.providers["offline"].embed(texts)

    def get_status(self) -> dict:
        """Get status of all providers."""
        return {
            "active_llm": self.active_provider,
            "active_embed": self.active_embed_provider,
            "available_providers": list(self.providers.keys()),
            "groq_configured": "groq" in self.providers and self.providers["groq"].api_key is not None,
            "huggingface_configured": self.providers["huggingface"].api_token is not None,
        }


# Global instance
_llm_provider: Optional[UnifiedLLMProvider] = None


def get_llm_provider() -> UnifiedLLMProvider:
    """Get or create the unified LLM provider."""
    global _llm_provider
    if _llm_provider is None:
        _llm_provider = UnifiedLLMProvider()
    return _llm_provider
demo/pages/1_🔬_Live_Processing.py
ADDED
@@ -0,0 +1,714 @@
| 1 |
+
"""
|
| 2 |
+
Live Document Processing Demo - SPARKNET
|
| 3 |
+
|
| 4 |
+
Real-time document processing with integrated state management and auto-indexing.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import streamlit as st
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import time
|
| 11 |
+
import io
|
| 12 |
+
import base64
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import hashlib
|
| 15 |
+
|
| 16 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
| 17 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 18 |
+
sys.path.insert(0, str(PROJECT_ROOT / "demo"))
|
| 19 |
+
|
| 20 |
+
# Import state manager and RAG config
|
| 21 |
+
from state_manager import (
|
| 22 |
+
get_state_manager,
|
| 23 |
+
ProcessedDocument as StateDocument,
|
| 24 |
+
generate_doc_id,
|
| 25 |
+
render_global_status_bar,
|
| 26 |
+
)
|
| 27 |
+
from rag_config import (
|
| 28 |
+
get_unified_rag_system,
|
| 29 |
+
auto_index_processed_document,
|
| 30 |
+
check_ollama,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
st.set_page_config(page_title="Live Processing - SPARKNET", page_icon="🔬", layout="wide")
|
| 34 |
+
|
| 35 |
+
# Custom CSS
|
| 36 |
+
st.markdown("""
|
| 37 |
+
<style>
|
| 38 |
+
.stage-card {
|
| 39 |
+
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
|
| 40 |
+
padding: 15px;
|
| 41 |
+
border-radius: 10px;
|
| 42 |
+
margin: 10px 0;
|
| 43 |
+
border-left: 4px solid #4ECDC4;
|
| 44 |
+
}
|
| 45 |
+
.stage-active {
|
| 46 |
+
border-left-color: #ffc107;
|
| 47 |
+
animation: pulse 1s infinite;
|
| 48 |
+
}
|
| 49 |
+
.stage-done {
|
| 50 |
+
border-left-color: #28a745;
|
| 51 |
+
}
|
| 52 |
+
.stage-error {
|
| 53 |
+
border-left-color: #dc3545;
|
| 54 |
+
}
|
| 55 |
+
@keyframes pulse {
|
| 56 |
+
0% { opacity: 1; }
|
| 57 |
+
50% { opacity: 0.7; }
|
| 58 |
+
100% { opacity: 1; }
|
| 59 |
+
}
|
| 60 |
+
.metric-card {
|
| 61 |
+
background: #161b22;
|
| 62 |
+
border-radius: 8px;
|
| 63 |
+
padding: 12px;
|
| 64 |
+
text-align: center;
|
| 65 |
+
border: 1px solid #30363d;
|
| 66 |
+
}
|
| 67 |
+
.metric-value {
|
| 68 |
+
font-size: 24px;
|
| 69 |
+
font-weight: bold;
|
| 70 |
+
color: #4ECDC4;
|
| 71 |
+
}
|
| 72 |
+
.metric-label {
|
| 73 |
+
font-size: 11px;
|
| 74 |
+
color: #8b949e;
|
| 75 |
+
text-transform: uppercase;
|
| 76 |
+
}
|
| 77 |
+
.action-btn {
|
| 78 |
+
margin: 5px;
|
| 79 |
+
}
|
| 80 |
+
.nav-card {
|
| 81 |
+
background: #0d1117;
|
| 82 |
+
border-radius: 10px;
|
| 83 |
+
padding: 15px;
|
| 84 |
+
margin: 10px 0;
|
| 85 |
+
border: 1px solid #30363d;
|
| 86 |
+
cursor: pointer;
|
| 87 |
+
}
|
| 88 |
+
.nav-card:hover {
|
| 89 |
+
border-color: #4ECDC4;
|
| 90 |
+
}
|
| 91 |
+
</style>
|
| 92 |
+
""", unsafe_allow_html=True)
|
| 93 |
+
|
| 94 |
+
# Initialize state manager
|
| 95 |
+
state_manager = get_state_manager()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def process_document_actual(file_bytes: bytes, filename: str, options: dict) -> dict:
|
| 99 |
+
"""
|
| 100 |
+
Process document using the actual document processing pipeline.
|
| 101 |
+
Returns processing results with all extracted data.
|
| 102 |
+
"""
|
| 103 |
+
import tempfile
|
| 104 |
+
import os
|
| 105 |
+
|
| 106 |
+
# Create temp file
|
| 107 |
+
suffix = Path(filename).suffix
|
| 108 |
+
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
| 109 |
+
tmp.write(file_bytes)
|
| 110 |
+
tmp_path = tmp.name
|
| 111 |
+
|
| 112 |
+
try:
|
| 113 |
+
# Try to use actual document processor
|
| 114 |
+
try:
|
| 115 |
+
from src.document.pipeline.processor import (
|
| 116 |
+
DocumentProcessor,
|
| 117 |
+
PipelineConfig,
|
| 118 |
+
)
|
| 119 |
+
from src.document.ocr import OCRConfig
|
| 120 |
+
from src.document.layout import LayoutConfig
|
| 121 |
+
from src.document.chunking.chunker import ChunkerConfig
|
| 122 |
+
|
| 123 |
+
# Configure chunking with table preservation options
|
| 124 |
+
chunker_config = ChunkerConfig(
|
| 125 |
+
preserve_table_structure=options.get("preserve_tables", True),
|
| 126 |
+
detect_table_headers=options.get("detect_headers", True),
|
| 127 |
+
chunk_tables=True,
|
| 128 |
+
chunk_figures=True,
|
| 129 |
+
include_captions=True,
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
# Configure layout detection
|
| 133 |
+
layout_config = LayoutConfig(
|
| 134 |
+
method="rule_based",
|
| 135 |
+
detect_tables=True,
|
| 136 |
+
detect_figures=True,
|
| 137 |
+
detect_headers=True,
|
| 138 |
+
detect_titles=True,
|
| 139 |
+
detect_lists=True,
|
| 140 |
+
min_confidence=0.3, # Lower threshold to detect more regions
|
| 141 |
+
heading_font_ratio=1.1, # More sensitive heading detection
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
# Configure pipeline with all options
|
| 145 |
+
config = PipelineConfig(
|
| 146 |
+
ocr=OCRConfig(engine=options.get("ocr_engine", "paddleocr")),
|
| 147 |
+
layout=layout_config,
|
| 148 |
+
chunking=chunker_config,
|
| 149 |
+
max_pages=options.get("max_pages", 10),
|
| 150 |
+
include_ocr_regions=True,
|
| 151 |
+
include_layout_regions=options.get("enable_layout", True),
|
| 152 |
+
generate_full_text=True,
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
processor = DocumentProcessor(config)
|
| 156 |
+
processor.initialize()
|
| 157 |
+
|
| 158 |
+
# Process document
|
| 159 |
+
result = processor.process(tmp_path)
|
| 160 |
+
|
| 161 |
+
# Convert to dict format for state
|
| 162 |
+
chunks_list = []
|
| 163 |
+
for chunk in result.chunks:
|
| 164 |
+
chunks_list.append({
|
| 165 |
+
"chunk_id": chunk.chunk_id,
|
| 166 |
+
"text": chunk.text,
|
| 167 |
+
"page": chunk.page,
|
| 168 |
+
"chunk_type": chunk.chunk_type.value,
|
| 169 |
+
"confidence": chunk.confidence,
|
| 170 |
+
"bbox": chunk.bbox.to_xyxy() if chunk.bbox else None,
|
| 171 |
+
})
|
| 172 |
+
|
| 173 |
+
ocr_regions = []
|
| 174 |
+
for region in result.ocr_regions:
|
| 175 |
+
ocr_regions.append({
|
| 176 |
+
"text": region.text,
|
| 177 |
+
"confidence": region.confidence,
|
| 178 |
+
"page": region.page,
|
| 179 |
+
"bbox": region.bbox.to_xyxy() if region.bbox else None,
|
| 180 |
+
})
|
| 181 |
+
|
| 182 |
+
layout_regions = []
|
| 183 |
+
for region in result.layout_regions:
|
| 184 |
+
layout_regions.append({
|
| 185 |
+
"id": region.id,
|
| 186 |
+
"type": region.type.value,
|
| 187 |
+
"confidence": region.confidence,
|
| 188 |
+
"page": region.page,
|
| 189 |
+
"bbox": region.bbox.to_xyxy() if region.bbox else None,
|
| 190 |
+
})
|
| 191 |
+
|
| 192 |
+
return {
|
| 193 |
+
"success": True,
|
| 194 |
+
"raw_text": result.full_text,
|
| 195 |
+
"chunks": chunks_list,
|
| 196 |
+
"ocr_regions": ocr_regions,
|
| 197 |
+
"layout_regions": layout_regions,
|
| 198 |
+
"page_count": result.metadata.num_pages,
|
| 199 |
+
"ocr_confidence": result.metadata.ocr_confidence_avg or 0.0,
|
| 200 |
+
"layout_confidence": result.metadata.layout_confidence_avg or 0.0,
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
except Exception as e:
|
| 204 |
+
# Fallback: Use simple text extraction
|
| 205 |
+
return process_document_fallback(file_bytes, filename, options, str(e))
|
| 206 |
+
|
| 207 |
+
finally:
|
| 208 |
+
# Cleanup
|
| 209 |
+
if os.path.exists(tmp_path):
|
| 210 |
+
os.unlink(tmp_path)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def process_document_fallback(file_bytes: bytes, filename: str, options: dict, reason: str) -> dict:
|
| 214 |
+
"""
|
| 215 |
+
Fallback document processing using simple text extraction.
|
| 216 |
+
"""
|
| 217 |
+
text = ""
|
| 218 |
+
page_count = 1
|
| 219 |
+
|
| 220 |
+
suffix = Path(filename).suffix.lower()
|
| 221 |
+
|
| 222 |
+
# Try PyMuPDF for PDFs
|
| 223 |
+
if suffix == ".pdf":
|
| 224 |
+
try:
|
| 225 |
+
import fitz
|
| 226 |
+
pdf_stream = io.BytesIO(file_bytes)
|
| 227 |
+
doc = fitz.open(stream=pdf_stream, filetype="pdf")
|
| 228 |
+
page_count = len(doc)
|
| 229 |
+
max_pages = min(options.get("max_pages", 5), page_count)
|
| 230 |
+
|
| 231 |
+
text_parts = []
|
| 232 |
+
for page_num in range(max_pages):
|
| 233 |
+
page = doc[page_num]
|
| 234 |
+
text_parts.append(f"--- Page {page_num + 1} ---\n{page.get_text()}")
|
| 235 |
+
text = "\n\n".join(text_parts)
|
| 236 |
+
doc.close()
|
| 237 |
+
except Exception as pdf_e:
|
| 238 |
+
text = f"PDF extraction failed: {pdf_e}"
|
| 239 |
+
|
| 240 |
+
elif suffix in [".txt", ".md"]:
|
| 241 |
+
try:
|
| 242 |
+
text = file_bytes.decode("utf-8")
|
| 243 |
+
except:
|
| 244 |
+
text = file_bytes.decode("latin-1", errors="ignore")
|
| 245 |
+
|
| 246 |
+
else:
|
| 247 |
+
text = f"Unsupported file type: {suffix}"
|
| 248 |
+
|
| 249 |
+
# Simple chunking
|
| 250 |
+
chunk_size = 500
|
| 251 |
+
overlap = 50
|
| 252 |
+
chunks = []
|
| 253 |
+
|
| 254 |
+
for i in range(0, len(text), chunk_size - overlap):
|
| 255 |
+
chunk_text = text[i:i + chunk_size]
|
| 256 |
+
if len(chunk_text.strip()) > 20:
|
| 257 |
+
chunks.append({
|
| 258 |
+
"chunk_id": f"chunk_{len(chunks)}",
|
| 259 |
+
"text": chunk_text,
|
| 260 |
+
"page": 0,
|
| 261 |
+
"chunk_type": "text",
|
| 262 |
+
"confidence": 0.9,
|
| 263 |
+
"bbox": None,
|
| 264 |
+
})
|
| 265 |
+
|
| 266 |
+
return {
|
| 267 |
+
"success": True,
|
| 268 |
+
"raw_text": text,
|
| 269 |
+
"chunks": chunks,
|
| 270 |
+
"ocr_regions": [],
|
| 271 |
+
"layout_regions": [],
|
| 272 |
+
"page_count": page_count,
|
| 273 |
+
"ocr_confidence": 0.9,
|
| 274 |
+
"layout_confidence": 0.0,
|
| 275 |
+
"fallback_reason": reason,
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def get_page_images(file_bytes: bytes, filename: str, max_pages: int = 5) -> list:
|
| 280 |
+
"""Extract page images from PDF for visualization."""
|
| 281 |
+
images = []
|
| 282 |
+
suffix = Path(filename).suffix.lower()
|
| 283 |
+
|
| 284 |
+
if suffix == ".pdf":
|
| 285 |
+
try:
|
| 286 |
+
import fitz
|
| 287 |
+
pdf_stream = io.BytesIO(file_bytes)
|
| 288 |
+
doc = fitz.open(stream=pdf_stream, filetype="pdf")
|
| 289 |
+
page_count = min(len(doc), max_pages)
|
| 290 |
+
|
| 291 |
+
for page_num in range(page_count):
|
| 292 |
+
page = doc[page_num]
|
| 293 |
+
pix = page.get_pixmap(dpi=100)
|
| 294 |
+
img_bytes = pix.tobytes("png")
|
| 295 |
+
images.append({
|
| 296 |
+
"page": page_num,
|
| 297 |
+
"data": base64.b64encode(img_bytes).decode(),
|
| 298 |
+
"width": pix.width,
|
| 299 |
+
"height": pix.height,
|
| 300 |
+
})
|
| 301 |
+
doc.close()
|
| 302 |
+
except:
|
| 303 |
+
pass
|
| 304 |
+
|
| 305 |
+
return images
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
# Header
|
| 309 |
+
st.markdown("# 🔬 Live Document Processing")
|
| 310 |
+
st.markdown("Process documents in real-time with auto-indexing to RAG")
|
| 311 |
+
|
| 312 |
+
# Global status bar
|
| 313 |
+
render_global_status_bar()
|
| 314 |
+
|
| 315 |
+
st.markdown("---")
|
| 316 |
+
|
| 317 |
+
# Main content
|
| 318 |
+
col_upload, col_status = st.columns([2, 1])
|
| 319 |
+
|
| 320 |
+
with col_upload:
|
| 321 |
+
st.markdown("### 📤 Upload Document")
|
| 322 |
+
|
| 323 |
+
uploaded_file = st.file_uploader(
|
| 324 |
+
"Choose a document",
|
| 325 |
+
type=["pdf", "txt", "md"],
|
| 326 |
+
help="Upload PDF, TXT, or MD files for processing"
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
# Or select from existing files
|
| 330 |
+
docs_path = PROJECT_ROOT / "Dataset"
|
| 331 |
+
existing_docs = sorted([f.name for f in docs_path.glob("*.pdf")]) if docs_path.exists() else []
|
| 332 |
+
|
| 333 |
+
if existing_docs:
|
| 334 |
+
st.markdown("**Or select from samples:**")
|
| 335 |
+
selected_sample = st.selectbox("Sample documents", ["-- Select --"] + existing_docs)
|
| 336 |
+
|
| 337 |
+
with col_status:
|
| 338 |
+
st.markdown("### 📊 System Status")
|
| 339 |
+
|
| 340 |
+
ollama_ok, models = check_ollama()
|
| 341 |
+
rag_system = get_unified_rag_system()
|
| 342 |
+
|
| 343 |
+
status_cols = st.columns(2)
|
| 344 |
+
with status_cols[0]:
|
| 345 |
+
if ollama_ok:
|
| 346 |
+
st.success(f"Ollama ({len(models)})")
|
| 347 |
+
else:
|
| 348 |
+
st.error("Ollama Offline")
|
| 349 |
+
with status_cols[1]:
|
| 350 |
+
if rag_system["status"] == "ready":
|
| 351 |
+
st.success("RAG Ready")
|
| 352 |
+
else:
|
| 353 |
+
st.error("RAG Error")
|
| 354 |
+
|
| 355 |
+
# State summary
|
| 356 |
+
summary = state_manager.get_summary()
|
| 357 |
+
st.metric("Processed Docs", summary["total_documents"])
|
| 358 |
+
st.metric("Indexed Chunks", summary["total_indexed_chunks"])
|
| 359 |
+
|
| 360 |
+
st.markdown("---")
|
| 361 |
+
|
| 362 |
+
# Processing Options
|
| 363 |
+
st.markdown("### ⚙️ Processing Options")
|
| 364 |
+
|
| 365 |
+
opt_cols = st.columns(4)
|
| 366 |
+
with opt_cols[0]:
|
| 367 |
+
ocr_engine = st.radio("OCR Engine", ["paddleocr", "tesseract"], horizontal=True,
|
| 368 |
+
help="PaddleOCR is faster and more accurate for most documents")
|
| 369 |
+
with opt_cols[1]:
|
| 370 |
+
max_pages = st.slider("Max pages", 1, 50, 10, help="Maximum number of pages to process")
|
| 371 |
+
with opt_cols[2]:
|
| 372 |
+
enable_layout = st.checkbox("Layout detection", value=True,
|
| 373 |
+
help="Detect tables, figures, headings and other layout elements")
|
| 374 |
+
with opt_cols[3]:
|
| 375 |
+
auto_index = st.checkbox("Auto-index to RAG", value=True,
|
| 376 |
+
help="Automatically index processed documents for RAG queries")
|
| 377 |
+
|
| 378 |
+
# Advanced options (expanded by default for visibility)
|
| 379 |
+
with st.expander("🔧 Advanced Options", expanded=False):
|
| 380 |
+
adv_cols = st.columns(3)
|
| 381 |
+
with adv_cols[0]:
|
| 382 |
+
preserve_tables = st.checkbox("Preserve table structure", value=True,
|
| 383 |
+
help="Convert tables to markdown format with structure")
|
| 384 |
+
with adv_cols[1]:
|
| 385 |
+
detect_headers = st.checkbox("Detect table headers", value=True,
|
| 386 |
+
help="Automatically identify header rows in tables")
|
| 387 |
+
with adv_cols[2]:
|
| 388 |
+
generate_embeddings = st.checkbox("Generate embeddings", value=True,
|
| 389 |
+
help="Create embeddings for semantic search")
|
| 390 |
+
|
| 391 |
+
# Determine what to process
|
| 392 |
+
file_to_process = None
|
| 393 |
+
file_bytes = None
|
| 394 |
+
filename = None
|
| 395 |
+
|
| 396 |
+
if uploaded_file is not None:
|
| 397 |
+
file_bytes = uploaded_file.read()
|
| 398 |
+
filename = uploaded_file.name
|
| 399 |
+
file_to_process = "upload"
|
| 400 |
+
elif existing_docs and selected_sample != "-- Select --":
|
| 401 |
+
file_path = docs_path / selected_sample
|
| 402 |
+
file_bytes = file_path.read_bytes()
|
| 403 |
+
filename = selected_sample
|
| 404 |
+
file_to_process = "sample"
|
| 405 |
+
|
| 406 |
+
# Process button
|
| 407 |
+
if file_to_process and st.button("🚀 Start Processing", type="primary", use_container_width=True):
|
| 408 |
+
|
| 409 |
+
# Generate document ID
|
| 410 |
+
content_hash = hashlib.md5(file_bytes[:1000]).hexdigest()[:8]
|
| 411 |
+
doc_id = generate_doc_id(filename, content_hash)
|
| 412 |
+
|
| 413 |
+
# Start processing in state manager
|
| 414 |
+
state_manager.start_processing(doc_id, filename)
|
| 415 |
+
|
| 416 |
+
# Pipeline stages
|
| 417 |
+
stages = [
|
| 418 |
+
("loading", "📄 Loading Document", "Reading and preparing document..."),
|
| 419 |
+
("ocr", f"🔍 {ocr_engine.upper()} Extraction", "Extracting text from document..."),
|
| 420 |
+
("layout", "📐 Layout Detection", "Identifying document structure..."),
|
| 421 |
+
("chunking", "✂️ Semantic Chunking", "Creating meaningful text chunks..."),
|
| 422 |
+
("indexing", "📚 RAG Indexing", "Adding to vector store..."),
|
| 423 |
+
]
|
| 424 |
+
|
| 425 |
+
# Progress container
|
| 426 |
+
progress_container = st.container()
|
| 427 |
+
results_container = st.container()
|
| 428 |
+
|
| 429 |
+
with progress_container:
|
| 430 |
+
progress_bar = st.progress(0)
|
| 431 |
+
status_text = st.empty()
|
| 432 |
+
|
| 433 |
+
# Metrics row
|
| 434 |
+
metric_cols = st.columns(5)
|
| 435 |
+
metric_placeholders = {
|
| 436 |
+
"pages": metric_cols[0].empty(),
|
| 437 |
+
"ocr_regions": metric_cols[1].empty(),
|
| 438 |
+
"layout_regions": metric_cols[2].empty(),
|
| 439 |
+
"chunks": metric_cols[3].empty(),
|
| 440 |
+
"confidence": metric_cols[4].empty(),
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
processing_start = time.time()
|
| 444 |
+
processing_result = None
|
| 445 |
+
error_msg = None
|
| 446 |
+
|
| 447 |
+
try:
|
| 448 |
+
# Stage 1: Loading
|
| 449 |
+
status_text.markdown("**📄 Loading document...**")
|
| 450 |
+
state_manager.update_processing(doc_id, "loading", 0.1, "Loading document...")
|
| 451 |
+
progress_bar.progress(10)
|
| 452 |
+
time.sleep(0.3)
|
| 453 |
+
|
| 454 |
+
# Get page images for visualization
|
| 455 |
+
page_images = get_page_images(file_bytes, filename, max_pages)
|
| 456 |
+
metric_placeholders["pages"].metric("Pages", len(page_images) if page_images else "N/A")
|
| 457 |
+
|
| 458 |
+
# Stage 2-3: OCR + Layout
|
| 459 |
+
status_text.markdown(f"**🔍 Running {ocr_engine.upper()}...**")
|
| 460 |
+
state_manager.update_processing(doc_id, "ocr", 0.3, f"Running {ocr_engine}...")
|
| 461 |
+
progress_bar.progress(30)
|
| 462 |
+
|
| 463 |
+
# Actual processing with all options
|
| 464 |
+
options = {
|
| 465 |
+
"ocr_engine": ocr_engine,
|
| 466 |
+
"max_pages": max_pages,
|
| 467 |
+
"enable_layout": enable_layout,
|
| 468 |
+
"preserve_tables": preserve_tables,
|
| 469 |
+
"detect_headers": detect_headers,
|
| 470 |
+
"generate_embeddings": generate_embeddings,
|
| 471 |
+
}
|
| 472 |
+
processing_result = process_document_actual(file_bytes, filename, options)
|
| 473 |
+
|
| 474 |
+
# Update metrics
|
| 475 |
+
metric_placeholders["pages"].metric("Pages", processing_result.get("page_count", 0))
|
| 476 |
+
metric_placeholders["ocr_regions"].metric("OCR Regions", len(processing_result.get("ocr_regions", [])))
|
| 477 |
+
|
| 478 |
+
status_text.markdown("**📐 Layout detection...**")
|
| 479 |
+
state_manager.update_processing(doc_id, "layout", 0.5, "Detecting layout...")
|
| 480 |
+
progress_bar.progress(50)
|
| 481 |
+
time.sleep(0.2)
|
| 482 |
+
|
| 483 |
+
metric_placeholders["layout_regions"].metric("Layout Regions", len(processing_result.get("layout_regions", [])))
|
| 484 |
+
|
| 485 |
+
# Stage 4: Chunking
|
| 486 |
+
status_text.markdown("**✂️ Creating chunks...**")
|
| 487 |
+
state_manager.update_processing(doc_id, "chunking", 0.7, "Creating chunks...")
|
| 488 |
+
progress_bar.progress(70)
|
| 489 |
+
time.sleep(0.2)
|
| 490 |
+
|
| 491 |
+
chunks = processing_result.get("chunks", [])
|
| 492 |
+
metric_placeholders["chunks"].metric("Chunks", len(chunks))
|
| 493 |
+
metric_placeholders["confidence"].metric(
|
| 494 |
+
"Confidence",
|
| 495 |
+
f"{processing_result.get('ocr_confidence', 0) * 100:.0f}%"
|
| 496 |
+
)
|
| 497 |
+
|
| 498 |
+
# Stage 5: RAG Indexing
|
| 499 |
+
indexed_count = 0
|
| 500 |
+
if auto_index and rag_system["status"] == "ready" and chunks:
|
| 501 |
+
status_text.markdown("**📚 Indexing to RAG...**")
|
| 502 |
+
state_manager.update_processing(doc_id, "indexing", 0.9, "Indexing to RAG...")
|
| 503 |
+
progress_bar.progress(90)
|
| 504 |
+
|
| 505 |
+
# Auto-index
|
| 506 |
+
index_result = auto_index_processed_document(
|
| 507 |
+
doc_id=doc_id,
|
| 508 |
+
text=processing_result.get("raw_text", ""),
|
| 509 |
+
chunks=chunks,
|
| 510 |
+
metadata={"filename": filename, "source": file_to_process}
|
| 511 |
+
)
|
| 512 |
+
|
| 513 |
+
if index_result["success"]:
|
| 514 |
+
indexed_count = index_result["num_chunks"]
|
| 515 |
+
state_manager.mark_indexed(doc_id, indexed_count)
|
| 516 |
+
|
| 517 |
+
# Complete
|
| 518 |
+
progress_bar.progress(100)
|
| 519 |
+
processing_time = time.time() - processing_start
|
| 520 |
+
|
| 521 |
+
# Add to state manager
|
| 522 |
+
state_doc = StateDocument(
|
| 523 |
+
doc_id=doc_id,
|
| 524 |
+
filename=filename,
|
| 525 |
+
file_type=Path(filename).suffix[1:].upper(),
|
| 526 |
+
raw_text=processing_result.get("raw_text", ""),
|
| 527 |
+
chunks=chunks,
|
| 528 |
+
page_count=processing_result.get("page_count", 1),
|
| 529 |
+
page_images=[img["data"] for img in page_images],
|
| 530 |
+
ocr_regions=processing_result.get("ocr_regions", []),
|
| 531 |
+
layout_data={"regions": processing_result.get("layout_regions", [])},
|
| 532 |
+
indexed=indexed_count > 0,
|
| 533 |
+
indexed_chunks=indexed_count,
|
| 534 |
+
processing_time=processing_time,
|
| 535 |
+
)
|
| 536 |
+
state_manager.add_document(state_doc)
|
| 537 |
+
state_manager.complete_processing(doc_id, success=True)
|
| 538 |
+
state_manager.set_active_document(doc_id)
|
| 539 |
+
|
| 540 |
+
status_text.success(f"✅ Processing complete in {processing_time:.2f}s!")
|
| 541 |
+
|
| 542 |
+
except Exception as e:
|
| 543 |
+
error_msg = str(e)
|
| 544 |
+
state_manager.complete_processing(doc_id, success=False, error=error_msg)
|
| 545 |
+
status_text.error(f"❌ Processing failed: {error_msg}")
|
| 546 |
+
|
| 547 |
+
# Results
|
| 548 |
+
if processing_result and processing_result.get("success"):
|
| 549 |
+
with results_container:
|
| 550 |
+
st.markdown("---")
|
| 551 |
+
st.markdown("### 📋 Processing Results")
|
| 552 |
+
|
| 553 |
+
# Summary cards
|
| 554 |
+
sum_cols = st.columns(5)
|
| 555 |
+
sum_cols[0].markdown(f"""
|
| 556 |
+
<div class="metric-card">
|
| 557 |
+
<div class="metric-value">{processing_result.get('page_count', 0)}</div>
|
| 558 |
+
<div class="metric-label">Pages</div>
|
| 559 |
+
</div>
|
| 560 |
+
""", unsafe_allow_html=True)
|
| 561 |
+
sum_cols[1].markdown(f"""
|
| 562 |
+
<div class="metric-card">
|
| 563 |
+
<div class="metric-value">{len(processing_result.get('ocr_regions', []))}</div>
|
| 564 |
+
<div class="metric-label">OCR Regions</div>
|
| 565 |
+
</div>
|
| 566 |
+
""", unsafe_allow_html=True)
|
| 567 |
+
sum_cols[2].markdown(f"""
|
| 568 |
+
<div class="metric-card">
|
| 569 |
+
<div class="metric-value">{len(processing_result.get('layout_regions', []))}</div>
|
| 570 |
+
<div class="metric-label">Layout Regions</div>
|
| 571 |
+
</div>
|
| 572 |
+
""", unsafe_allow_html=True)
|
| 573 |
+
sum_cols[3].markdown(f"""
|
| 574 |
+
<div class="metric-card">
|
| 575 |
+
<div class="metric-value">{len(chunks)}</div>
|
| 576 |
+
<div class="metric-label">Chunks</div>
|
| 577 |
+
</div>
|
| 578 |
+
""", unsafe_allow_html=True)
|
| 579 |
+
sum_cols[4].markdown(f"""
|
| 580 |
+
<div class="metric-card">
|
| 581 |
+
<div class="metric-value">{indexed_count}</div>
|
| 582 |
+
<div class="metric-label">Indexed</div>
|
| 583 |
+
</div>
|
| 584 |
+
""", unsafe_allow_html=True)
|
| 585 |
+
|
| 586 |
+
# Show fallback warning prominently if fallback was used
|
| 587 |
+
if processing_result.get("fallback_reason"):
|
| 588 |
+
st.error(f"⚠️ **Fallback Mode**: Document processor failed, using simple text extraction. Layout detection unavailable. Reason: {processing_result['fallback_reason']}")
|
| 589 |
+
|
| 590 |
+
# Tabs for detailed results
|
| 591 |
+
tab_text, tab_chunks, tab_layout, tab_pages = st.tabs([
|
| 592 |
+
"📝 Extracted Text",
|
| 593 |
+
"📦 Chunks",
|
| 594 |
+
"🗺️ Layout",
|
| 595 |
+
"📄 Pages"
|
| 596 |
+
])
|
| 597 |
+
|
| 598 |
+
with tab_text:
|
| 599 |
+
text_preview = processing_result.get("raw_text", "")[:5000]
|
| 600 |
+
if len(processing_result.get("raw_text", "")) > 5000:
|
| 601 |
+
text_preview += "\n\n... [truncated] ..."
|
| 602 |
+
st.text_area("Full Text", text_preview, height=400)
|
| 603 |
+
|
| 604 |
+
if processing_result.get("fallback_reason"):
|
        st.warning(f"Using fallback extraction: {processing_result['fallback_reason']}")

    with tab_chunks:
        for i, chunk in enumerate(chunks[:20]):
            chunk_type = chunk.get("chunk_type", "text")
            conf = chunk.get("confidence", 0)
            color = "#4ECDC4" if conf > 0.8 else "#ffc107" if conf > 0.6 else "#dc3545"

            with st.expander(f"[{i+1}] {chunk_type.upper()} - {chunk.get('text', '')[:50]}..."):
                col1, col2, col3 = st.columns([2, 1, 1])
                col1.markdown(f"**Chunk ID:** `{chunk.get('chunk_id', 'N/A')}`")
                col2.markdown(f"**Page:** {chunk.get('page', 0) + 1}")
                col3.markdown(f"**Confidence:** <span style='color:{color}'>{conf:.0%}</span>", unsafe_allow_html=True)
                st.code(chunk.get("text", ""), language=None)

        if len(chunks) > 20:
            st.info(f"Showing 20 of {len(chunks)} chunks")

    with tab_layout:
        layout_regions = processing_result.get("layout_regions", [])
        if layout_regions:
            # Group by type
            by_type = {}
            for r in layout_regions:
                t = r.get("type", "unknown")
                by_type[t] = by_type.get(t, 0) + 1

            st.markdown("**Detected Region Types:**")
            type_cols = st.columns(min(len(by_type), 6))
            for i, (rtype, count) in enumerate(by_type.items()):
                type_cols[i % 6].metric(rtype.title(), count)

            st.markdown("**Regions:**")
            for r in layout_regions[:15]:
                conf = r.get("confidence", 0)
                color = "#4ECDC4" if conf > 0.8 else "#ffc107" if conf > 0.6 else "#dc3545"
                st.markdown(f"- **{r.get('type', 'unknown').upper()}** (page {r.get('page', 0) + 1}) - Confidence: <span style='color:{color}'>{conf:.0%}</span>", unsafe_allow_html=True)
        else:
            # Provide a helpful message based on the cause
            if processing_result.get("fallback_reason"):
                st.warning("Layout detection unavailable - document processor is using fallback mode. Check the error message above.")
            elif not enable_layout:
                st.info("Layout detection is disabled. Enable it in the options above.")
            else:
                st.info("No layout regions detected. The document may have minimal structure, or the OCR results didn't contain enough text patterns for layout analysis.")

    with tab_pages:
        if page_images:
            for img_data in page_images:
                st.markdown(f"**Page {img_data['page'] + 1}** ({img_data['width']}x{img_data['height']})")
                st.image(
                    f"data:image/png;base64,{img_data['data']}",
                    use_container_width=True
                )
        else:
            st.info("Page images not available")

    # Navigation to other modules
    st.markdown("---")
    st.markdown("### 🔗 Continue With This Document")

    nav_cols = st.columns(3)

    with nav_cols[0]:
        st.markdown("""
        <div class="nav-card">
            <h4>💬 Interactive RAG</h4>
            <p style="color: #8b949e;">Ask questions about this document using the RAG system.</p>
        </div>
        """, unsafe_allow_html=True)
        if st.button("Go to Interactive RAG", key="nav_rag", use_container_width=True):
            st.switch_page("pages/2_💬_Interactive_RAG.py")

    with nav_cols[1]:
        st.markdown("""
        <div class="nav-card">
            <h4>📄 Document Viewer</h4>
            <p style="color: #8b949e;">View chunks, layout, and visual annotations.</p>
        </div>
        """, unsafe_allow_html=True)
        if st.button("Go to Document Viewer", key="nav_viewer", use_container_width=True):
            st.switch_page("pages/5_📄_Document_Viewer.py")

    with nav_cols[2]:
        st.markdown("""
        <div class="nav-card">
            <h4>🎯 Evidence Viewer</h4>
            <p style="color: #8b949e;">Inspect OCR regions and evidence grounding.</p>
        </div>
        """, unsafe_allow_html=True)
        if st.button("Go to Evidence Viewer", key="nav_evidence", use_container_width=True):
            st.switch_page("pages/4_🎯_Evidence_Viewer.py")

# Show recently processed documents
st.markdown("---")
st.markdown("### 📚 Recently Processed")

all_docs = state_manager.get_all_documents()
if all_docs:
    for doc in reversed(all_docs[-5:]):
        col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
        col1.markdown(f"**{doc.filename}** (`{doc.doc_id[:8]}...`)")
        col2.markdown(f"📄 {doc.page_count} pages")
        col3.markdown(f"📦 {len(doc.chunks)} chunks")
        if doc.indexed:
            col4.success(f"✓ Indexed ({doc.indexed_chunks})")
        else:
            col4.warning("Not indexed")
else:
    st.info("No documents processed yet. Upload or select a document above.")
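The chunk and region renderers above inline the same confidence-to-color ternary several times. A small helper would keep the thresholds in one place; a minimal sketch, where `confidence_color` is a hypothetical name and not something defined in this commit:

```python
def confidence_color(conf: float) -> str:
    """Sketch only: map a confidence score to the page's traffic-light palette.

    Thresholds (0.8 / 0.6) and hex values are copied from the ternaries above.
    """
    if conf > 0.8:
        return "#4ECDC4"  # teal: high confidence
    if conf > 0.6:
        return "#ffc107"  # amber: medium confidence
    return "#dc3545"      # red: low confidence
```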
demo/pages/2_💬_Interactive_RAG.py
ADDED
@@ -0,0 +1,844 @@
"""
Interactive Multi-Agentic RAG - SPARKNET

Query your documents using the unified RAG system with document filtering
and real-time chunk inspection.
"""

import streamlit as st
import sys
from pathlib import Path
import time
import hashlib

PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "demo"))

# Import unified RAG configuration and state manager
from rag_config import (
    get_unified_rag_system,
    get_store_stats,
    index_document,
    query_rag,
    check_ollama,
    get_indexed_documents,
    search_similar_chunks,
)
from state_manager import (
    get_state_manager,
    render_global_status_bar,
)
import re
from collections import Counter
from typing import Optional


def clean_filename_for_question(filename: str) -> Optional[str]:
    """
    Clean a filename to make it suitable for use in a question.
    Handles cases like 'Red_Hat_NA.pdf' -> 'Red Hat' (removing short tokens).
    Returns None when nothing meaningful remains after filtering.
    """
    # Remove the extension
    name = Path(filename).stem

    # Replace separators with spaces
    name = re.sub(r'[_\-\.]+', ' ', name)

    # Split into words and filter
    words = name.split()

    # Remove very short tokens (like 'NA', 'V1', etc.) and numbers
    cleaned_words = []
    for word in words:
        # Skip if too short (1-2 chars) unless it's a known acronym
        if len(word) <= 2 and word.upper() not in ['AI', 'ML', 'NLP', 'API', 'UI', 'UX']:
            continue
        # Skip pure numbers or version-like strings
        if re.match(r'^[vV]?\d+$', word):
            continue
        # Skip common file suffixes
        if word.lower() in ['final', 'draft', 'copy', 'new', 'old', 'v1', 'v2']:
            continue
        cleaned_words.append(word)

    # Join and clean up extra spaces
    result = ' '.join(cleaned_words).strip()

    # If the result is too short, return None
    if len(result) < 3:
        return None

    return result
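Tracing the rules above by hand gives the following expected outputs; these are illustrative assertions, not tests shipped in the commit:

```python
assert clean_filename_for_question("Red_Hat_NA.pdf") == "Red Hat"          # 'NA' dropped as a short token
assert clean_filename_for_question("Q3_report_v2_FINAL.docx") == "report"  # version and suffix tokens dropped
assert clean_filename_for_question("AI_notes.md") == "AI notes"            # 'AI' kept as a known acronym
assert clean_filename_for_question("v2.pdf") is None                       # too little left after filtering
```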
def generate_dynamic_questions(state_manager, indexed_docs, max_questions=4):
    """
    Generate dynamic suggested questions based on indexed document content.

    Analyzes:
    - Document titles and filenames
    - Chunk content for key topics
    - Table presence
    - Document types
    - Detected entities and keywords
    """
    questions = []

    # Get all indexed documents from the state manager
    all_docs = state_manager.get_all_documents()
    indexed_doc_list = [d for d in all_docs if d.indexed]

    if not indexed_doc_list and not indexed_docs:
        # No documents indexed - return generic questions
        return [
            "What is the main topic of this document?",
            "Summarize the key points",
            "What are the main findings?",
            "List the important details",
        ]

    # Collect document info
    doc_names = []
    all_text_samples = []
    has_tables = False
    has_figures = False
    doc_types = set()

    for doc in indexed_doc_list:
        doc_names.append(doc.filename)
        doc_types.add(doc.file_type.lower())

        # Sample text from chunks
        for chunk in doc.chunks[:10]:  # First 10 chunks
            chunk_text = chunk.get('text', '') if isinstance(chunk, dict) else str(chunk)
            all_text_samples.append(chunk_text[:500])

            # Check for tables and figures
            chunk_type = chunk.get('chunk_type', '') if isinstance(chunk, dict) else ''
            if 'table' in chunk_type.lower():
                has_tables = True
            if 'figure' in chunk_type.lower() or 'chart' in chunk_type.lower():
                has_figures = True

    # Also check indexed_docs from the RAG system
    for doc_info in indexed_docs[:5]:
        if isinstance(doc_info, dict):
            doc_names.append(doc_info.get('filename', doc_info.get('doc_id', '')))

    # Extract key topics from text samples
    combined_text = ' '.join(all_text_samples).lower()

    # Extract potential topics (simple keyword extraction)
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
        'this', 'that', 'these', 'those', 'it', 'its', 'as', 'if', 'when',
        'than', 'so', 'no', 'not', 'only', 'own', 'same', 'too', 'very',
        'just', 'also', 'now', 'here', 'there', 'where', 'why', 'how', 'all',
        'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
        'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between',
        'under', 'again', 'further', 'then', 'once', 'any', 'about', 'which', 'who',
        'page', 'document', 'file', 'section', 'chapter', 'figure', 'table',
    }

    # Extract words (3+ chars, not numbers)
    words = re.findall(r'\b[a-z]{3,}\b', combined_text)
    meaningful_words = [w for w in words if w not in stop_words and len(w) > 3]
    word_freq = Counter(meaningful_words)
    top_topics = [word for word, count in word_freq.most_common(15) if count > 2]

    # Generate questions based on top topics (prioritize content-based questions)
    if top_topics:
        topic = top_topics[0]
        questions.append(f"What does the document say about {topic}?")

        if len(top_topics) > 1:
            questions.append(f"Explain the {top_topics[1]} mentioned in the document")

        if len(top_topics) > 2:
            questions.append(f"How are {top_topics[0]} and {top_topics[2]} related?")

    # Generate questions based on cleaned document names (only if the name is meaningful)
    for name in doc_names[:2]:
        clean_name = clean_filename_for_question(name)
        if clean_name and len(clean_name) > 5:
            questions.append(f"Summarize the {clean_name} document")
            break  # Only use one document-name question

    # Add a table-specific question if tables were detected
    if has_tables:
        questions.append("What data is presented in the tables?")

    # Add a figure-specific question if figures were detected
    if has_figures:
        questions.append("What do the figures and charts show?")

    # Add document-type-specific questions
    if 'pdf' in doc_types:
        questions.append("What are the main conclusions?")
    if 'docx' in doc_types or 'doc' in doc_types:
        questions.append("What recommendations are made?")
    if 'xlsx' in doc_types or 'xls' in doc_types:
        questions.append("What trends are visible in the data?")

    # Add content-aware generic questions
    generic_questions = [
        "Summarize the key points in this document",
        "What are the main findings discussed?",
        "What methodology or approach is described?",
        "What are the important takeaways?",
        "List the main topics covered",
        "What problems or challenges are mentioned?",
    ]

    # Fill remaining slots with generic questions
    for q in generic_questions:
        if len(questions) >= max_questions:
            break
        if q not in questions:
            questions.append(q)

    # Ensure the questions are unique and limit to the max
    seen = set()
    unique_questions = []
    for q in questions:
        q_lower = q.lower()
        if q_lower not in seen:
            seen.add(q_lower)
            unique_questions.append(q)
        if len(unique_questions) >= max_questions:
            break

    # Fallback if we don't have enough
    while len(unique_questions) < max_questions:
        fallback = [
            "What is this document about?",
            "Summarize the key points",
            "What are the main findings?",
            "What conclusions are drawn?",
        ]
        added = False
        for q in fallback:
            if q not in unique_questions:
                unique_questions.append(q)
                added = True
                break
        if not added:
            # All fallbacks are already present; bail out to avoid looping forever
            break
        if len(unique_questions) >= max_questions:
            break

    return unique_questions[:max_questions]
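The topic-extraction step inside `generate_dynamic_questions` is easy to check in isolation. A self-contained sketch with a made-up text sample and a truncated stand-in for the stop-word set (the function above uses the full set):

```python
import re
from collections import Counter

stop_words = {"the", "and", "onto", "before", "across"}  # stand-in; see the full set above
text = ("kubernetes runs containers across clusters. kubernetes schedules "
        "containers onto nodes. upgrading kubernetes drains nodes before "
        "moving containers.").lower()

words = re.findall(r"\b[a-z]{3,}\b", text)
meaningful = [w for w in words if w not in stop_words and len(w) > 3]
top_topics = [w for w, c in Counter(meaningful).most_common(15) if c > 2]
print(top_topics)  # ['kubernetes', 'containers']: only terms appearing 3+ times survive
```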
st.set_page_config(
    page_title="Interactive RAG - SPARKNET",
    page_icon="💬",
    layout="wide"
)

# Custom CSS
st.markdown("""
<style>
.chat-message {
    padding: 15px;
    border-radius: 12px;
    margin: 10px 0;
}
.user-message {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
}
.assistant-message {
    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
    color: #eee;
}
.source-card {
    background: #0d1117;
    border-radius: 8px;
    padding: 12px;
    margin: 8px 0;
    border-left: 4px solid #4ECDC4;
}
.source-header {
    font-size: 11px;
    color: #888;
    margin-bottom: 6px;
}
.source-text {
    font-size: 13px;
    color: #c9d1d9;
    font-family: monospace;
}
.metric-box {
    background: #161b22;
    border-radius: 8px;
    padding: 10px;
    text-align: center;
}
.metric-value {
    font-size: 20px;
    font-weight: bold;
    color: #4ECDC4;
}
.metric-label {
    font-size: 10px;
    color: #888;
    text-transform: uppercase;
}
.pipeline-bar {
    display: flex;
    justify-content: center;
    gap: 5px;
    padding: 10px;
    background: #0d1117;
    border-radius: 8px;
    margin: 10px 0;
}
.pipeline-step {
    padding: 5px 12px;
    border-radius: 15px;
    font-size: 11px;
    background: #21262d;
    color: #8b949e;
}
.pipeline-step.active {
    background: linear-gradient(90deg, #4ECDC4, #44a08d);
    color: white;
}
.pipeline-step.done {
    background: #238636;
    color: white;
}
.doc-filter-card {
    background: #161b22;
    border-radius: 8px;
    padding: 10px;
    margin: 5px 0;
    border: 1px solid #30363d;
}
.doc-filter-card.selected {
    border-color: #4ECDC4;
}
.chunk-preview {
    background: #0d1117;
    border-radius: 6px;
    padding: 8px;
    margin: 4px 0;
    font-size: 12px;
    font-family: monospace;
    max-height: 100px;
    overflow-y: auto;
}
</style>
""", unsafe_allow_html=True)


def get_chunk_color(index: int) -> str:
    """Get a distinct color for citations."""
    colors = [
        "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
        "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
    ]
    return colors[index % len(colors)]


# Initialize state manager
state_manager = get_state_manager()

# Get system status
rag_system = get_unified_rag_system()
ollama_ok, models = check_ollama()
stats = get_store_stats()
indexed_docs = get_indexed_documents()

# Session state
if "messages" not in st.session_state:
    st.session_state.messages = []
if "quick_indexed" not in st.session_state:
    st.session_state.quick_indexed = []
if "doc_filter" not in st.session_state:
    st.session_state.doc_filter = None  # None = all documents


# Header
st.markdown("# 💬 Interactive RAG Chat")
st.markdown("Ask questions about your indexed documents with the multi-agent pipeline")

# Global status bar
render_global_status_bar()

# Pipeline indicator
st.markdown("""
<div class="pipeline-bar">
    <span class="pipeline-step">📝 Query</span>
    <span>→</span>
    <span class="pipeline-step">🎯 Plan</span>
    <span>→</span>
    <span class="pipeline-step">🔍 Retrieve</span>
    <span>→</span>
    <span class="pipeline-step">📊 Rerank</span>
    <span>→</span>
    <span class="pipeline-step">💬 Generate</span>
    <span>→</span>
    <span class="pipeline-step">✅ Validate</span>
</div>
""", unsafe_allow_html=True)

# Status bar
cols = st.columns(5)
with cols[0]:
    if ollama_ok:
        st.success(f"Ollama ({len(models)})")
    else:
        st.error("Ollama Offline")
with cols[1]:
    if rag_system["status"] == "ready":
        st.success("RAG Ready")
    else:
        st.error("RAG Error")
with cols[2]:
    st.info(f"{rag_system.get('llm_model', 'N/A').split(':')[0]}")
with cols[3]:
    chunk_count = stats.get('total_chunks', 0)
    if chunk_count > 0:
        st.success(f"{chunk_count} Chunks")
    else:
        st.warning("0 Chunks")
with cols[4]:
    st.info(f"{rag_system.get('embed_model', 'N/A').split(':')[0]}")

if rag_system["status"] == "error":
    with st.expander("RAG Error Details"):
        st.code(rag_system["error"])

st.markdown("---")

# Sidebar
with st.sidebar:
    st.markdown("## 📚 Document Filter")

    if indexed_docs:
        st.markdown(f"**{len(indexed_docs)} documents indexed**")

        # All-documents option
        if st.button(
            "All Documents",
            key="filter_all",
            type="primary" if st.session_state.doc_filter is None else "secondary",
            use_container_width=True
        ):
            st.session_state.doc_filter = None
            st.rerun()

        st.markdown("---")
        st.markdown("**Filter by document:**")

        # Document list
        for doc in indexed_docs[:10]:
            doc_id = doc.get("document_id", "unknown")
            chunk_count = doc.get("chunk_count", 0)
            is_selected = st.session_state.doc_filter == doc_id

            if st.button(
                f"📄 {doc_id[:20]}... ({chunk_count})",
                key=f"filter_{doc_id}",
                type="primary" if is_selected else "secondary",
                use_container_width=True
            ):
                st.session_state.doc_filter = doc_id
                st.rerun()

        if len(indexed_docs) > 10:
            st.caption(f"... and {len(indexed_docs) - 10} more")

        # Show the selected filter
        if st.session_state.doc_filter:
            st.markdown("---")
            st.info(f"Filtering: {st.session_state.doc_filter[:25]}...")
            if st.button("Clear Filter"):
                st.session_state.doc_filter = None
                st.rerun()
    else:
        st.info("No documents indexed yet")

    st.markdown("---")
    st.markdown("## 📤 Quick Index")
    st.caption("Index text directly without leaving this page")

    quick_text = st.text_area("Paste text:", height=120, key="quick_text",
                              placeholder="Paste document text here...")
    quick_name = st.text_input("Name:", value="quick_doc", key="quick_name")

    if st.button("📥 Index Now", type="primary", use_container_width=True,
                 disabled=(rag_system["status"] != "ready")):
        if quick_text.strip():
            with st.spinner("Indexing..."):
                doc_id = f"{quick_name}_{hashlib.md5(quick_text[:50].encode()).hexdigest()[:8]}"
                result = index_document(
                    text=quick_text,
                    document_id=doc_id,
                    metadata={"filename": quick_name, "source": "quick_index"}
                )
                if result["success"]:
                    st.session_state.quick_indexed.append(quick_name)
                    st.success(f"{result['num_chunks']} chunks indexed!")
                    st.rerun()
                else:
                    st.error(f"Error: {result['error']}")
        else:
            st.warning("Enter some text first")

    # Recently indexed
    if st.session_state.quick_indexed:
        st.markdown("---")
        st.markdown("### Recently Indexed")
        for doc in st.session_state.quick_indexed[-5:]:
            st.caption(f"• {doc}")

    st.markdown("---")
    st.markdown("### Options")

    show_sources = st.checkbox("Show sources", value=True)
    show_metrics = st.checkbox("Show metrics", value=True)
    show_chunk_preview = st.checkbox("Show chunk preview", value=False)

    if st.button("Clear Chat"):
        st.session_state.messages = []
        st.rerun()

# Main chat area
if stats.get('total_chunks', 0) == 0:
    st.warning("No documents indexed yet!")
    st.markdown("""
    **To get started:**
    1. Use the **Quick Index** in the sidebar to paste and index text
    2. Or go to the **🔬 Live Processing** page to upload and process documents

    Once you've indexed some content, come back here to ask questions!
    """)

    # Sample text for a quick start
    with st.expander("Try with sample text"):
        sample = """SPARKNET is a multi-agentic document intelligence framework.
It uses RAG (Retrieval-Augmented Generation) for document Q&A.

Key features:
- PDF, TXT, MD document processing
- Visual chunk segmentation
- Hybrid retrieval (dense + sparse)
- Cross-encoder reranking
- Grounded answer generation with citations
- Hallucination detection and validation

The system uses multiple specialized agents:
1. Query Planner - analyzes and decomposes queries
2. Retriever - performs hybrid search
3. Reranker - scores relevance with cross-encoder
4. Synthesizer - generates grounded answers
5. Critic - validates for hallucination"""

        st.code(sample, language=None)
        if st.button("Index This Sample"):
            result = index_document(
                text=sample,
                document_id="sparknet_sample",
                metadata={"filename": "sparknet_sample", "source": "sample"}
            )
            if result["success"]:
                st.success(f"Indexed {result['num_chunks']} chunks!")
                st.rerun()

    # Navigation
    col1, col2 = st.columns(2)
    with col1:
        if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
            st.switch_page("pages/1_🔬_Live_Processing.py")
    with col2:
        if st.button("📄 Go to Document Viewer", use_container_width=True):
            st.switch_page("pages/5_📄_Document_Viewer.py")

else:
    # Check if we need to process a pending user message (from a sample-question click)
    pending_query = None
    if st.session_state.messages and st.session_state.messages[-1]["role"] == "user":
        # No assistant response follows the last user message yet
        pending_query = st.session_state.messages[-1]["content"]

    # Display chat history (except the pending query, which is processed below)
    messages_to_display = st.session_state.messages[:-1] if pending_query else st.session_state.messages
    for msg in messages_to_display:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])

            if msg["role"] == "assistant" and "metadata" in msg:
                meta = msg["metadata"]

                # Metrics
                if show_metrics and meta:
                    m_cols = st.columns(4)
                    with m_cols[0]:
                        st.markdown(f'<div class="metric-box"><div class="metric-value">{meta.get("latency_ms", 0):.0f}ms</div><div class="metric-label">Latency</div></div>', unsafe_allow_html=True)
                    with m_cols[1]:
                        st.markdown(f'<div class="metric-box"><div class="metric-value">{meta.get("num_sources", 0)}</div><div class="metric-label">Sources</div></div>', unsafe_allow_html=True)
                    with m_cols[2]:
                        conf = meta.get("confidence", 0)
                        color = "#4ECDC4" if conf > 0.6 else "#ffc107" if conf > 0.3 else "#dc3545"
                        st.markdown(f'<div class="metric-box"><div class="metric-value" style="color:{color}">{conf:.0%}</div><div class="metric-label">Confidence</div></div>', unsafe_allow_html=True)
                    with m_cols[3]:
                        val = "✓" if meta.get("validated") else "?"
                        st.markdown(f'<div class="metric-box"><div class="metric-value">{val}</div><div class="metric-label">Validated</div></div>', unsafe_allow_html=True)

                # Sources
                if show_sources and "citations" in msg and msg["citations"]:
                    with st.expander(f"Sources ({len(msg['citations'])})"):
                        for i, cite in enumerate(msg["citations"]):
                            color = get_chunk_color(i)
                            st.markdown(f"""
                            <div class="source-card" style="border-left-color: {color};">
                                <div class="source-header">
                                    <strong>[{cite.get('index', i+1)}]</strong> • Relevance: {cite.get('relevance_score', 0):.0%}
                                </div>
                                <div class="source-text">{cite.get('text_snippet', '')[:300]}...</div>
                            </div>
                            """, unsafe_allow_html=True)

    # Show the current filter
    if st.session_state.doc_filter:
        st.info(f"Searching in: **{st.session_state.doc_filter}** — [Clear filter in sidebar]")

    # Process a pending query from a sample-question click
    if pending_query:
        with st.chat_message("user"):
            st.markdown(pending_query)

        with st.chat_message("assistant"):
            if rag_system["status"] != "ready":
                st.error("RAG system not ready")
                st.session_state.messages.append({"role": "assistant", "content": "RAG system not ready"})
            else:
                # Show progress
                progress = st.progress(0)
                status = st.empty()

                stages = ["Planning", "Retrieving", "Reranking", "Generating", "Validating"]
                for i, stage in enumerate(stages):
                    status.markdown(f"**{stage}...**")
                    progress.progress((i + 1) * 20)
                    time.sleep(0.15)

                # Build filters for the document
                filters = None
                if st.session_state.doc_filter:
                    filters = {"document_id": st.session_state.doc_filter}

                # Query RAG
                response, error = query_rag(pending_query, filters=filters)

                progress.empty()
                status.empty()

                if error:
                    st.error(f"Error: {error}")
                    st.session_state.messages.append({"role": "assistant", "content": f"Error: {error}"})
                elif response:
                    # Display the answer
                    st.markdown(response.answer)

                    # Build metadata
                    metadata = {
                        "latency_ms": response.latency_ms,
                        "num_sources": response.num_sources,
                        "confidence": response.confidence,
                        "validated": response.validated,
                    }

                    # Display metrics
                    if show_metrics:
                        m_cols = st.columns(4)
                        with m_cols[0]:
                            st.markdown(f'<div class="metric-box"><div class="metric-value">{metadata.get("latency_ms", 0):.0f}ms</div><div class="metric-label">Latency</div></div>', unsafe_allow_html=True)
                        with m_cols[1]:
                            st.markdown(f'<div class="metric-box"><div class="metric-value">{metadata.get("num_sources", 0)}</div><div class="metric-label">Sources</div></div>', unsafe_allow_html=True)
                        with m_cols[2]:
                            conf = metadata.get("confidence", 0)
                            color = "#4ECDC4" if conf > 0.6 else "#ffc107" if conf > 0.3 else "#dc3545"
                            st.markdown(f'<div class="metric-box"><div class="metric-value" style="color:{color}">{conf:.0%}</div><div class="metric-label">Confidence</div></div>', unsafe_allow_html=True)
                        with m_cols[3]:
                            val = "✓" if metadata.get("validated") else "?"
                            st.markdown(f'<div class="metric-box"><div class="metric-value">{val}</div><div class="metric-label">Validated</div></div>', unsafe_allow_html=True)

                    # Build the citations list
                    citations = []
                    if hasattr(response, 'citations') and response.citations:
                        for i, cite in enumerate(response.citations):
                            citations.append({
                                "index": i + 1,
                                "text_snippet": cite.text_snippet if hasattr(cite, 'text_snippet') else str(cite),
                                "relevance_score": cite.relevance_score if hasattr(cite, 'relevance_score') else 0.0,
                            })

                    # Store the message with metadata
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response.answer,
                        "metadata": metadata,
                        "citations": citations,
                    })
                else:
                    st.warning("No response from RAG system")
                    st.session_state.messages.append({"role": "assistant", "content": "No response from RAG system"})
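The pending-query branch above and the chat-input branch below run the same plan → retrieve → rerank → generate → validate turn. A hypothetical helper (not in the commit) could hold the shared filter-building and query call:

```python
def run_rag_turn(query: str):
    """Sketch only: query the RAG system with the active document filter applied."""
    filters = None
    if st.session_state.doc_filter:
        filters = {"document_id": st.session_state.doc_filter}
    # query_rag returns a (response, error) pair, as used in both branches
    return query_rag(query, filters=filters)
```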
    # Chat input
    if prompt := st.chat_input("Ask about your documents..."):
        # Add user message
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            if rag_system["status"] != "ready":
                st.error("RAG system not ready")
                st.session_state.messages.append({"role": "assistant", "content": "RAG system not ready"})
            else:
                # Show progress
                progress = st.progress(0)
                status = st.empty()

                stages = ["Planning", "Retrieving", "Reranking", "Generating", "Validating"]
                for i, stage in enumerate(stages):
                    status.markdown(f"**{stage}...**")
                    progress.progress((i + 1) * 20)
                    time.sleep(0.15)

                # Build filters for document
                filters = None
                if st.session_state.doc_filter:
                    filters = {"document_id": st.session_state.doc_filter}

                # Query RAG
                response, error = query_rag(prompt, filters=filters)

                progress.empty()
                status.empty()

                if error:
                    st.error(f"Error: {error}")
                    st.session_state.messages.append({"role": "assistant", "content": f"Error: {error}"})
                elif response:
                    # Display answer
                    st.markdown(response.answer)

                    # Build metadata
                    metadata = {
                        "latency_ms": response.latency_ms,
                        "num_sources": response.num_sources,
                        "confidence": response.confidence,
                        "validated": response.validated,
                    }

                    # Display metrics
                    if show_metrics:
                        m_cols = st.columns(4)
                        with m_cols[0]:
                            st.markdown(f'<div class="metric-box"><div class="metric-value">{response.latency_ms:.0f}ms</div><div class="metric-label">Latency</div></div>', unsafe_allow_html=True)
                        with m_cols[1]:
                            st.markdown(f'<div class="metric-box"><div class="metric-value">{response.num_sources}</div><div class="metric-label">Sources</div></div>', unsafe_allow_html=True)
                        with m_cols[2]:
                            conf_color = "#4ECDC4" if response.confidence > 0.6 else "#ffc107" if response.confidence > 0.3 else "#dc3545"
                            st.markdown(f'<div class="metric-box"><div class="metric-value" style="color:{conf_color}">{response.confidence:.0%}</div><div class="metric-label">Confidence</div></div>', unsafe_allow_html=True)
                        with m_cols[3]:
                            val_icon = "✓" if response.validated else "?"
                            st.markdown(f'<div class="metric-box"><div class="metric-value">{val_icon}</div><div class="metric-label">Validated</div></div>', unsafe_allow_html=True)

                    # Display sources
                    citations = []
                    if show_sources and response.citations:
                        with st.expander(f"Sources ({len(response.citations)})"):
                            for i, cite in enumerate(response.citations):
                                color = get_chunk_color(i)
                                citations.append({
                                    "index": cite.index,
                                    "relevance_score": cite.relevance_score,
                                    "text_snippet": cite.text_snippet,
                                })
                                st.markdown(f"""
                                <div class="source-card" style="border-left-color: {color};">
                                    <div class="source-header">
                                        <strong>[{cite.index}]</strong> • Relevance: {cite.relevance_score:.0%}
                                    </div>
                                    <div class="source-text">{cite.text_snippet[:300]}...</div>
                                </div>
                                """, unsafe_allow_html=True)

                    # Chunk preview (semantic search)
                    if show_chunk_preview:
                        with st.expander("Chunk Preview (Top Matches)"):
                            chunks = search_similar_chunks(
                                prompt,
                                top_k=5,
                                doc_filter=st.session_state.doc_filter
                            )
                            for i, chunk in enumerate(chunks):
                                sim = chunk.get("similarity", 0)
                                color = "#4ECDC4" if sim > 0.7 else "#ffc107" if sim > 0.5 else "#8b949e"
                                st.markdown(f"""
                                <div class="chunk-preview" style="border-left: 3px solid {color};">
                                    <div style="font-size: 10px; color: #8b949e;">
                                        Similarity: <span style="color: {color};">{sim:.0%}</span> |
                                        Doc: {chunk.get('document_id', 'N/A')[:15]}...
                                    </div>
                                    <div style="margin-top: 4px;">{chunk.get('text', '')[:200]}...</div>
                                </div>
                                """, unsafe_allow_html=True)

                    # Save to history
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response.answer,
                        "citations": citations,
                        "metadata": metadata,
                    })

# Dynamic suggested questions based on document content
st.markdown("---")
st.markdown("### 💡 Try asking")

# Get indexed documents for question generation
indexed_docs = get_indexed_documents()
state_manager = get_state_manager()

# Generate dynamic questions based on document content
dynamic_questions = generate_dynamic_questions(state_manager, indexed_docs, max_questions=4)

# Display as clickable buttons
sample_cols = st.columns(len(dynamic_questions))
for i, q in enumerate(dynamic_questions):
    with sample_cols[i]:
        # Truncate long questions for button display
        display_q = q if len(q) <= 35 else q[:32] + "..."
        if st.button(display_q, key=f"sample_{i}", use_container_width=True,
                     disabled=(stats.get('total_chunks', 0) == 0),
                     help=q if len(q) > 35 else None):
            st.session_state.messages.append({"role": "user", "content": q})
            st.rerun()

# Show hint about dynamic questions
if stats.get('total_chunks', 0) > 0:
    st.caption("📌 Questions are generated based on your indexed documents")

# Architecture info
with st.expander("Multi-Agent RAG Architecture"):
    st.markdown("""
    ```
    Query → [Query Planner] → [Retriever] → [Reranker] → [Synthesizer] → [Critic] → Answer
               ↓                  ↓              ↓              ↓              ↓
           Decompose         Dense+Sparse   Cross-Encoder   Grounded     Hallucination
           & Expand          + RRF Fusion     Scoring       Citations      Detection
    ```

    **Agents:**
    - **Query Planner**: Analyzes intent, decomposes complex queries, expands terms
    - **Retriever**: Hybrid search combining dense (embedding) and sparse (BM25) retrieval
    - **Reranker**: Cross-encoder scoring for precision, diversity via MMR
    - **Synthesizer**: Generates grounded answers with proper citations
    - **Critic**: Validates for hallucination, checks citation accuracy
    """)
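For reference, the same pipeline can be driven headlessly with the helpers this page already imports from `rag_config`. A minimal sketch, assuming the `demo` directory is on `sys.path` and Ollama is running; the sample text and IDs are made up:

```python
from rag_config import index_document, query_rag

result = index_document(
    text="SPARKNET validates generated answers with a critic agent.",
    document_id="smoke_test",
    metadata={"filename": "smoke_test", "source": "script"},
)
if result["success"]:
    response, error = query_rag("How are answers validated?")
    if response:
        print(response.answer)
        print(f"confidence: {response.confidence:.0%}, validated: {response.validated}")
```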
demo/pages/3_📊_Document_Comparison.py
ADDED
@@ -0,0 +1,528 @@
"""
Document Comparison - SPARKNET

Compare documents using semantic similarity, structure analysis,
and content comparison with real embedding-based similarity.
"""

import streamlit as st
import sys
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "demo"))

from state_manager import (
    get_state_manager,
    render_global_status_bar,
)
from rag_config import (
    get_indexed_documents,
    compute_document_similarity,
    search_similar_chunks,
    check_ollama,
    get_unified_rag_system,
)

st.set_page_config(page_title="Document Comparison - SPARKNET", page_icon="📊", layout="wide")

# Custom CSS
st.markdown("""
<style>
.comparison-card {
    background: #161b22;
    border-radius: 10px;
    padding: 15px;
    margin: 10px 0;
    border: 1px solid #30363d;
}
.doc-header {
    font-size: 16px;
    font-weight: bold;
    color: #4ECDC4;
    margin-bottom: 10px;
}
.similarity-badge {
    display: inline-block;
    padding: 8px 16px;
    border-radius: 20px;
    font-weight: bold;
    font-size: 18px;
}
.sim-high {
    background: linear-gradient(90deg, #4ECDC4 0%, #44a08d 100%);
    color: white;
}
.sim-med {
    background: linear-gradient(90deg, #ffc107 0%, #ff8800 100%);
    color: black;
}
.sim-low {
    background: linear-gradient(90deg, #dc3545 0%, #c82333 100%);
    color: white;
}
.chunk-match {
    background: #0d1117;
    border-radius: 8px;
    padding: 10px;
    margin: 8px 0;
    border-left: 4px solid;
}
.diff-added {
    background: rgba(78, 205, 196, 0.1);
    border-left-color: #4ECDC4;
}
.diff-removed {
    background: rgba(220, 53, 69, 0.1);
    border-left-color: #dc3545;
}
.diff-common {
    background: rgba(139, 148, 158, 0.1);
    border-left-color: #8b949e;
}
.metric-card {
    background: #161b22;
    border-radius: 8px;
    padding: 15px;
    text-align: center;
}
.metric-value {
    font-size: 32px;
    font-weight: bold;
}
.metric-label {
    font-size: 11px;
    color: #8b949e;
    text-transform: uppercase;
}
</style>
""", unsafe_allow_html=True)


def get_similarity_class(sim: float) -> str:
    """Get CSS class based on similarity."""
    if sim >= 0.7:
        return "sim-high"
    elif sim >= 0.4:
        return "sim-med"
    return "sim-low"


def get_similarity_color(sim: float) -> str:
    """Get color based on similarity."""
    if sim >= 0.7:
        return "#4ECDC4"
    elif sim >= 0.4:
        return "#ffc107"
    return "#dc3545"


# Initialize state manager
state_manager = get_state_manager()
rag_system = get_unified_rag_system()

# Header
st.markdown("# 📊 Document Comparison")
st.markdown("Compare documents using semantic similarity, structure analysis, and content comparison")

# Global status bar
render_global_status_bar()

st.markdown("---")

# Get documents
all_docs = state_manager.get_all_documents()
indexed_docs = get_indexed_documents()

if not all_docs and not indexed_docs:
    st.warning("No documents available for comparison")
    st.markdown("""
    ### Getting Started

    To compare documents:
    1. Go to **Live Processing** to upload and process documents
    2. Process at least 2 documents
    3. Come back here to compare them

    Features:
    - **Semantic Similarity**: Compare documents using embedding-based similarity
    - **Structure Analysis**: Compare document structure (pages, chunks, regions)
    - **Content Comparison**: Find similar passages between documents
    """)

    if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
        st.switch_page("pages/1_🔬_Live_Processing.py")

else:
    # Build document options
    doc_options = {}
    for doc in all_docs:
        doc_options[f"{doc.filename} (State)"] = {"id": doc.doc_id, "source": "state", "doc": doc}
    for doc in indexed_docs:
        doc_id = doc.get("document_id", "unknown")
        if doc_id not in [d["id"] for d in doc_options.values()]:
            doc_options[f"{doc_id} (RAG)"] = {"id": doc_id, "source": "rag", "doc": doc}

    if len(doc_options) < 2:
        st.warning("Need at least 2 documents for comparison. Process more documents first.")
    else:
        # Document selection
        st.markdown("### Select Documents to Compare")

        col1, col2 = st.columns(2)
        with col1:
            doc1_name = st.selectbox("Document 1", list(doc_options.keys()), index=0)
        with col2:
            remaining = [k for k in doc_options.keys() if k != doc1_name]
            doc2_name = st.selectbox("Document 2", remaining, index=0 if remaining else None)

        doc1_info = doc_options.get(doc1_name)
        doc2_info = doc_options.get(doc2_name)

        # Comparison type
        comparison_type = st.radio(
            "Comparison Type",
            ["Semantic Similarity", "Structure Analysis", "Content Comparison"],
            horizontal=True,
        )

        if st.button("🔍 Compare Documents", type="primary", use_container_width=True):
            st.markdown("---")

            if comparison_type == "Semantic Similarity":
                st.markdown("### Semantic Similarity Analysis")

                with st.spinner("Computing document embeddings and similarity..."):
                    # Use the compute_document_similarity function from rag_config
                    if rag_system["status"] == "ready":
                        result = compute_document_similarity(doc1_info["id"], doc2_info["id"])

                        if result.get("error"):
                            st.warning(f"Could not compute similarity: {result['error']}")
                            # Use fallback based on text overlap
                            if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                                doc1 = doc1_info["doc"]
                                doc2 = doc2_info["doc"]
                                # Simple word overlap
                                words1 = set(doc1.raw_text.lower().split())
                                words2 = set(doc2.raw_text.lower().split())
                                overlap = len(words1 & words2) / max(len(words1 | words2), 1)
                                similarity = overlap
                            else:
                                similarity = 0.5  # Default fallback
                        else:
                            similarity = result.get("similarity", 0)
                    else:
                        st.error("RAG system not ready for similarity computation")
                        similarity = 0.5

                # Display the similarity score
                sim_class = get_similarity_class(similarity)
                sim_color = get_similarity_color(similarity)

                st.markdown(f"""
                <div style="text-align: center; padding: 30px;">
                    <div class="similarity-badge {sim_class}">
                        {similarity:.0%} Similarity
                    </div>
                    <p style="color: #8b949e; margin-top: 15px;">
                        Based on embedding-based semantic similarity
                    </p>
                </div>
                """, unsafe_allow_html=True)

                # Similarity interpretation
                if similarity >= 0.7:
                    st.success("These documents are highly similar in content and meaning.")
                elif similarity >= 0.4:
                    st.warning("These documents have moderate similarity - some shared topics.")
                else:
                    st.info("These documents are quite different in content.")

                # Document details
                col1, col2 = st.columns(2)

                with col1:
                    st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
                    if doc1_info["source"] == "state":
                        doc = doc1_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc1_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))

                with col2:
                    st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
                    if doc2_info["source"] == "state":
                        doc = doc2_info["doc"]
                        st.metric("Pages", doc.page_count)
                        st.metric("Chunks", len(doc.chunks))
                        st.metric("Characters", f"{len(doc.raw_text):,}")
                    else:
                        doc = doc2_info["doc"]
                        st.metric("Chunks", doc.get("chunk_count", "N/A"))

            elif comparison_type == "Structure Analysis":
                st.markdown("### Document Structure Comparison")

                col1, col2 = st.columns(2)

                # Get structure data
                def get_structure(info):
                    if info["source"] == "state":
                        doc = info["doc"]
                        return {
                            "Pages": doc.page_count,
                            "Chunks": len(doc.chunks),
                            "OCR Regions": len(doc.ocr_regions),
                            "Layout Regions": len(doc.layout_data.get("regions", [])),
                            "Characters": len(doc.raw_text),
                            "Words": len(doc.raw_text.split()),
                        }
                    else:
                        doc = info["doc"]
                        return {
                            "Chunks": doc.get("chunk_count", 0),
                            "Source": doc.get("source_path", "N/A"),
                        }

                struct1 = get_structure(doc1_info)
                struct2 = get_structure(doc2_info)

                with col1:
                    st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
                    for key, value in struct1.items():
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)

                with col2:
                    st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
                    for key, value in struct2.items():
                        if isinstance(value, int) and value > 1000:
                            st.metric(key, f"{value:,}")
                        else:
                            st.metric(key, value)

                # Structure comparison chart
                st.markdown("---")
                st.markdown("### Comparison Chart")

                common_keys = [k for k in struct1.keys() if k in struct2 and isinstance(struct1[k], (int, float))]
                if common_keys:
                    comparison_df = pd.DataFrame({
                        "Metric": common_keys,
                        doc1_name.split(' (')[0]: [struct1[k] for k in common_keys],
                        doc2_name.split(' (')[0]: [struct2[k] for k in common_keys],
                    })
                    st.bar_chart(comparison_df.set_index("Metric"))

                # Chunk type comparison (if available)
                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    st.markdown("---")
                    st.markdown("### Chunk Type Distribution")

                    def get_chunk_types(doc):
                        types = {}
                        for chunk in doc.chunks:
                            t = chunk.get("chunk_type", "unknown")
                            types[t] = types.get(t, 0) + 1
                        return types

                    types1 = get_chunk_types(doc1_info["doc"])
                    types2 = get_chunk_types(doc2_info["doc"])

                    all_types = set(types1.keys()) | set(types2.keys())

                    type_df = pd.DataFrame({
                        "Type": list(all_types),
                        doc1_name.split(' (')[0]: [types1.get(t, 0) for t in all_types],
                        doc2_name.split(' (')[0]: [types2.get(t, 0) for t in all_types],
                    })
                    st.dataframe(type_df, width='stretch', hide_index=True)

            else:  # Content Comparison
                st.markdown("### Content Comparison")

                if doc1_info["source"] == "state" and doc2_info["source"] == "state":
                    doc1 = doc1_info["doc"]
                    doc2 = doc2_info["doc"]

                    # Word overlap analysis
                    words1 = set(doc1.raw_text.lower().split())
                    words2 = set(doc2.raw_text.lower().split())

                    common_words = words1 & words2
                    only_doc1 = words1 - words2
                    only_doc2 = words2 - words1

                    # Metrics
                    metric_cols = st.columns(4)
                    metric_cols[0].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #4ECDC4;">{len(common_words):,}</div>
                        <div class="metric-label">Common Words</div>
                    </div>
                    """, unsafe_allow_html=True)
                    metric_cols[1].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #FF6B6B;">{len(only_doc1):,}</div>
                        <div class="metric-label">Only in Doc 1</div>
                    </div>
                    """, unsafe_allow_html=True)
                    metric_cols[2].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #45B7D1;">{len(only_doc2):,}</div>
                        <div class="metric-label">Only in Doc 2</div>
                    </div>
                    """, unsafe_allow_html=True)

                    overlap_pct = len(common_words) / max(len(words1 | words2), 1)
                    metric_cols[3].markdown(f"""
                    <div class="metric-card">
                        <div class="metric-value" style="color: #ffc107;">{overlap_pct:.0%}</div>
                        <div class="metric-label">Word Overlap</div>
                    </div>
                    """, unsafe_allow_html=True)

                    # Similar passages
                    st.markdown("---")
                    st.markdown("### Similar Passages")

                    # Find similar chunks between documents
                    with st.spinner("Finding similar passages..."):
                        similar_passages = []

                        # Compare first 10 chunks from doc1 against doc2
                        for i, chunk1 in enumerate(doc1.chunks[:10]):
                            text1 = chunk1.get("text", "")
                            words_c1 = set(text1.lower().split())

                            best_match = None
                            best_score = 0
+
|
| 409 |
+
for j, chunk2 in enumerate(doc2.chunks):
|
| 410 |
+
text2 = chunk2.get("text", "")
|
| 411 |
+
words_c2 = set(text2.lower().split())
|
| 412 |
+
|
| 413 |
+
# Jaccard similarity
|
| 414 |
+
if words_c1 and words_c2:
|
| 415 |
+
score = len(words_c1 & words_c2) / len(words_c1 | words_c2)
|
| 416 |
+
if score > best_score and score > 0.3:
|
| 417 |
+
best_score = score
|
| 418 |
+
best_match = {
|
| 419 |
+
"doc1_chunk": i,
|
| 420 |
+
"doc2_chunk": j,
|
| 421 |
+
"doc1_text": text1[:200],
|
| 422 |
+
"doc2_text": text2[:200],
|
| 423 |
+
"similarity": score,
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
if best_match:
|
| 427 |
+
similar_passages.append(best_match)
|
| 428 |
+
|
| 429 |
+
if similar_passages:
|
| 430 |
+
# Sort by similarity
|
| 431 |
+
similar_passages.sort(key=lambda x: x["similarity"], reverse=True)
|
| 432 |
+
|
| 433 |
+
for i, match in enumerate(similar_passages[:5]):
|
| 434 |
+
sim_color = get_similarity_color(match["similarity"])
|
| 435 |
+
with st.expander(f"Match {i+1} - Similarity: {match['similarity']:.0%}"):
|
| 436 |
+
col1, col2 = st.columns(2)
|
| 437 |
+
with col1:
|
| 438 |
+
st.markdown(f"**{doc1_name.split(' (')[0]}** (Chunk {match['doc1_chunk']+1})")
|
| 439 |
+
st.markdown(f"""
|
| 440 |
+
<div class="chunk-match diff-common">
|
| 441 |
+
{match['doc1_text']}...
|
| 442 |
+
</div>
|
| 443 |
+
""", unsafe_allow_html=True)
|
| 444 |
+
with col2:
|
| 445 |
+
st.markdown(f"**{doc2_name.split(' (')[0]}** (Chunk {match['doc2_chunk']+1})")
|
| 446 |
+
st.markdown(f"""
|
| 447 |
+
<div class="chunk-match diff-common">
|
| 448 |
+
{match['doc2_text']}...
|
| 449 |
+
</div>
|
| 450 |
+
""", unsafe_allow_html=True)
|
| 451 |
+
else:
|
| 452 |
+
st.info("No significantly similar passages found between documents")
|
| 453 |
+
|
| 454 |
+
# Key terms comparison
|
| 455 |
+
st.markdown("---")
|
| 456 |
+
st.markdown("### Key Terms Comparison")
|
| 457 |
+
|
| 458 |
+
# Get most frequent words (simple approach)
|
| 459 |
+
from collections import Counter
|
| 460 |
+
|
| 461 |
+
def get_top_words(text, n=20):
|
| 462 |
+
words = text.lower().split()
|
| 463 |
+
# Filter out common words
|
| 464 |
+
stopwords = {"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
| 465 |
+
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
| 466 |
+
"should", "may", "might", "must", "and", "or", "but", "if", "then",
|
| 467 |
+
"so", "to", "of", "in", "for", "on", "with", "at", "by", "from",
|
| 468 |
+
"this", "that", "these", "those", "it", "its"}
|
| 469 |
+
words = [w for w in words if len(w) > 3 and w not in stopwords]
|
| 470 |
+
return Counter(words).most_common(n)
|
| 471 |
+
|
| 472 |
+
top1 = get_top_words(doc1.raw_text)
|
| 473 |
+
top2 = get_top_words(doc2.raw_text)
|
| 474 |
+
|
| 475 |
+
col1, col2 = st.columns(2)
|
| 476 |
+
with col1:
|
| 477 |
+
st.markdown(f"**Top terms in {doc1_name.split(' (')[0]}:**")
|
| 478 |
+
for word, count in top1[:10]:
|
| 479 |
+
in_doc2 = word in [w for w, c in top2]
|
| 480 |
+
color = "#4ECDC4" if in_doc2 else "#8b949e"
|
| 481 |
+
st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
|
| 482 |
+
|
| 483 |
+
with col2:
|
| 484 |
+
st.markdown(f"**Top terms in {doc2_name.split(' (')[0]}:**")
|
| 485 |
+
for word, count in top2[:10]:
|
| 486 |
+
in_doc1 = word in [w for w, c in top1]
|
| 487 |
+
color = "#4ECDC4" if in_doc1 else "#8b949e"
|
| 488 |
+
st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
|
| 489 |
+
|
| 490 |
+
else:
|
| 491 |
+
st.info("Content comparison requires both documents to be in processed state")
|
| 492 |
+
|
| 493 |
+
# Export options
|
| 494 |
+
st.markdown("---")
|
| 495 |
+
st.markdown("### Export Comparison")
|
| 496 |
+
|
| 497 |
+
export_cols = st.columns(3)
|
| 498 |
+
with export_cols[0]:
|
| 499 |
+
if st.button("📄 Export as JSON", use_container_width=True):
|
| 500 |
+
import json
|
| 501 |
+
export_data = {
|
| 502 |
+
"document1": doc1_name,
|
| 503 |
+
"document2": doc2_name,
|
| 504 |
+
"comparison_type": comparison_type,
|
| 505 |
+
}
|
| 506 |
+
st.json(export_data)
|
| 507 |
+
with export_cols[1]:
|
| 508 |
+
st.button("📊 Export as CSV", disabled=True, use_container_width=True)
|
| 509 |
+
with export_cols[2]:
|
| 510 |
+
st.button("📋 Export as PDF", disabled=True, use_container_width=True)
|
| 511 |
+
|
| 512 |
+
# Navigation
|
| 513 |
+
st.markdown("---")
|
| 514 |
+
st.markdown("### Navigation")
|
| 515 |
+
nav_cols = st.columns(4)
|
| 516 |
+
|
| 517 |
+
with nav_cols[0]:
|
| 518 |
+
if st.button("🔬 Live Processing", use_container_width=True):
|
| 519 |
+
st.switch_page("pages/1_🔬_Live_Processing.py")
|
| 520 |
+
with nav_cols[1]:
|
| 521 |
+
if st.button("💬 Interactive RAG", use_container_width=True):
|
| 522 |
+
st.switch_page("pages/2_💬_Interactive_RAG.py")
|
| 523 |
+
with nav_cols[2]:
|
| 524 |
+
if st.button("🎯 Evidence Viewer", use_container_width=True):
|
| 525 |
+
st.switch_page("pages/4_🎯_Evidence_Viewer.py")
|
| 526 |
+
with nav_cols[3]:
|
| 527 |
+
if st.button("📄 Document Viewer", use_container_width=True):
|
| 528 |
+
st.switch_page("pages/5_📄_Document_Viewer.py")
|
demo/pages/4_🎯_Evidence_Viewer.py
ADDED
@@ -0,0 +1,529 @@
+"""
+Evidence Viewer - SPARKNET
+
+Visualize extracted OCR regions, layout, and evidence grounding with
+confidence-based coloring and interactivity.
+"""
+
+import streamlit as st
+import sys
+from pathlib import Path
+import base64
+
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+sys.path.insert(0, str(PROJECT_ROOT / "demo"))
+
+from state_manager import (
+    get_state_manager,
+    render_global_status_bar,
+)
+from rag_config import (
+    get_indexed_documents,
+    get_chunks_for_document,
+    check_ollama,
+)
+
+st.set_page_config(page_title="Evidence Viewer - SPARKNET", page_icon="🎯", layout="wide")
+
+# Custom CSS with confidence-based colors
+st.markdown("""
+<style>
+.region-card {
+    background: #161b22;
+    border-radius: 10px;
+    padding: 12px;
+    margin: 8px 0;
+    border-left: 4px solid;
+    transition: transform 0.2s;
+}
+.region-card:hover {
+    transform: translateX(4px);
+}
+.confidence-high {
+    border-left-color: #4ECDC4 !important;
+    background: linear-gradient(90deg, rgba(78,205,196,0.1) 0%, transparent 100%);
+}
+.confidence-med {
+    border-left-color: #ffc107 !important;
+    background: linear-gradient(90deg, rgba(255,193,7,0.1) 0%, transparent 100%);
+}
+.confidence-low {
+    border-left-color: #dc3545 !important;
+    background: linear-gradient(90deg, rgba(220,53,69,0.1) 0%, transparent 100%);
+}
+.region-header {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    margin-bottom: 8px;
+}
+.region-type {
+    font-weight: bold;
+    text-transform: uppercase;
+    font-size: 12px;
+}
+.region-conf {
+    font-size: 14px;
+    font-weight: bold;
+}
+.region-text {
+    font-family: 'Monaco', 'Menlo', monospace;
+    font-size: 13px;
+    color: #c9d1d9;
+    line-height: 1.5;
+}
+.region-meta {
+    font-size: 10px;
+    color: #8b949e;
+    margin-top: 8px;
+}
+.bbox-display {
+    background: #0d1117;
+    padding: 4px 8px;
+    border-radius: 4px;
+    font-family: monospace;
+    font-size: 11px;
+}
+.page-thumbnail {
+    border: 2px solid #30363d;
+    border-radius: 8px;
+    padding: 10px;
+    background: #0d1117;
+}
+.page-thumbnail.active {
+    border-color: #4ECDC4;
+}
+.stats-card {
+    background: #161b22;
+    border-radius: 8px;
+    padding: 15px;
+    text-align: center;
+}
+.stats-value {
+    font-size: 28px;
+    font-weight: bold;
+    color: #4ECDC4;
+}
+.stats-label {
+    font-size: 11px;
+    color: #8b949e;
+    text-transform: uppercase;
+}
+.copy-btn {
+    background: #21262d;
+    border: none;
+    padding: 4px 8px;
+    border-radius: 4px;
+    font-size: 11px;
+    cursor: pointer;
+}
+</style>
+""", unsafe_allow_html=True)
+
+
+def get_confidence_class(conf: float) -> str:
+    """Get CSS class based on confidence."""
+    if conf >= 0.8:
+        return "confidence-high"
+    elif conf >= 0.6:
+        return "confidence-med"
+    return "confidence-low"
+
+
+def get_confidence_color(conf: float) -> str:
+    """Get color based on confidence."""
+    if conf >= 0.8:
+        return "#4ECDC4"
+    elif conf >= 0.6:
+        return "#ffc107"
+    return "#dc3545"
+
+
+def get_type_color(region_type: str) -> str:
+    """Get color for region type."""
+    colors = {
+        "title": "#FF6B6B",
+        "heading": "#FF8E6B",
+        "paragraph": "#4ECDC4",
+        "text": "#45B7D1",
+        "list": "#96CEB4",
+        "table": "#FFEAA7",
+        "figure": "#DDA0DD",
+        "header": "#98D8C8",
+        "footer": "#8b949e",
+    }
+    return colors.get(region_type.lower(), "#666")
+
+
+# Initialize state manager
+state_manager = get_state_manager()
+
+# Header
+st.markdown("# 🎯 Evidence Viewer")
+st.markdown("Visualize OCR regions, layout structure, and evidence grounding with confidence scoring")
+
+# Global status bar
+render_global_status_bar()
+
+st.markdown("---")
+
+# Get documents from state
+all_docs = state_manager.get_all_documents()
+indexed_docs = get_indexed_documents()
+
+# Sidebar for document selection
+with st.sidebar:
+    st.markdown("## 📚 Select Document")
+
+    if all_docs:
+        doc_options = {f"{d.filename} ({len(d.ocr_regions)} regions)": d.doc_id for d in all_docs}
+        selected_doc_name = st.selectbox("Processed Documents", list(doc_options.keys()))
+        selected_doc_id = doc_options.get(selected_doc_name)
+
+        if selected_doc_id:
+            state_manager.set_active_document(selected_doc_id)
+    else:
+        st.info("No documents processed yet")
+        selected_doc_id = None
+
+    st.markdown("---")
+    st.markdown("## 🎨 Display Options")
+
+    show_ocr = st.checkbox("Show OCR Regions", value=True)
+    show_layout = st.checkbox("Show Layout Regions", value=True)
+    show_bbox = st.checkbox("Show Bounding Boxes", value=True)
+
+    st.markdown("---")
+    st.markdown("## 🎚️ Filters")
+
+    min_confidence = st.slider("Min Confidence", 0.0, 1.0, 0.0, 0.1)
+
+    region_types = ["All", "title", "heading", "paragraph", "text", "list", "table", "figure"]
+    selected_type = st.selectbox("Region Type", region_types)
+
+# Main content
+active_doc = state_manager.get_active_document()
+
+if active_doc:
+    # Document header
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        st.markdown(f"## 📄 {active_doc.filename}")
+        st.caption(f"ID: `{active_doc.doc_id}` | {active_doc.page_count} pages")
+    with col2:
+        if active_doc.indexed:
+            st.success("Indexed")
+        else:
+            st.warning("Not indexed")
+
+    # Statistics cards
+    stat_cols = st.columns(5)
+
+    # Calculate stats
+    ocr_regions = active_doc.ocr_regions
+    layout_regions = active_doc.layout_data.get("regions", [])
+
+    avg_ocr_conf = sum(r.get("confidence", 0) for r in ocr_regions) / len(ocr_regions) if ocr_regions else 0
+    high_conf_count = len([r for r in ocr_regions if r.get("confidence", 0) >= 0.8])
+    med_conf_count = len([r for r in ocr_regions if 0.6 <= r.get("confidence", 0) < 0.8])
+    low_conf_count = len([r for r in ocr_regions if r.get("confidence", 0) < 0.6])
+
+    stat_cols[0].markdown(f"""
+    <div class="stats-card">
+        <div class="stats-value">{len(ocr_regions)}</div>
+        <div class="stats-label">OCR Regions</div>
+    </div>
+    """, unsafe_allow_html=True)
+
+    stat_cols[1].markdown(f"""
+    <div class="stats-card">
+        <div class="stats-value">{len(layout_regions)}</div>
+        <div class="stats-label">Layout Regions</div>
+    </div>
+    """, unsafe_allow_html=True)
+
+    stat_cols[2].markdown(f"""
+    <div class="stats-card">
+        <div class="stats-value" style="color: #4ECDC4;">{avg_ocr_conf:.0%}</div>
+        <div class="stats-label">Avg Confidence</div>
+    </div>
+    """, unsafe_allow_html=True)
+
+    stat_cols[3].markdown(f"""
+    <div class="stats-card">
+        <div class="stats-value" style="color: #4ECDC4;">{high_conf_count}</div>
+        <div class="stats-label">High Conf (>80%)</div>
+    </div>
+    """, unsafe_allow_html=True)
+
+    stat_cols[4].markdown(f"""
+    <div class="stats-card">
+        <div class="stats-value" style="color: #dc3545;">{low_conf_count}</div>
+        <div class="stats-label">Low Conf (<60%)</div>
+    </div>
+    """, unsafe_allow_html=True)
+
+    st.markdown("---")
+
+    # Main view - Page images and regions
+    tab_regions, tab_pages, tab_export = st.tabs(["📋 Regions", "📄 Page View", "📥 Export"])
+
+    with tab_regions:
+        # Filter regions
+        filtered_ocr = ocr_regions
+        if min_confidence > 0:
+            filtered_ocr = [r for r in filtered_ocr if r.get("confidence", 0) >= min_confidence]
+
+        # Page selector
+        pages = sorted(set(r.get("page", 0) for r in filtered_ocr))
+        if pages:
+            selected_page = st.selectbox(
+                "Select Page",
+                pages,
+                format_func=lambda x: f"Page {x + 1} ({len([r for r in filtered_ocr if r.get('page') == x])} regions)"
+            )
+
+            page_regions = [r for r in filtered_ocr if r.get("page") == selected_page]
+
+            st.markdown(f"### OCR Regions on Page {selected_page + 1}")
+            st.caption(f"Showing {len(page_regions)} regions (filtered by confidence >= {min_confidence:.0%})")
+
+            # Display regions with confidence coloring
+            for i, region in enumerate(page_regions):
+                conf = region.get("confidence", 0)
+                conf_class = get_confidence_class(conf)
+                conf_color = get_confidence_color(conf)
+                text = region.get("text", "")
+                bbox = region.get("bbox")
+
+                st.markdown(f"""
+                <div class="region-card {conf_class}">
+                    <div class="region-header">
+                        <span class="region-type" style="color: {conf_color};">Region {i + 1}</span>
+                        <span class="region-conf" style="color: {conf_color};">{conf:.0%}</span>
+                    </div>
+                    <div class="region-text">{text}</div>
+                    <div class="region-meta">
+                        {f'<span class="bbox-display">Bbox: ({bbox[0]:.0f}, {bbox[1]:.0f}) - ({bbox[2]:.0f}, {bbox[3]:.0f})</span>' if bbox and show_bbox else ''}
+                    </div>
+                </div>
+                """, unsafe_allow_html=True)
+
+                # Copy button
+                col1, col2 = st.columns([4, 1])
+                with col2:
+                    if st.button("📋 Copy", key=f"copy_{i}"):
+                        st.toast(f"Copied region {i+1} text!")
+
+        else:
+            st.info("No OCR regions available for this document")
+
+        # Layout regions
+        if show_layout and layout_regions:
+            st.markdown("---")
+            st.markdown("### Layout Regions")
+
+            # Group by type
+            by_type = {}
+            for r in layout_regions:
+                rtype = r.get("type", "unknown")
+                if rtype not in by_type:
+                    by_type[rtype] = []
+                by_type[rtype].append(r)
+
+            # Type pills
+            st.markdown("**Detected types:**")
+            type_html = ""
+            for rtype, regions in by_type.items():
+                color = get_type_color(rtype)
+                type_html += f'<span style="background: {color}33; color: {color}; padding: 4px 10px; border-radius: 12px; margin: 4px; display: inline-block; font-size: 12px;">{rtype.title()} ({len(regions)})</span>'
+            st.markdown(type_html, unsafe_allow_html=True)
+
+            # Layout details
+            for rtype, regions in by_type.items():
+                with st.expander(f"{rtype.title()} ({len(regions)} regions)"):
+                    for r in regions[:10]:
+                        conf = r.get("confidence", 0)
+                        conf_color = get_confidence_color(conf)
+                        st.markdown(f"""
+                        <div style="background: #0d1117; padding: 8px; border-radius: 6px; margin: 4px 0; border-left: 3px solid {get_type_color(rtype)};">
+                            <span style="color: {conf_color};">{conf:.0%}</span> | Page {r.get('page', 0) + 1}
+                        </div>
+                        """, unsafe_allow_html=True)
+
+    with tab_pages:
+        st.markdown("### Page Images with Regions")
+
+        if active_doc.page_images:
+            page_select = st.selectbox(
+                "Page",
+                range(len(active_doc.page_images)),
+                format_func=lambda x: f"Page {x + 1}",
+                key="page_view_select"
+            )
+
+            if page_select is not None:
+                # Display page image
+                img_data = active_doc.page_images[page_select]
+                st.image(
+                    f"data:image/png;base64,{img_data}",
+                    caption=f"Page {page_select + 1}",
+                    use_container_width=True
+                )
+
+                # Regions on this page
+                page_ocr = [r for r in ocr_regions if r.get("page") == page_select]
+                page_layout = [r for r in layout_regions if r.get("page") == page_select]
+
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.metric("OCR Regions", len(page_ocr))
+                with col2:
+                    st.metric("Layout Regions", len(page_layout))
+
+                st.info("Bounding box overlay visualization will be available in future updates")
+        else:
+            st.info("No page images available. Process a PDF document to see page images.")
+
+    with tab_export:
+        st.markdown("### Export Evidence Data")
+
+        export_cols = st.columns(3)
+
+        with export_cols[0]:
+            st.markdown("**OCR Regions JSON**")
+            if st.button("📥 Export OCR", use_container_width=True):
+                import json
+                ocr_json = json.dumps({
+                    "document_id": active_doc.doc_id,
+                    "filename": active_doc.filename,
+                    "ocr_regions": ocr_regions,
+                }, indent=2)
+                st.download_button(
+                    "Download JSON",
+                    ocr_json,
+                    file_name=f"{active_doc.doc_id}_ocr.json",
+                    mime="application/json"
+                )
+
+        with export_cols[1]:
+            st.markdown("**Layout Regions JSON**")
+            if st.button("📥 Export Layout", use_container_width=True):
+                import json
+                layout_json = json.dumps({
+                    "document_id": active_doc.doc_id,
+                    "filename": active_doc.filename,
+                    "layout_regions": layout_regions,
+                }, indent=2)
+                st.download_button(
+                    "Download JSON",
+                    layout_json,
+                    file_name=f"{active_doc.doc_id}_layout.json",
+                    mime="application/json"
+                )
+
+        with export_cols[2]:
+            st.markdown("**Full Text**")
+            st.download_button(
+                "📥 Export Text",
+                active_doc.raw_text,
+                file_name=f"{active_doc.doc_id}.txt",
+                mime="text/plain",
+                use_container_width=True
+            )
+
+        # Confidence distribution chart
+        st.markdown("---")
+        st.markdown("### Confidence Distribution")
+
+        if ocr_regions:
+            import pandas as pd
+
+            # Build distribution data
+            conf_bins = {"High (>80%)": 0, "Medium (60-80%)": 0, "Low (<60%)": 0}
+            for r in ocr_regions:
+                c = r.get("confidence", 0)
+                if c >= 0.8:
+                    conf_bins["High (>80%)"] += 1
+                elif c >= 0.6:
+                    conf_bins["Medium (60-80%)"] += 1
+                else:
+                    conf_bins["Low (<60%)"] += 1
+
+            df = pd.DataFrame({
+                "Confidence Level": list(conf_bins.keys()),
+                "Count": list(conf_bins.values())
+            })
+            st.bar_chart(df.set_index("Confidence Level"))
+
+    # Navigation
+    st.markdown("---")
+    st.markdown("### Actions")
+    nav_cols = st.columns(4)
+
+    with nav_cols[0]:
+        if st.button("💬 Query RAG", use_container_width=True):
+            st.switch_page("pages/2_💬_Interactive_RAG.py")
+    with nav_cols[1]:
+        if st.button("📄 Document Viewer", use_container_width=True):
+            st.switch_page("pages/5_📄_Document_Viewer.py")
+    with nav_cols[2]:
+        if st.button("📊 Compare", use_container_width=True):
+            st.switch_page("pages/3_📊_Document_Comparison.py")
+    with nav_cols[3]:
+        if st.button("🔬 Process New", use_container_width=True):
+            st.switch_page("pages/1_🔬_Live_Processing.py")
+
+else:
+    # No document selected
+    st.markdown("## No Document Selected")
+
+    st.markdown("""
+    ### Getting Started
+
+    1. Go to **Live Processing** to upload and process a document
+    2. Come back here to view OCR regions and evidence grounding
+    3. Use confidence filters to focus on high or low quality regions
+
+    Evidence viewer shows:
+    - OCR extracted text regions with confidence scores
+    - Layout detection results (titles, paragraphs, tables, etc.)
+    - Bounding box coordinates for each region
+    - Page images with region overlays
+    """)
+
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
+            st.switch_page("pages/1_🔬_Live_Processing.py")
+    with col2:
+        if st.button("📄 Go to Document Viewer", use_container_width=True):
+            st.switch_page("pages/5_📄_Document_Viewer.py")
+
+    # Legend
+    st.markdown("---")
+    st.markdown("### Confidence Color Legend")
+
+    legend_cols = st.columns(3)
+    with legend_cols[0]:
+        st.markdown("""
+        <div style="background: rgba(78,205,196,0.2); padding: 10px; border-radius: 8px; border-left: 4px solid #4ECDC4;">
+            <strong style="color: #4ECDC4;">High Confidence (>80%)</strong><br>
+            <span style="color: #8b949e;">Reliable extraction</span>
+        </div>
+        """, unsafe_allow_html=True)
+    with legend_cols[1]:
+        st.markdown("""
+        <div style="background: rgba(255,193,7,0.2); padding: 10px; border-radius: 8px; border-left: 4px solid #ffc107;">
+            <strong style="color: #ffc107;">Medium Confidence (60-80%)</strong><br>
+            <span style="color: #8b949e;">Review recommended</span>
+        </div>
+        """, unsafe_allow_html=True)
+    with legend_cols[2]:
+        st.markdown("""
+        <div style="background: rgba(220,53,69,0.2); padding: 10px; border-radius: 8px; border-left: 4px solid #dc3545;">
+            <strong style="color: #dc3545;">Low Confidence (<60%)</strong><br>
+            <span style="color: #8b949e;">Manual verification needed</span>
+        </div>
+        """, unsafe_allow_html=True)
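Note: every view on this page bins OCR confidence the same way: at least 0.8 is high, at least 0.6 is medium, anything below is low. A minimal sketch of the summary statistics the cards above derive from a region list; the `regions` sample is hypothetical, real regions come from the state manager:

```python
# Sketch of the confidence binning the Evidence Viewer applies everywhere.

def summarize_confidence(regions: list) -> dict:
    """Count high/medium/low regions and compute the average confidence."""
    confs = [r.get("confidence", 0.0) for r in regions]
    return {
        "count": len(confs),
        "avg": sum(confs) / len(confs) if confs else 0.0,
        "high": sum(c >= 0.8 for c in confs),            # reliable extraction
        "medium": sum(0.6 <= c < 0.8 for c in confs),    # review recommended
        "low": sum(c < 0.6 for c in confs),              # manual verification
    }

regions = [{"confidence": 0.93}, {"confidence": 0.71}, {"confidence": 0.42}]
print(summarize_confidence(regions))
# {'count': 3, 'avg': 0.6866..., 'high': 1, 'medium': 1, 'low': 1}
```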
demo/pages/5_📄_Document_Viewer.py
ADDED
@@ -0,0 +1,565 @@
| 1 |
+
"""
|
| 2 |
+
Document Viewer - SPARKNET
|
| 3 |
+
|
| 4 |
+
View and explore processed documents from the state manager.
|
| 5 |
+
Provides visual chunk segmentation, OCR regions, and layout visualization.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import time
|
| 12 |
+
import hashlib
|
| 13 |
+
import base64
|
| 14 |
+
from typing import List, Dict, Any
|
| 15 |
+
|
| 16 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
| 17 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 18 |
+
sys.path.insert(0, str(PROJECT_ROOT / "demo"))
|
| 19 |
+
|
| 20 |
+
# Import state manager and RAG config
|
| 21 |
+
from state_manager import (
|
| 22 |
+
get_state_manager,
|
| 23 |
+
ProcessedDocument,
|
| 24 |
+
render_global_status_bar,
|
| 25 |
+
)
|
| 26 |
+
from rag_config import (
|
| 27 |
+
get_unified_rag_system,
|
| 28 |
+
get_store_stats,
|
| 29 |
+
get_indexed_documents,
|
| 30 |
+
get_chunks_for_document,
|
| 31 |
+
check_ollama,
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
st.set_page_config(
|
| 35 |
+
page_title="Document Viewer - SPARKNET",
|
| 36 |
+
page_icon="📄",
|
| 37 |
+
layout="wide"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# Custom CSS
|
| 41 |
+
st.markdown("""
|
| 42 |
+
<style>
|
| 43 |
+
.chunk-card {
|
| 44 |
+
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
|
| 45 |
+
border-radius: 10px;
|
| 46 |
+
padding: 12px;
|
| 47 |
+
margin: 8px 0;
|
| 48 |
+
border-left: 4px solid #4ECDC4;
|
| 49 |
+
}
|
| 50 |
+
.chunk-header {
|
| 51 |
+
display: flex;
|
| 52 |
+
justify-content: space-between;
|
| 53 |
+
font-size: 11px;
|
| 54 |
+
color: #8b949e;
|
| 55 |
+
margin-bottom: 6px;
|
| 56 |
+
}
|
| 57 |
+
.chunk-text {
|
| 58 |
+
font-size: 13px;
|
| 59 |
+
line-height: 1.5;
|
| 60 |
+
color: #c9d1d9;
|
| 61 |
+
font-family: 'Monaco', 'Menlo', monospace;
|
| 62 |
+
}
|
| 63 |
+
.ocr-region {
|
| 64 |
+
background: #161b22;
|
| 65 |
+
border-radius: 6px;
|
| 66 |
+
padding: 8px;
|
| 67 |
+
margin: 4px 0;
|
| 68 |
+
border-left: 3px solid;
|
| 69 |
+
}
|
| 70 |
+
.layout-region {
|
| 71 |
+
display: inline-block;
|
| 72 |
+
padding: 4px 8px;
|
| 73 |
+
margin: 3px;
|
| 74 |
+
border-radius: 4px;
|
| 75 |
+
font-size: 11px;
|
| 76 |
+
}
|
| 77 |
+
.doc-card {
|
| 78 |
+
background: #0d1117;
|
| 79 |
+
border-radius: 10px;
|
| 80 |
+
padding: 15px;
|
| 81 |
+
margin: 10px 0;
|
| 82 |
+
border: 1px solid #30363d;
|
| 83 |
+
cursor: pointer;
|
| 84 |
+
transition: border-color 0.2s;
|
| 85 |
+
}
|
| 86 |
+
.doc-card:hover {
|
| 87 |
+
border-color: #4ECDC4;
|
| 88 |
+
}
|
| 89 |
+
.doc-card.active {
|
| 90 |
+
border-color: #4ECDC4;
|
| 91 |
+
border-width: 2px;
|
| 92 |
+
}
|
| 93 |
+
.metric-mini {
|
| 94 |
+
background: #161b22;
|
| 95 |
+
border-radius: 6px;
|
| 96 |
+
padding: 8px;
|
| 97 |
+
text-align: center;
|
| 98 |
+
margin: 4px;
|
| 99 |
+
}
|
| 100 |
+
.metric-mini .value {
|
| 101 |
+
font-size: 18px;
|
| 102 |
+
font-weight: bold;
|
| 103 |
+
color: #4ECDC4;
|
| 104 |
+
}
|
| 105 |
+
.metric-mini .label {
|
| 106 |
+
font-size: 10px;
|
| 107 |
+
color: #8b949e;
|
| 108 |
+
text-transform: uppercase;
|
| 109 |
+
}
|
| 110 |
+
.page-viewer {
|
| 111 |
+
background: #0d1117;
|
| 112 |
+
border-radius: 10px;
|
| 113 |
+
padding: 20px;
|
| 114 |
+
max-height: 600px;
|
| 115 |
+
overflow-y: auto;
|
| 116 |
+
}
|
| 117 |
+
.confidence-high { color: #4ECDC4; }
|
| 118 |
+
.confidence-med { color: #ffc107; }
|
| 119 |
+
.confidence-low { color: #dc3545; }
|
| 120 |
+
</style>
|
| 121 |
+
""", unsafe_allow_html=True)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def get_chunk_color(index: int) -> str:
|
| 125 |
+
"""Get distinct color for chunk visualization."""
|
| 126 |
+
colors = [
|
| 127 |
+
"#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
|
| 128 |
+
"#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
|
| 129 |
+
"#BB8FCE", "#85C1E9", "#F8B500", "#00CED1"
|
| 130 |
+
]
|
| 131 |
+
return colors[index % len(colors)]
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def get_confidence_class(conf: float) -> str:
|
| 135 |
+
"""Get confidence CSS class."""
|
| 136 |
+
if conf >= 0.8:
|
| 137 |
+
return "confidence-high"
|
| 138 |
+
elif conf >= 0.6:
|
| 139 |
+
return "confidence-med"
|
| 140 |
+
return "confidence-low"
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def get_layout_color(layout_type: str) -> str:
|
| 144 |
+
"""Get color for layout type."""
|
| 145 |
+
colors = {
|
| 146 |
+
"title": "#FF6B6B",
|
| 147 |
+
"heading": "#FF8E6B",
|
| 148 |
+
"paragraph": "#4ECDC4",
|
| 149 |
+
"text": "#45B7D1",
|
| 150 |
+
"list": "#96CEB4",
|
| 151 |
+
"table": "#FFEAA7",
|
| 152 |
+
"figure": "#DDA0DD",
|
| 153 |
+
"header": "#98D8C8",
|
| 154 |
+
"footer": "#8b949e",
|
| 155 |
+
}
|
| 156 |
+
return colors.get(layout_type.lower(), "#666")
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# Initialize state manager
|
| 160 |
+
state_manager = get_state_manager()
|
| 161 |
+
|
| 162 |
+
# Header
|
| 163 |
+
st.markdown("# 📄 Document Viewer")
|
| 164 |
+
st.markdown("Explore processed documents, chunks, OCR regions, and layout structure")
|
| 165 |
+
|
| 166 |
+
# Global status bar
|
| 167 |
+
render_global_status_bar()
|
| 168 |
+
|
| 169 |
+
st.markdown("---")
|
| 170 |
+
|
| 171 |
+
# Get all documents from state and RAG
|
| 172 |
+
all_state_docs = state_manager.get_all_documents()
|
| 173 |
+
rag_docs = get_indexed_documents()
|
| 174 |
+
|
| 175 |
+
# Sidebar for document selection
|
| 176 |
+
with st.sidebar:
|
| 177 |
+
st.markdown("## 📚 Documents")
|
| 178 |
+
|
| 179 |
+
# Processed documents from state manager
|
| 180 |
+
if all_state_docs:
|
| 181 |
+
st.markdown("### Recently Processed")
|
| 182 |
+
selected_doc_id = None
|
| 183 |
+
|
| 184 |
+
for doc in reversed(all_state_docs[-10:]):
|
| 185 |
+
is_active = state_manager.state.get("active_doc_id") == doc.doc_id
|
| 186 |
+
card_class = "doc-card active" if is_active else "doc-card"
|
| 187 |
+
|
| 188 |
+
if st.button(
|
| 189 |
+
f"📄 {doc.filename[:25]}...",
|
| 190 |
+
key=f"doc_{doc.doc_id}",
|
| 191 |
+
use_container_width=True,
|
| 192 |
+
type="primary" if is_active else "secondary"
|
| 193 |
+
):
|
| 194 |
+
state_manager.set_active_document(doc.doc_id)
|
| 195 |
+
st.rerun()
|
| 196 |
+
|
| 197 |
+
# Mini stats
|
| 198 |
+
cols = st.columns(3)
|
| 199 |
+
cols[0].caption(f"📄 {doc.page_count}p")
|
| 200 |
+
cols[1].caption(f"📦 {len(doc.chunks)}")
|
| 201 |
+
if doc.indexed:
|
| 202 |
+
cols[2].caption("✓ Indexed")
|
| 203 |
+
st.markdown("---")
|
| 204 |
+
else:
|
| 205 |
+
st.info("No documents processed yet")
|
| 206 |
+
st.markdown("Go to **Live Processing** to process documents")
|
| 207 |
+
|
| 208 |
+
# RAG indexed documents
|
| 209 |
+
if rag_docs:
|
| 210 |
+
st.markdown("### 📊 RAG Index")
|
| 211 |
+
st.caption(f"{len(rag_docs)} documents indexed")
|
| 212 |
+
for doc in rag_docs[:5]:
|
| 213 |
+
st.caption(f"• {doc.get('document_id', 'unknown')[:20]}...")
|
| 214 |
+
|
| 215 |
+
# Main content
|
| 216 |
+
active_doc = state_manager.get_active_document()
|
| 217 |
+
|
| 218 |
+
if active_doc:
|
| 219 |
+
# Document header
|
| 220 |
+
col1, col2 = st.columns([3, 1])
|
| 221 |
+
|
| 222 |
+
with col1:
|
| 223 |
+
st.markdown(f"## 📄 {active_doc.filename}")
|
| 224 |
+
st.caption(f"ID: `{active_doc.doc_id}` | Type: {active_doc.file_type} | Processed: {active_doc.created_at.strftime('%Y-%m-%d %H:%M')}")
|
| 225 |
+
|
| 226 |
+
with col2:
|
| 227 |
+
if active_doc.indexed:
|
| 228 |
+
st.success(f"✓ Indexed ({active_doc.indexed_chunks} chunks)")
|
| 229 |
+
else:
|
| 230 |
+
st.warning("Not indexed")
|
| 231 |
+
|
| 232 |
+
# Summary metrics
|
| 233 |
+
metric_cols = st.columns(6)
|
| 234 |
+
metric_cols[0].markdown(f"""
|
| 235 |
+
<div class="metric-mini">
|
| 236 |
+
<div class="value">{active_doc.page_count}</div>
|
| 237 |
+
<div class="label">Pages</div>
|
| 238 |
+
</div>
|
| 239 |
+
""", unsafe_allow_html=True)
|
| 240 |
+
metric_cols[1].markdown(f"""
|
| 241 |
+
<div class="metric-mini">
|
| 242 |
+
<div class="value">{len(active_doc.chunks)}</div>
|
| 243 |
+
<div class="label">Chunks</div>
|
| 244 |
+
</div>
|
| 245 |
+
""", unsafe_allow_html=True)
|
| 246 |
+
metric_cols[2].markdown(f"""
|
| 247 |
+
<div class="metric-mini">
|
| 248 |
+
<div class="value">{len(active_doc.ocr_regions)}</div>
|
| 249 |
+
<div class="label">OCR Regions</div>
|
| 250 |
+
</div>
|
| 251 |
+
""", unsafe_allow_html=True)
|
| 252 |
+
layout_count = len(active_doc.layout_data.get("regions", []))
|
| 253 |
+
metric_cols[3].markdown(f"""
|
| 254 |
+
<div class="metric-mini">
|
| 255 |
+
<div class="value">{layout_count}</div>
|
| 256 |
+
<div class="label">Layout Regions</div>
|
| 257 |
+
</div>
|
| 258 |
+
""", unsafe_allow_html=True)
|
| 259 |
+
metric_cols[4].markdown(f"""
|
| 260 |
+
<div class="metric-mini">
|
| 261 |
+
<div class="value">{len(active_doc.raw_text):,}</div>
|
| 262 |
+
<div class="label">Characters</div>
|
| 263 |
+
</div>
|
| 264 |
+
""", unsafe_allow_html=True)
|
| 265 |
+
metric_cols[5].markdown(f"""
|
| 266 |
+
<div class="metric-mini">
|
| 267 |
+
<div class="value">{active_doc.processing_time:.1f}s</div>
|
| 268 |
+
<div class="label">Process Time</div>
|
| 269 |
+
</div>
|
| 270 |
+
""", unsafe_allow_html=True)
|
| 271 |
+
|
| 272 |
+
st.markdown("---")
|
| 273 |
+
|
| 274 |
+
# Tabs for different views
|
| 275 |
+
tab_chunks, tab_text, tab_ocr, tab_layout, tab_pages = st.tabs([
|
| 276 |
+
"📦 Chunks",
|
| 277 |
+
"📝 Full Text",
|
| 278 |
+
"🔍 OCR Regions",
|
| 279 |
+
"🗺️ Layout",
|
| 280 |
+
"📄 Page Images"
|
| 281 |
+
])
|
| 282 |
+
|
| 283 |
+
with tab_chunks:
|
| 284 |
+
st.markdown("### Document Chunks")
|
| 285 |
+
|
| 286 |
+
# Filter options
|
| 287 |
+
filter_cols = st.columns([2, 1, 1])
|
| 288 |
+
with filter_cols[0]:
|
| 289 |
+
search_term = st.text_input("Search in chunks", placeholder="Enter search term...")
|
| 290 |
+
with filter_cols[1]:
|
| 291 |
+
chunk_types = list(set(c.get("chunk_type", "text") for c in active_doc.chunks))
|
| 292 |
+
selected_type = st.selectbox("Filter by type", ["All"] + chunk_types)
|
| 293 |
+
with filter_cols[2]:
|
| 294 |
+
page_filter = st.selectbox("Filter by page", ["All"] + list(range(1, active_doc.page_count + 1)))
|
| 295 |
+
|
| 296 |
+
# Filter chunks
|
| 297 |
+
filtered_chunks = active_doc.chunks
|
| 298 |
+
if search_term:
|
| 299 |
+
filtered_chunks = [c for c in filtered_chunks if search_term.lower() in c.get("text", "").lower()]
|
| 300 |
+
if selected_type != "All":
|
| 301 |
+
filtered_chunks = [c for c in filtered_chunks if c.get("chunk_type") == selected_type]
|
| 302 |
+
if page_filter != "All":
|
| 303 |
+
filtered_chunks = [c for c in filtered_chunks if c.get("page", 0) + 1 == page_filter]
|
| 304 |
+
|
| 305 |
+
st.caption(f"Showing {len(filtered_chunks)} of {len(active_doc.chunks)} chunks")
|
| 306 |
+
|
| 307 |
+
# Display chunks
|
| 308 |
+
for i, chunk in enumerate(filtered_chunks[:30]):
|
| 309 |
+
chunk_type = chunk.get("chunk_type", "text")
|
| 310 |
+
conf = chunk.get("confidence", 0)
|
| 311 |
+
color = get_chunk_color(i)
|
| 312 |
+
conf_class = get_confidence_class(conf)
|
| 313 |
+
|
| 314 |
+
with st.expander(f"[{i+1}] {chunk_type.upper()} - {chunk.get('text', '')[:60]}...", expanded=(i == 0)):
|
| 315 |
+
st.markdown(f"""
|
| 316 |
+
<div class="chunk-card" style="border-left-color: {color};">
|
| 317 |
+
<div class="chunk-header">
|
| 318 |
+
<span>ID: <code>{chunk.get('chunk_id', 'N/A')}</code></span>
|
| 319 |
+
<span>Page {chunk.get('page', 0) + 1}</span>
|
| 320 |
+
<span class="{conf_class}">Confidence: {conf:.0%}</span>
|
| 321 |
+
</div>
|
| 322 |
+
<div class="chunk-text">{chunk.get('text', '')}</div>
|
| 323 |
+
</div>
|
| 324 |
+
""", unsafe_allow_html=True)
|
| 325 |
+
|
| 326 |
+
# Bounding box info
|
| 327 |
+
bbox = chunk.get("bbox")
|
| 328 |
+
if bbox:
|
| 329 |
+
st.caption(f"Bbox: ({bbox[0]:.0f}, {bbox[1]:.0f}) - ({bbox[2]:.0f}, {bbox[3]:.0f})")
|
| 330 |
+
|
| 331 |
+
if len(filtered_chunks) > 30:
|
| 332 |
+
st.info(f"Showing 30 of {len(filtered_chunks)} matching chunks")
|
| 333 |
+
|
| 334 |
+
with tab_text:
|
| 335 |
+
st.markdown("### Extracted Text")
|
| 336 |
+
|
| 337 |
+
# Text display options
|
| 338 |
+
text_cols = st.columns([1, 1, 1])
|
| 339 |
+
with text_cols[0]:
|
| 340 |
+
show_page_markers = st.checkbox("Show page markers", value=True)
|
| 341 |
+
with text_cols[1]:
|
| 342 |
+
font_size = st.slider("Font size", 10, 18, 13)
|
| 343 |
+
with text_cols[2]:
|
| 344 |
+
max_chars = st.slider("Max characters", 5000, 50000, 20000, 1000)
|
| 345 |
+
|
| 346 |
+
text_to_display = active_doc.raw_text[:max_chars]
|
| 347 |
+
if len(active_doc.raw_text) > max_chars:
|
| 348 |
+
text_to_display += f"\n\n... [Truncated - {len(active_doc.raw_text) - max_chars:,} more characters]"
|
| 349 |
+
|
| 350 |
+
st.markdown(f"""
|
| 351 |
+
<div class="page-viewer" style="font-size: {font_size}px;">
|
| 352 |
+
<pre style="white-space: pre-wrap; font-family: monospace; margin: 0;">{text_to_display}</pre>
|
| 353 |
+
</div>
|
| 354 |
+
""", unsafe_allow_html=True)
|
| 355 |
+
|
| 356 |
+
# Download button
|
| 357 |
+
st.download_button(
|
| 358 |
+
"📥 Download Full Text",
|
| 359 |
+
active_doc.raw_text,
|
| 360 |
+
file_name=f"{active_doc.filename}.txt",
|
| 361 |
+
mime="text/plain"
|
| 362 |
+
)
|
| 363 |
+
|
| 364 |
+
with tab_ocr:
|
| 365 |
+
st.markdown("### OCR Regions")
|
| 366 |
+
|
| 367 |
+
if active_doc.ocr_regions:
|
| 368 |
+
# Group by page
|
| 369 |
+
by_page = {}
|
| 370 |
+
for region in active_doc.ocr_regions:
|
| 371 |
+
page = region.get("page", 0)
|
| 372 |
+
if page not in by_page:
|
| 373 |
+
by_page[page] = []
|
| 374 |
+
by_page[page].append(region)
|
| 375 |
+
|
| 376 |
+
# Page selector
|
| 377 |
+
page_select = st.selectbox(
|
| 378 |
+
"Select page",
|
| 379 |
+
sorted(by_page.keys()),
|
| 380 |
+
format_func=lambda x: f"Page {x + 1} ({len(by_page.get(x, []))} regions)"
|
| 381 |
+
)
|
| 382 |
+
|
| 383 |
+
if page_select is not None and page_select in by_page:
|
| 384 |
+
page_regions = by_page[page_select]
|
| 385 |
+
|
| 386 |
+
# Summary
|
| 387 |
+
avg_conf = sum(r.get("confidence", 0) for r in page_regions) / len(page_regions) if page_regions else 0
|
| 388 |
+
conf_class = get_confidence_class(avg_conf)
|
| 389 |
+
|
| 390 |
+
st.markdown(f"**{len(page_regions)} regions** | Average confidence: <span class='{conf_class}'>{avg_conf:.0%}</span>", unsafe_allow_html=True)
|
| 391 |
+
|
| 392 |
+
# Filter by confidence
|
| 393 |
+
min_conf = st.slider("Minimum confidence", 0.0, 1.0, 0.5, 0.1)
|
| 394 |
+
filtered_regions = [r for r in page_regions if r.get("confidence", 0) >= min_conf]
|
| 395 |
+
|
| 396 |
+
for i, region in enumerate(filtered_regions[:50]):
|
| 397 |
+
conf = region.get("confidence", 0)
|
| 398 |
+
conf_class = get_confidence_class(conf)
|
| 399 |
+
color = "#4ECDC4" if conf >= 0.8 else "#ffc107" if conf >= 0.6 else "#dc3545"
|
| 400 |
+
|
| 401 |
+
st.markdown(f"""
|
| 402 |
+
<div class="ocr-region" style="border-left-color: {color};">
|
| 403 |
+
<div style="display: flex; justify-content: space-between; margin-bottom: 4px;">
|
| 404 |
+
<span style="font-size: 11px; color: #8b949e;">Region {i+1}</span>
|
| 405 |
+
<span class="{conf_class}" style="font-size: 11px;">{conf:.0%}</span>
|
| 406 |
+
</div>
|
| 407 |
+
<div style="font-family: monospace; font-size: 12px;">{region.get('text', '')}</div>
|
| 408 |
+
</div>
|
| 409 |
+
""", unsafe_allow_html=True)
|
| 410 |
+
|
| 411 |
+
if len(filtered_regions) > 50:
|
| 412 |
+
st.info(f"Showing 50 of {len(filtered_regions)} regions")
|
| 413 |
+
else:
|
| 414 |
+
st.info("No OCR regions available for this document")
|
| 415 |
+
st.markdown("OCR regions are extracted during document processing with OCR enabled.")
|
| 416 |
+
|
| 417 |
+
with tab_layout:
|
| 418 |
+
st.markdown("### Layout Structure")
|
| 419 |
+
|
| 420 |
+
layout_regions = active_doc.layout_data.get("regions", [])
|
| 421 |
+
|
| 422 |
+
if layout_regions:
|
| 423 |
+
# Group by type
|
| 424 |
+
by_type = {}
|
| 425 |
+
for region in layout_regions:
|
| 426 |
+
rtype = region.get("type", "unknown")
|
| 427 |
+
if rtype not in by_type:
|
| 428 |
+
by_type[rtype] = []
|
| 429 |
+
by_type[rtype].append(region)
|
| 430 |
+
|
| 431 |
+
# Type summary
|
| 432 |
+
st.markdown("**Detected region types:**")
|
| 433 |
+
type_cols = st.columns(min(len(by_type), 6))
|
| 434 |
+
for i, (rtype, regions) in enumerate(by_type.items()):
|
| 435 |
+
color = get_layout_color(rtype)
|
| 436 |
+
type_cols[i % 6].markdown(f"""
|
| 437 |
+
<div class="layout-region" style="background: {color}20; border: 1px solid {color};">
|
| 438 |
+
<strong>{rtype.title()}</strong>: {len(regions)}
|
| 439 |
+
</div>
|
| 440 |
+
""", unsafe_allow_html=True)
|
| 441 |
+
|
| 442 |
+
st.markdown("---")
|
| 443 |
+
|
| 444 |
+
# Layout regions list
|
| 445 |
+
type_filter = st.selectbox("Filter by type", ["All"] + list(by_type.keys()))
|
| 446 |
+
|
| 447 |
+
filtered_layout = layout_regions
|
| 448 |
+
if type_filter != "All":
|
| 449 |
+
filtered_layout = by_type.get(type_filter, [])
|
| 450 |
+
|
| 451 |
+
for i, region in enumerate(filtered_layout[:30]):
|
| 452 |
+
rtype = region.get("type", "unknown")
|
| 453 |
+
conf = region.get("confidence", 0)
|
| 454 |
+
color = get_layout_color(rtype)
|
| 455 |
+
conf_class = get_confidence_class(conf)
|
| 456 |
+
|
| 457 |
+
st.markdown(f"""
|
| 458 |
+
<div style="background: #161b22; border-radius: 6px; padding: 10px; margin: 6px 0; border-left: 3px solid {color};">
|
| 459 |
+
<div style="display: flex; justify-content: space-between;">
|
| 460 |
+
                        <span><strong style="color: {color};">{rtype.upper()}</strong></span>
                        <span>Page {region.get('page', 0) + 1}</span>
                        <span class="{conf_class}">{conf:.0%}</span>
                    </div>
                </div>
                """, unsafe_allow_html=True)

            if len(filtered_layout) > 30:
                st.info(f"Showing 30 of {len(filtered_layout)} regions")
        else:
            st.info("No layout regions available for this document")
            st.markdown("Layout regions are extracted during document processing with layout detection enabled.")

    with tab_pages:
        st.markdown("### Page Images")

        if active_doc.page_images:
            page_idx = st.selectbox(
                "Select page",
                list(range(len(active_doc.page_images))),
                format_func=lambda x: f"Page {x + 1}"
            )

            if page_idx is not None and page_idx < len(active_doc.page_images):
                img_data = active_doc.page_images[page_idx]

                # Display image
                st.image(
                    f"data:image/png;base64,{img_data}",
                    caption=f"Page {page_idx + 1}",
                    use_container_width=True
                )

                # Overlay options
                st.markdown("**Overlay options:**")
                overlay_cols = st.columns(3)
                with overlay_cols[0]:
                    show_chunks = st.checkbox("Show chunk boundaries", value=False)
                with overlay_cols[1]:
                    show_ocr = st.checkbox("Show OCR regions", value=False)
                with overlay_cols[2]:
                    show_layout = st.checkbox("Show layout regions", value=False)

                if show_chunks or show_ocr or show_layout:
                    st.info("Overlay visualization coming soon - requires image annotation support")
        else:
            st.info("No page images available for this document")
            st.markdown("Page images are extracted from PDF documents during processing.")

    # Navigation to other modules
    st.markdown("---")
    st.markdown("### 🔗 Actions")

    nav_cols = st.columns(4)

    with nav_cols[0]:
        if st.button("💬 Ask Questions", use_container_width=True):
            st.switch_page("pages/2_💬_Interactive_RAG.py")

    with nav_cols[1]:
        if st.button("🎯 View Evidence", use_container_width=True):
            st.switch_page("pages/4_🎯_Evidence_Viewer.py")

    with nav_cols[2]:
        if st.button("📊 Compare Documents", use_container_width=True):
            st.switch_page("pages/3_📊_Document_Comparison.py")

    with nav_cols[3]:
        if st.button("🔬 Process New", use_container_width=True):
            st.switch_page("pages/1_🔬_Live_Processing.py")

else:
    # No active document
    st.markdown("## No Document Selected")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("""
        ### Getting Started

        1. Go to **Live Processing** to upload and process a document
        2. Processed documents will appear in the sidebar
        3. Click on a document to view its details

        Or select a document from the sidebar if you've already processed some.
        """)

        if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
            st.switch_page("pages/1_🔬_Live_Processing.py")

    with col2:
        # Show RAG stats
        stats = get_store_stats()
        st.markdown("### RAG Index Status")
        st.metric("Total Indexed Chunks", stats.get("total_chunks", 0))

        if rag_docs:
            st.markdown("**Indexed Documents:**")
            for doc in rag_docs[:5]:
                doc_id = doc.get("document_id", "unknown")
                chunks = doc.get("chunk_count", 0)
                st.caption(f"• {doc_id[:30]}... ({chunks} chunks)")

            if len(rag_docs) > 5:
                st.caption(f"... and {len(rag_docs) - 5} more")
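Note on the overlay branch above: it is stubbed ("coming soon"). A minimal sketch of what the annotation could look like, assuming page_images holds base64-encoded PNG strings and region bboxes are normalized (x_min, y_min, x_max, y_max) tuples as in state_manager's EvidenceHighlight; draw_region_overlays is a hypothetical helper, not part of this commit:

import base64
import io
from PIL import Image, ImageDraw

def draw_region_overlays(img_b64: str, regions: list) -> Image.Image:
    """Decode a base64 PNG page image and outline each region's bbox."""
    img = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
    draw = ImageDraw.Draw(img)
    w, h = img.size
    for region in regions:
        # Assumption: bbox is normalized (x_min, y_min, x_max, y_max); scale to pixels.
        x0, y0, x1, y1 = region.get("bbox", (0.0, 0.0, 0.0, 0.0))
        draw.rectangle((x0 * w, y0 * h, x1 * w, y1 * h), outline="red", width=3)
    return img

# e.g. st.image(draw_region_overlays(img_data, filtered_layout), use_container_width=True)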
demo/rag_config.py
ADDED
@@ -0,0 +1,396 @@
"""
Unified RAG Configuration for SPARKNET Demo

This module provides a single source of truth for RAG system configuration,
ensuring all demo pages use the same vector store, embeddings, and models.
"""

import streamlit as st
from pathlib import Path
import sys

PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Configuration constants
OLLAMA_BASE_URL = "http://localhost:11434"
VECTOR_STORE_PATH = "data/sparknet_unified_rag"
COLLECTION_NAME = "sparknet_documents"

# Model preferences (in order of preference)
EMBEDDING_MODELS = ["nomic-embed-text", "mxbai-embed-large:latest", "mxbai-embed-large"]
LLM_MODELS = ["llama3.2:latest", "llama3.1:8b", "mistral:latest", "qwen2.5:14b", "qwen2.5:32b"]


def check_ollama():
    """Check Ollama availability and get available models."""
    try:
        import httpx
        with httpx.Client(timeout=5.0) as client:
            resp = client.get(f"{OLLAMA_BASE_URL}/api/tags")
            if resp.status_code == 200:
                models = [m["name"] for m in resp.json().get("models", [])]
                return True, models
    except:
        pass
    return False, []


def select_model(available_models: list, preferred_models: list) -> str:
    """Select the best available model from preferences."""
    for model in preferred_models:
        if model in available_models:
            return model
    # Return first preference as fallback
    return preferred_models[0] if preferred_models else "llama3.2:latest"


@st.cache_resource
def get_unified_rag_system():
    """
    Initialize and return the unified RAG system.

    This is cached at the Streamlit level so all pages share the same instance.
    """
    try:
        from src.rag.agentic import AgenticRAG, RAGConfig
        from src.rag.store import get_vector_store, VectorStoreConfig, reset_vector_store
        from src.rag.embeddings import get_embedding_adapter, EmbeddingConfig, reset_embedding_adapter

        # Check Ollama
        ollama_ok, available_models = check_ollama()
        if not ollama_ok:
            return {
                "status": "error",
                "error": "Ollama is not running. Please start Ollama first.",
                "rag": None,
                "store": None,
                "embedder": None,
            }

        # Select models
        embed_model = select_model(available_models, EMBEDDING_MODELS)
        llm_model = select_model(available_models, LLM_MODELS)

        # Reset singletons to ensure fresh config
        reset_vector_store()
        reset_embedding_adapter()

        # Initialize embedding adapter
        embed_config = EmbeddingConfig(
            ollama_model=embed_model,
            ollama_base_url=OLLAMA_BASE_URL,
        )
        embedder = get_embedding_adapter(config=embed_config)

        # Initialize vector store
        store_config = VectorStoreConfig(
            persist_directory=VECTOR_STORE_PATH,
            collection_name=COLLECTION_NAME,
            similarity_threshold=0.0,  # No threshold - let reranker handle filtering
        )
        store = get_vector_store(config=store_config)

        # Initialize RAG config
        rag_config = RAGConfig(
            model=llm_model,
            base_url=OLLAMA_BASE_URL,
            max_revision_attempts=1,
            enable_query_planning=True,
            enable_reranking=True,
            enable_validation=True,
            retrieval_top_k=10,
            final_top_k=5,
            min_confidence=0.3,
            verbose=False,
        )

        # Initialize RAG system
        rag = AgenticRAG(
            config=rag_config,
            vector_store=store,
            embedding_adapter=embedder,
        )

        return {
            "status": "ready",
            "error": None,
            "rag": rag,
            "store": store,
            "embedder": embedder,
            "embed_model": embed_model,
            "llm_model": llm_model,
            "available_models": available_models,
        }

    except Exception as e:
        import traceback
        return {
            "status": "error",
            "error": f"{str(e)}\n{traceback.format_exc()}",
            "rag": None,
            "store": None,
            "embedder": None,
        }


def get_store_stats():
    """Get current vector store statistics."""
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return {"total_chunks": 0, "status": "error"}

    try:
        return {
            "total_chunks": system["store"].count(),
            "status": "ready",
            "embed_model": system.get("embed_model", "unknown"),
            "llm_model": system.get("llm_model", "unknown"),
        }
    except:
        return {"total_chunks": 0, "status": "error"}


def index_document(text: str, document_id: str, metadata: dict = None) -> dict:
    """Index a document into the unified RAG system."""
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return {"success": False, "error": system["error"], "num_chunks": 0}

    try:
        num_chunks = system["rag"].index_text(
            text=text,
            document_id=document_id,
            metadata=metadata or {},
        )
        return {"success": True, "num_chunks": num_chunks, "error": None}
    except Exception as e:
        return {"success": False, "error": str(e), "num_chunks": 0}


def query_rag(question: str, filters: dict = None):
    """Query the unified RAG system."""
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return None, system["error"]

    try:
        response = system["rag"].query(question, filters=filters)
        return response, None
    except Exception as e:
        return None, str(e)


def clear_index():
    """Clear the vector store index."""
    # Force reinitialization by clearing cache
    get_unified_rag_system.clear()
    return True


def get_indexed_documents() -> list:
    """Get list of indexed document IDs from vector store."""
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return []

    try:
        # Query ChromaDB for unique document IDs
        store = system["store"]
        collection = store._collection

        # Get all metadata to extract unique document_ids
        results = collection.get(include=["metadatas"])
        if not results or not results.get("metadatas"):
            return []

        doc_ids = set()
        doc_info = {}
        for meta in results["metadatas"]:
            doc_id = meta.get("document_id", "unknown")
            if doc_id not in doc_info:
                doc_info[doc_id] = {
                    "document_id": doc_id,
                    "source_path": meta.get("source_path", ""),
                    "chunk_count": 0,
                }
            doc_info[doc_id]["chunk_count"] += 1

        return list(doc_info.values())
    except Exception as e:
        return []


def get_chunks_for_document(document_id: str) -> list:
    """Get all chunks for a specific document."""
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return []

    try:
        store = system["store"]
        collection = store._collection

        # Query for chunks with this document_id
        results = collection.get(
            where={"document_id": document_id},
            include=["documents", "metadatas"]
        )

        if not results or not results.get("ids"):
            return []

        chunks = []
        for i, chunk_id in enumerate(results["ids"]):
            chunks.append({
                "chunk_id": chunk_id,
                "text": results["documents"][i] if results.get("documents") else "",
                "metadata": results["metadatas"][i] if results.get("metadatas") else {},
            })

        return chunks
    except Exception as e:
        return []


def search_similar_chunks(query: str, top_k: int = 5, doc_filter: str = None):
    """Search for similar chunks with optional document filter."""
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return []

    try:
        embedder = system["embedder"]
        store = system["store"]

        # Generate query embedding
        query_embedding = embedder.embed_text(query)

        # Build filter
        filters = None
        if doc_filter:
            filters = {"document_id": doc_filter}

        # Search
        results = store.search(
            query_embedding=query_embedding,
            top_k=top_k,
            filters=filters,
        )

        return [
            {
                "chunk_id": r.chunk_id,
                "document_id": r.document_id,
                "text": r.text,
                "similarity": r.similarity,
                "page": r.page,
                "metadata": r.metadata,
            }
            for r in results
        ]
    except Exception as e:
        return []


def compute_document_similarity(doc_id_1: str, doc_id_2: str) -> dict:
    """Compute semantic similarity between two documents."""
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return {"error": "RAG system not ready", "similarity": 0.0}

    try:
        # Get chunks for both documents
        chunks_1 = get_chunks_for_document(doc_id_1)
        chunks_2 = get_chunks_for_document(doc_id_2)

        if not chunks_1 or not chunks_2:
            return {"error": "One or both documents not found", "similarity": 0.0}

        embedder = system["embedder"]

        # Compute average embeddings for each document
        def avg_embedding(chunks):
            embeddings = []
            for chunk in chunks[:10]:  # Limit to first 10 chunks
                emb = embedder.embed_text(chunk["text"])
                embeddings.append(emb)
            if not embeddings:
                return None
            # Average
            import numpy as np
            return np.mean(embeddings, axis=0).tolist()

        emb1 = avg_embedding(chunks_1)
        emb2 = avg_embedding(chunks_2)

        if emb1 is None or emb2 is None:
            return {"error": "Could not compute embeddings", "similarity": 0.0}

        # Compute cosine similarity
        import numpy as np
        emb1 = np.array(emb1)
        emb2 = np.array(emb2)
        similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

        return {
            "similarity": float(similarity),
            "doc1_chunks": len(chunks_1),
            "doc2_chunks": len(chunks_2),
            "error": None,
        }
    except Exception as e:
        return {"error": str(e), "similarity": 0.0}


def auto_index_processed_document(doc_id: str, text: str, chunks: list, metadata: dict = None):
    """
    Auto-index a processed document with pre-computed chunks.

    This is called after document processing completes to immediately
    make the document available in RAG.
    """
    system = get_unified_rag_system()
    if system["status"] != "ready":
        return {"success": False, "error": "RAG system not ready", "num_chunks": 0}

    try:
        store = system["store"]
        embedder = system["embedder"]

        # Prepare chunks for indexing
        chunk_dicts = []
        embeddings = []

        for i, chunk in enumerate(chunks):
            chunk_text = chunk.get("text", chunk) if isinstance(chunk, dict) else chunk

            if len(chunk_text.strip()) < 20:
                continue

            chunk_id = f"{doc_id}_chunk_{i}"
            chunk_dict = {
                "chunk_id": chunk_id,
                "document_id": doc_id,
                "text": chunk_text,
                "page": chunk.get("page", 0) if isinstance(chunk, dict) else 0,
                "chunk_type": "text",
                "source_path": metadata.get("filename", "") if metadata else "",
                "sequence_index": i,
            }
            chunk_dicts.append(chunk_dict)

            # Generate embedding
            embedding = embedder.embed_text(chunk_text)
            embeddings.append(embedding)

        if not chunk_dicts:
            return {"success": False, "error": "No valid chunks to index", "num_chunks": 0}

        # Add to store
        store.add_chunks(chunk_dicts, embeddings)

        return {"success": True, "num_chunks": len(chunk_dicts), "error": None}

    except Exception as e:
        return {"success": False, "error": str(e), "num_chunks": 0}
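Taken together, a demo page consumes this module roughly as follows (a sketch, assuming Ollama is reachable at localhost:11434 and the src.rag package imports cleanly; the document text and ID are illustrative):

from rag_config import get_unified_rag_system, index_document, query_rag, get_store_stats

system = get_unified_rag_system()  # @st.cache_resource: one shared instance for all pages
if system["status"] == "ready":
    result = index_document("Example document text...", document_id="example_doc")
    if result["success"]:
        response, error = query_rag("What does the document describe?")
    print(get_store_stats())  # {"total_chunks": ..., "embed_model": ..., "llm_model": ...}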
demo/requirements.txt
ADDED
@@ -0,0 +1,19 @@
# SPARKNET Demo Requirements
# Run: pip install -r demo/requirements.txt

# Streamlit
streamlit>=1.28.0

# Data handling
pandas>=2.0.0
numpy>=1.24.0

# HTTP client (for Ollama checks)
httpx>=0.25.0

# Image handling (optional, for advanced features)
Pillow>=10.0.0

# Charts (optional)
plotly>=5.18.0
altair>=5.2.0
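As the header comment says, the demo dependencies install separately from the core framework:

pip install -r demo/requirements.txt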
demo/state_manager.py
ADDED
@@ -0,0 +1,833 @@
"""
Unified State Manager for SPARKNET Demo

Enhanced state management for cross-module communication (Phase 1B):
- Document processing state tracking
- Indexed documents registry
- Cross-module event system (pub/sub)
- Real-time status updates
- Evidence highlighting synchronization
- Document selection synchronization
- Query/response sharing between modules
"""

import streamlit as st
from pathlib import Path
from typing import Dict, List, Any, Optional, Callable, Set
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
import hashlib
import json
import sys
import time
from threading import Lock

PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))


# ==============================================================================
# Event System (Phase 1B Enhancement)
# ==============================================================================

class EventType(str, Enum):
    """Cross-module event types for synchronization."""
    DOCUMENT_SELECTED = "document_selected"
    DOCUMENT_PROCESSED = "document_processed"
    DOCUMENT_INDEXED = "document_indexed"
    DOCUMENT_REMOVED = "document_removed"
    CHUNK_SELECTED = "chunk_selected"
    EVIDENCE_HIGHLIGHT = "evidence_highlight"
    RAG_QUERY_STARTED = "rag_query_started"
    RAG_QUERY_COMPLETED = "rag_query_completed"
    PAGE_CHANGED = "page_changed"
    PROCESSING_STARTED = "processing_started"
    PROCESSING_COMPLETED = "processing_completed"
    SYSTEM_STATUS_CHANGED = "system_status_changed"


@dataclass
class Event:
    """Cross-module event for synchronization."""
    event_type: EventType
    source_module: str
    payload: Dict[str, Any]
    timestamp: datetime = field(default_factory=datetime.now)
    event_id: str = field(default_factory=lambda: hashlib.md5(
        f"{time.time()}".encode()
    ).hexdigest()[:8])


@dataclass
class EvidenceHighlight:
    """Evidence highlight for cross-module visualization."""
    doc_id: str
    chunk_id: str
    page: int
    bbox: tuple  # (x_min, y_min, x_max, y_max)
    text_snippet: str
    confidence: float
    source_query: Optional[str] = None
    highlight_color: str = "#FFE082"  # Amber highlight


@dataclass
class ProcessedDocument:
    """Represents a processed document with all extracted data."""
    doc_id: str
    filename: str
    file_type: str
    raw_text: str
    chunks: List[Dict[str, Any]]
    page_count: int = 1
    page_images: List[bytes] = field(default_factory=list)
    ocr_regions: List[Dict[str, Any]] = field(default_factory=list)
    layout_data: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
    indexed: bool = False
    indexed_chunks: int = 0
    processing_time: float = 0.0
    created_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "doc_id": self.doc_id,
            "filename": self.filename,
            "file_type": self.file_type,
            "text_length": len(self.raw_text),
            "chunk_count": len(self.chunks),
            "page_count": self.page_count,
            "ocr_region_count": len(self.ocr_regions),
            "indexed": self.indexed,
            "indexed_chunks": self.indexed_chunks,
            "processing_time": self.processing_time,
            "created_at": self.created_at.isoformat(),
        }


@dataclass
class ProcessingStatus:
    """Tracks processing status for a document."""
    doc_id: str
    stage: str  # loading, ocr, chunking, embedding, indexing, complete, error
    progress: float  # 0.0 - 1.0
    message: str
    started_at: datetime = field(default_factory=datetime.now)
    completed_at: Optional[datetime] = None
    error: Optional[str] = None


class UnifiedStateManager:
    """
    Central state manager for SPARKNET demo.

    Enhanced with Phase 1B features:
    - Document processing state tracking
    - Indexed documents registry
    - Cross-module event system (pub/sub)
    - Real-time status updates
    - Evidence highlighting sync
    - Query/response sharing
    """

    def __init__(self):
        self._ensure_session_state()
        self._event_handlers: Dict[EventType, List[Callable]] = {}

    def _ensure_session_state(self):
        """Initialize session state if not exists."""
        if "unified_state" not in st.session_state:
            st.session_state.unified_state = {
                "documents": {},  # doc_id -> ProcessedDocument
                "processing_status": {},  # doc_id -> ProcessingStatus
                "indexed_doc_ids": set(),
                "active_doc_id": None,
                "active_page": 0,
                "active_chunk_id": None,
                "notifications": [],
                "rag_ready": False,
                "total_indexed_chunks": 0,
                "last_update": datetime.now().isoformat(),
                # Phase 1B: Cross-module sync
                "event_queue": [],  # List of Event objects
                "evidence_highlights": [],  # List of EvidenceHighlight
                "last_rag_query": None,
                "last_rag_response": None,
                "selected_sources": [],  # Source chunks from RAG
                "module_states": {},  # Per-module custom state
                "sync_version": 0,  # Increment on any state change
            }

    @property
    def state(self) -> Dict:
        """Get the unified state dict."""
        self._ensure_session_state()
        return st.session_state.unified_state

    # ==================== Document Management ====================

    def add_document(self, doc: ProcessedDocument) -> str:
        """Add a processed document to the state."""
        self.state["documents"][doc.doc_id] = doc
        self._notify(f"Document '{doc.filename}' added", "info")
        self._update_timestamp()
        return doc.doc_id

    def get_document(self, doc_id: str) -> Optional[ProcessedDocument]:
        """Get a document by ID."""
        return self.state["documents"].get(doc_id)

    def get_all_documents(self) -> List[ProcessedDocument]:
        """Get all documents."""
        return list(self.state["documents"].values())

    def get_indexed_documents(self) -> List[ProcessedDocument]:
        """Get only indexed documents."""
        return [d for d in self.state["documents"].values() if d.indexed]

    def remove_document(self, doc_id: str):
        """Remove a document from state."""
        if doc_id in self.state["documents"]:
            doc = self.state["documents"].pop(doc_id)
            self.state["indexed_doc_ids"].discard(doc_id)
            self._notify(f"Document '{doc.filename}' removed", "warning")
            self._update_timestamp()

    def set_active_document(self, doc_id: Optional[str]):
        """Set the currently active document."""
        self.state["active_doc_id"] = doc_id
        self._update_timestamp()

    def get_active_document(self) -> Optional[ProcessedDocument]:
        """Get the currently active document."""
        if self.state["active_doc_id"]:
            return self.get_document(self.state["active_doc_id"])
        return None

    # ==================== Processing Status ====================

    def start_processing(self, doc_id: str, filename: str):
        """Start processing a document."""
        status = ProcessingStatus(
            doc_id=doc_id,
            stage="loading",
            progress=0.0,
            message=f"Loading {filename}..."
        )
        self.state["processing_status"][doc_id] = status
        self._update_timestamp()

    def update_processing(self, doc_id: str, stage: str, progress: float, message: str):
        """Update processing status."""
        if doc_id in self.state["processing_status"]:
            status = self.state["processing_status"][doc_id]
            status.stage = stage
            status.progress = progress
            status.message = message
            self._update_timestamp()

    def complete_processing(self, doc_id: str, success: bool = True, error: str = None):
        """Mark processing as complete."""
        if doc_id in self.state["processing_status"]:
            status = self.state["processing_status"][doc_id]
            status.stage = "complete" if success else "error"
            status.progress = 1.0 if success else status.progress
            status.completed_at = datetime.now()
            status.error = error
            status.message = "Processing complete!" if success else f"Error: {error}"

            if success:
                self._notify(f"Document processed successfully!", "success")
            else:
                self._notify(f"Processing failed: {error}", "error")

            self._update_timestamp()

    def get_processing_status(self, doc_id: str) -> Optional[ProcessingStatus]:
        """Get processing status for a document."""
        return self.state["processing_status"].get(doc_id)

    def is_processing(self, doc_id: str) -> bool:
        """Check if document is being processed."""
        status = self.get_processing_status(doc_id)
        return status is not None and status.stage not in ["complete", "error"]

    # ==================== Indexing ====================

    def mark_indexed(self, doc_id: str, chunk_count: int):
        """Mark a document as indexed to RAG."""
        if doc_id in self.state["documents"]:
            doc = self.state["documents"][doc_id]
            doc.indexed = True
            doc.indexed_chunks = chunk_count
            self.state["indexed_doc_ids"].add(doc_id)
            self.state["total_indexed_chunks"] += chunk_count
            self._notify(f"Indexed {chunk_count} chunks from '{doc.filename}'", "success")
            self._update_timestamp()

    def is_indexed(self, doc_id: str) -> bool:
        """Check if document is indexed."""
        return doc_id in self.state["indexed_doc_ids"]

    def get_total_indexed_chunks(self) -> int:
        """Get total number of indexed chunks."""
        return self.state["total_indexed_chunks"]

    # ==================== Notifications ====================

    def _notify(self, message: str, level: str = "info"):
        """Add a notification."""
        self.state["notifications"].append({
            "message": message,
            "level": level,
            "timestamp": datetime.now().isoformat(),
        })
        # Keep only last 50 notifications
        if len(self.state["notifications"]) > 50:
            self.state["notifications"] = self.state["notifications"][-50:]

    def get_notifications(self, limit: int = 10) -> List[Dict]:
        """Get recent notifications."""
        return self.state["notifications"][-limit:]

    def clear_notifications(self):
        """Clear all notifications."""
        self.state["notifications"] = []

    # ==================== RAG Status ====================

    def set_rag_ready(self, ready: bool):
        """Set RAG system ready status."""
        self.state["rag_ready"] = ready
        self._update_timestamp()

    def is_rag_ready(self) -> bool:
        """Check if RAG is ready."""
        return self.state["rag_ready"]

    # ==================== Utilities ====================

    def _update_timestamp(self):
        """Update the last update timestamp."""
        self.state["last_update"] = datetime.now().isoformat()
        self.state["sync_version"] += 1

    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of current state."""
        return {
            "total_documents": len(self.state["documents"]),
            "indexed_documents": len(self.state["indexed_doc_ids"]),
            "total_indexed_chunks": self.state["total_indexed_chunks"],
            "active_doc_id": self.state["active_doc_id"],
            "active_page": self.state.get("active_page", 0),
            "rag_ready": self.state["rag_ready"],
            "last_update": self.state["last_update"],
            "sync_version": self.state.get("sync_version", 0),
            "processing_count": sum(
                1 for s in self.state["processing_status"].values()
                if s.stage not in ["complete", "error"]
            ),
            "evidence_count": len(self.state.get("evidence_highlights", [])),
        }

    def reset(self):
        """Reset all state."""
        st.session_state.unified_state = {
            "documents": {},
            "processing_status": {},
            "indexed_doc_ids": set(),
            "active_doc_id": None,
            "active_page": 0,
            "active_chunk_id": None,
            "notifications": [],
            "rag_ready": False,
            "total_indexed_chunks": 0,
            "last_update": datetime.now().isoformat(),
            "event_queue": [],
            "evidence_highlights": [],
            "last_rag_query": None,
            "last_rag_response": None,
            "selected_sources": [],
            "module_states": {},
            "sync_version": 0,
        }

    # ==================== Event System (Phase 1B) ====================

    def publish_event(
        self,
        event_type: EventType,
        source_module: str,
        payload: Dict[str, Any]
    ) -> Event:
        """
        Publish an event for cross-module synchronization.

        Args:
            event_type: Type of event
            source_module: Name of module publishing the event
            payload: Event data

        Returns:
            The created Event object
        """
        event = Event(
            event_type=event_type,
            source_module=source_module,
            payload=payload
        )

        # Add to event queue
        self.state["event_queue"].append(event)

        # Keep only last 100 events
        if len(self.state["event_queue"]) > 100:
            self.state["event_queue"] = self.state["event_queue"][-100:]

        # Call registered handlers
        if event_type in self._event_handlers:
            for handler in self._event_handlers[event_type]:
                try:
                    handler(event)
                except Exception as e:
                    self._notify(f"Event handler error: {e}", "error")

        self._update_timestamp()
        return event

    def subscribe(self, event_type: EventType, handler: Callable[[Event], None]):
        """
        Subscribe to an event type.

        Args:
            event_type: Type of event to subscribe to
            handler: Callback function to handle the event
        """
        if event_type not in self._event_handlers:
            self._event_handlers[event_type] = []
        self._event_handlers[event_type].append(handler)

    def unsubscribe(self, event_type: EventType, handler: Callable[[Event], None]):
        """Unsubscribe from an event type."""
        if event_type in self._event_handlers:
            self._event_handlers[event_type] = [
                h for h in self._event_handlers[event_type] if h != handler
            ]

    def get_recent_events(
        self,
        event_type: Optional[EventType] = None,
        limit: int = 10
    ) -> List[Event]:
        """Get recent events, optionally filtered by type."""
        events = self.state.get("event_queue", [])

        if event_type:
            events = [e for e in events if e.event_type == event_type]

        return events[-limit:]

    # ==================== Evidence Highlighting (Phase 1B) ====================

    def add_evidence_highlight(self, highlight: EvidenceHighlight):
        """
        Add an evidence highlight for cross-module visualization.

        Used when RAG finds relevant evidence that should be displayed
        in the Document Viewer or Evidence Viewer.
        """
        self.state["evidence_highlights"].append(highlight)

        # Publish event for other modules
        self.publish_event(
            EventType.EVIDENCE_HIGHLIGHT,
            source_module="rag",
            payload={
                "doc_id": highlight.doc_id,
                "chunk_id": highlight.chunk_id,
                "page": highlight.page,
                "bbox": highlight.bbox,
                "text_snippet": highlight.text_snippet[:100],
            }
        )

        self._update_timestamp()

    def clear_evidence_highlights(self, doc_id: Optional[str] = None):
        """Clear evidence highlights, optionally for a specific document."""
        if doc_id:
            self.state["evidence_highlights"] = [
                h for h in self.state["evidence_highlights"]
                if h.doc_id != doc_id
            ]
        else:
            self.state["evidence_highlights"] = []

        self._update_timestamp()

    def get_evidence_highlights(
        self,
        doc_id: Optional[str] = None,
        page: Optional[int] = None
    ) -> List[EvidenceHighlight]:
        """Get evidence highlights, optionally filtered by doc_id and page."""
        highlights = self.state.get("evidence_highlights", [])

        if doc_id:
            highlights = [h for h in highlights if h.doc_id == doc_id]

        if page is not None:
            highlights = [h for h in highlights if h.page == page]

        return highlights

    # ==================== Page/Chunk Selection (Phase 1B) ====================

    def select_page(self, page: int, source_module: str = "unknown"):
        """
        Set the active page and notify other modules.

        Used for synchronized scrolling between Document Viewer and Evidence Viewer.
        """
        old_page = self.state.get("active_page", 0)
        self.state["active_page"] = page

        if old_page != page:
            self.publish_event(
                EventType.PAGE_CHANGED,
                source_module=source_module,
                payload={"page": page, "previous_page": old_page}
            )

    def get_active_page(self) -> int:
        """Get the currently active page."""
        return self.state.get("active_page", 0)

    def select_chunk(
        self,
        chunk_id: str,
        doc_id: str,
        source_module: str = "unknown"
    ):
        """
        Select a chunk and navigate to its location.

        Publishes event to trigger synchronized navigation.
        """
        self.state["active_chunk_id"] = chunk_id

        # Get chunk details to navigate
        doc = self.get_document(doc_id)
        if doc:
            for chunk in doc.chunks:
                if chunk.get("chunk_id") == chunk_id:
                    page = chunk.get("page", 0)
                    self.select_page(page, source_module)

                    self.publish_event(
                        EventType.CHUNK_SELECTED,
                        source_module=source_module,
                        payload={
                            "chunk_id": chunk_id,
                            "doc_id": doc_id,
                            "page": page,
                            "bbox": chunk.get("bbox"),
                        }
                    )
                    break

    def get_active_chunk_id(self) -> Optional[str]:
        """Get the currently selected chunk ID."""
        return self.state.get("active_chunk_id")

    # ==================== RAG Query Sync (Phase 1B) ====================

    def store_rag_query(
        self,
        query: str,
        response: Dict[str, Any],
        sources: List[Dict[str, Any]]
    ):
        """
        Store the last RAG query and response for cross-module access.

        Allows Evidence Viewer to display sources from Interactive RAG.
        """
        self.state["last_rag_query"] = query
        self.state["last_rag_response"] = response
        self.state["selected_sources"] = sources

        # Clear old highlights and add new ones from sources
        self.clear_evidence_highlights()

        for source in sources:
            if all(k in source for k in ["doc_id", "chunk_id", "page"]):
                bbox = source.get("bbox", (0, 0, 1, 1))
                if isinstance(bbox, dict):
                    bbox = (bbox.get("x_min", 0), bbox.get("y_min", 0),
                            bbox.get("x_max", 1), bbox.get("y_max", 1))

                highlight = EvidenceHighlight(
                    doc_id=source["doc_id"],
                    chunk_id=source["chunk_id"],
                    page=source["page"],
                    bbox=bbox,
                    text_snippet=source.get("text", "")[:200],
                    confidence=source.get("score", 0.0),
                    source_query=query,
                )
                self.add_evidence_highlight(highlight)

        self.publish_event(
            EventType.RAG_QUERY_COMPLETED,
            source_module="rag",
            payload={
                "query": query,
                "source_count": len(sources),
                "response_length": len(str(response)),
            }
        )

        self._update_timestamp()

    def get_last_rag_query(self) -> Optional[str]:
        """Get the last RAG query."""
        return self.state.get("last_rag_query")

    def get_last_rag_response(self) -> Optional[Dict[str, Any]]:
        """Get the last RAG response."""
        return self.state.get("last_rag_response")

    def get_selected_sources(self) -> List[Dict[str, Any]]:
        """Get the sources from the last RAG query."""
        return self.state.get("selected_sources", [])

    # ==================== Module State (Phase 1B) ====================

    def set_module_state(self, module_name: str, state: Dict[str, Any]):
        """
        Store custom state for a specific module.

        Allows modules to persist their own state across reruns.
        """
        self.state["module_states"][module_name] = {
            **state,
            "updated_at": datetime.now().isoformat()
        }

    def get_module_state(self, module_name: str) -> Dict[str, Any]:
        """Get custom state for a specific module."""
        return self.state.get("module_states", {}).get(module_name, {})

    def get_sync_version(self) -> int:
        """
        Get the current sync version.

        Modules can use this to detect if state has changed since last check.
        """
        return self.state.get("sync_version", 0)


def generate_doc_id(filename: str, content_hash: str = None) -> str:
    """Generate a unique document ID."""
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    base = f"{filename}_{timestamp}"
    if content_hash:
        base = f"{base}_{content_hash[:8]}"
    return hashlib.md5(base.encode()).hexdigest()[:12]


def get_state_manager() -> UnifiedStateManager:
    """Get or create the unified state manager."""
    if "state_manager_instance" not in st.session_state:
        st.session_state.state_manager_instance = UnifiedStateManager()
    return st.session_state.state_manager_instance


# ==================== Global Status Bar Component ====================

def render_global_status_bar():
    """Render a global status bar showing system state."""
    manager = get_state_manager()
    summary = manager.get_summary()

    # Import RAG config for additional status
    try:
        from rag_config import get_unified_rag_system, check_ollama
        rag_system = get_unified_rag_system()
        ollama_ok, models = check_ollama()
        rag_status = rag_system["status"]
        llm_model = rag_system.get("llm_model", "N/A")
    except:
        ollama_ok = False
        rag_status = "error"
        llm_model = "N/A"
        models = []

    # Status bar
    cols = st.columns(6)

    with cols[0]:
        if ollama_ok:
            st.success(f"Ollama ({len(models)})")
        else:
            st.error("Ollama Offline")

    with cols[1]:
        if rag_status == "ready":
            st.success("RAG Ready")
        else:
            st.error("RAG Error")

    with cols[2]:
        st.info(f"{llm_model.split(':')[0]}")

    with cols[3]:
        st.info(f"{summary['total_documents']} Docs")

    with cols[4]:
        if summary['indexed_documents'] > 0:
            st.success(f"{summary['total_indexed_chunks']} Chunks")
        else:
            st.warning("0 Chunks")

    with cols[5]:
        if summary['processing_count'] > 0:
            st.warning(f"Processing...")
        else:
            st.info("Idle")


def render_notifications():
    """Render recent notifications."""
    manager = get_state_manager()
    notifications = manager.get_notifications(5)

    if notifications:
        for notif in reversed(notifications):
            level = notif["level"]
            msg = notif["message"]
            if level == "success":
                st.success(msg)
            elif level == "error":
                st.error(msg)
            elif level == "warning":
                st.warning(msg)
            else:
                st.info(msg)


# ==================== Helper Components (Phase 1B) ====================

def render_evidence_panel():
    """
    Render a panel showing current evidence highlights.

    Can be used in any module to show sources from RAG queries.
    """
    manager = get_state_manager()
    highlights = manager.get_evidence_highlights()

    if not highlights:
        st.info("No evidence highlights. Run a RAG query to see sources.")
        return

    st.subheader(f"Evidence Sources ({len(highlights)})")

    for i, h in enumerate(highlights):
        with st.expander(f"Source {i+1}: Page {h.page + 1} ({h.confidence:.0%})"):
            st.markdown(f"**Document:** {h.doc_id}")
            st.markdown(f"**Text:** {h.text_snippet}")

            if h.source_query:
                st.markdown(f"**Query:** _{h.source_query}_")

            # Button to navigate to source
            if st.button(f"View in Document", key=f"view_source_{i}"):
                manager.set_active_document(h.doc_id)
                manager.select_page(h.page, "evidence_panel")
                manager.select_chunk(h.chunk_id, h.doc_id, "evidence_panel")
                st.rerun()


def render_sync_status():
    """Render sync status indicator for debugging."""
    manager = get_state_manager()
    summary = manager.get_summary()

    with st.expander("Sync Status", expanded=False):
        st.json({
            "sync_version": summary["sync_version"],
            "active_doc": summary["active_doc_id"],
            "active_page": summary["active_page"],
            "evidence_count": summary["evidence_count"],
            "last_update": summary["last_update"],
        })

        # Recent events
        events = manager.get_recent_events(limit=5)
        if events:
            st.subheader("Recent Events")
            for event in reversed(events):
                st.text(f"{event.event_type.value}: {event.source_module}")


def render_document_selector():
    """
    Render a document selector that syncs with state manager.

    Returns the selected document ID.
    """
    manager = get_state_manager()
    documents = manager.get_all_documents()

    if not documents:
        st.info("No documents uploaded. Upload a document to get started.")
        return None

    # Get current selection
    active_doc_id = manager.state.get("active_doc_id")

    # Create options
    options = {doc.doc_id: f"{doc.filename} ({doc.indexed_chunks} chunks)" for doc in documents}
    option_list = list(options.keys())

    # Find current index
    current_index = option_list.index(active_doc_id) if active_doc_id in option_list else 0

    # Render selectbox
    selected_id = st.selectbox(
        "Select Document",
        options=option_list,
        format_func=lambda x: options[x],
        index=current_index,
        key="global_doc_selector"
    )

    # Update state if changed
    if selected_id != active_doc_id:
        manager.set_active_document(selected_id)
        manager.publish_event(
            EventType.DOCUMENT_SELECTED,
            source_module="selector",
            payload={"doc_id": selected_id}
        )

    return selected_id


def create_sync_callback(module_name: str) -> Callable:
    """
    Create a rerun callback for a module.

    Returns a function that can be used as an event handler
    to trigger Streamlit rerun when relevant events occur.
    """
    def callback(event: Event):
        # Only rerun if event is from a different module
        if event.source_module != module_name:
            # Store that we need to rerun
            st.session_state[f"_{module_name}_needs_rerun"] = True

    return callback
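A sketch of how two demo pages coordinate through this module (the module names and doc_id are illustrative; the API is the one defined above):

from state_manager import EventType, get_state_manager, create_sync_callback

manager = get_state_manager()

# Page A (e.g. the Evidence Viewer) asks to be flagged for rerun when another page selects a document.
manager.subscribe(EventType.DOCUMENT_SELECTED, create_sync_callback("evidence_viewer"))

# Page B announces a selection; subscribed handlers fire synchronously inside publish_event.
manager.publish_event(
    EventType.DOCUMENT_SELECTED,
    source_module="document_viewer",
    payload={"doc_id": "a1b2c3d4e5f6"},
)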
docker-compose.dev.yml
ADDED
@@ -0,0 +1,66 @@
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
# SPARKNET Development Docker Compose
|
| 4 |
+
# Lighter configuration for local development
|
| 5 |
+
|
| 6 |
+
services:
|
| 7 |
+
sparknet-api:
|
| 8 |
+
build:
|
| 9 |
+
context: .
|
| 10 |
+
dockerfile: Dockerfile
|
| 11 |
+
target: development
|
| 12 |
+
container_name: sparknet-api-dev
|
| 13 |
+
ports:
|
| 14 |
+
- "8000:8000"
|
| 15 |
+
volumes:
|
| 16 |
+
- .:/app
|
| 17 |
+
- ./data:/app/data
|
| 18 |
+
- ./uploads:/app/uploads
|
| 19 |
+
- ./outputs:/app/outputs
|
| 20 |
+
environment:
|
| 21 |
+
- PYTHONPATH=/app
|
| 22 |
+
- OLLAMA_HOST=http://host.docker.internal:11434
|
| 23 |
+
- LOG_LEVEL=DEBUG
|
| 24 |
+
- SPARKNET_SECRET_KEY=dev-secret-key
|
| 25 |
+
extra_hosts:
|
| 26 |
+
- "host.docker.internal:host-gateway"
|
| 27 |
+
networks:
|
| 28 |
+
- sparknet-dev-network
|
| 29 |
+
restart: unless-stopped
|
| 30 |
+
|
| 31 |
+
sparknet-demo:
|
| 32 |
+
build:
|
| 33 |
+
context: .
|
| 34 |
+
dockerfile: Dockerfile
|
| 35 |
+
target: development
|
| 36 |
+
container_name: sparknet-demo-dev
|
| 37 |
+
command: ["streamlit", "run", "demo/app.py", "--server.address", "0.0.0.0", "--server.port", "4000", "--server.runOnSave", "true"]
|
| 38 |
+
ports:
|
| 39 |
+
- "4000:4000"
|
| 40 |
+
volumes:
|
| 41 |
+
- .:/app
|
| 42 |
+
- ./data:/app/data
|
| 43 |
+
- ./uploads:/app/uploads
|
| 44 |
+
environment:
|
| 45 |
+
- PYTHONPATH=/app
|
| 46 |
+
- OLLAMA_HOST=http://host.docker.internal:11434
|
| 47 |
+
- API_URL=http://sparknet-api:8000
|
| 48 |
+
extra_hosts:
|
| 49 |
+
- "host.docker.internal:host-gateway"
|
| 50 |
+
depends_on:
|
| 51 |
+
- sparknet-api
|
| 52 |
+
networks:
|
| 53 |
+
- sparknet-dev-network
|
| 54 |
+
restart: unless-stopped
|
| 55 |
+
|
| 56 |
+
redis:
|
| 57 |
+
image: redis:7-alpine
|
| 58 |
+
container_name: sparknet-redis-dev
|
| 59 |
+
ports:
|
| 60 |
+
- "6379:6379"
|
| 61 |
+
networks:
|
| 62 |
+
- sparknet-dev-network
|
| 63 |
+
|
| 64 |
+
networks:
|
| 65 |
+
sparknet-dev-network:
|
| 66 |
+
driver: bridge
|
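Both app containers above are configured purely through environment variables; a minimal sketch of the consuming side, assuming `requests` is installed (the helper name is illustrative, and the `/api/health` path matches the healthcheck in the production compose file below):

```python
# Reads the endpoints this compose file injects; the defaults mirror a
# run outside Docker.
import os
import requests

API_URL = os.environ.get("API_URL", "http://localhost:8000")
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://localhost:11434")

def api_is_up() -> bool:
    try:
        return requests.get(f"{API_URL}/api/health", timeout=5).ok
    except requests.RequestException:
        return False
```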
docker-compose.yml
ADDED
@@ -0,0 +1,163 @@
+version: '3.8'
+
+# SPARKNET Docker Compose Configuration
+# Full stack deployment with all services
+
+services:
+  # ============== Main Application ==============
+  sparknet-api:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: production
+    container_name: sparknet-api
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./data:/app/data
+      - ./uploads:/app/uploads
+      - ./outputs:/app/outputs
+      - ./logs:/app/logs
+    environment:
+      - PYTHONPATH=/app
+      - OLLAMA_HOST=http://ollama:11434
+      - CHROMA_HOST=chromadb
+      - CHROMA_PORT=8000
+      - REDIS_URL=redis://redis:6379
+      - SPARKNET_SECRET_KEY=${SPARKNET_SECRET_KEY:-sparknet-docker-secret-key}
+      - LOG_LEVEL=INFO
+    depends_on:
+      ollama:
+        condition: service_healthy
+      chromadb:
+        condition: service_started
+      redis:
+        condition: service_healthy
+    networks:
+      - sparknet-network
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/api/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+
+  sparknet-demo:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: production
+    container_name: sparknet-demo
+    command: ["streamlit", "run", "demo/app.py", "--server.address", "0.0.0.0", "--server.port", "4000"]
+    ports:
+      - "4000:4000"
+    volumes:
+      - ./data:/app/data
+      - ./uploads:/app/uploads
+      - ./outputs:/app/outputs
+    environment:
+      - PYTHONPATH=/app
+      - OLLAMA_HOST=http://ollama:11434
+      - CHROMA_HOST=chromadb
+      - CHROMA_PORT=8000
+      - API_URL=http://sparknet-api:8000
+    depends_on:
+      - sparknet-api
+    networks:
+      - sparknet-network
+    restart: unless-stopped
+
+  # ============== Ollama LLM Service ==============
+  ollama:
+    image: ollama/ollama:latest
+    container_name: sparknet-ollama
+    ports:
+      - "11434:11434"
+    volumes:
+      - ollama_data:/root/.ollama
+    environment:
+      - OLLAMA_KEEP_ALIVE=24h
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    networks:
+      - sparknet-network
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 120s
+
+  # ============== ChromaDB Vector Store ==============
+  chromadb:
+    image: chromadb/chroma:latest
+    container_name: sparknet-chromadb
+    ports:
+      - "8001:8000"
+    volumes:
+      - chroma_data:/chroma/chroma
+    environment:
+      - IS_PERSISTENT=TRUE
+      - PERSIST_DIRECTORY=/chroma/chroma
+      - ANONYMIZED_TELEMETRY=FALSE
+    networks:
+      - sparknet-network
+    restart: unless-stopped
+
+  # ============== Redis Cache ==============
+  redis:
+    image: redis:7-alpine
+    container_name: sparknet-redis
+    ports:
+      - "6379:6379"
+    volumes:
+      - redis_data:/data
+    command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
+    networks:
+      - sparknet-network
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # ============== Nginx Reverse Proxy (Optional) ==============
+  nginx:
+    image: nginx:alpine
+    container_name: sparknet-nginx
+    ports:
+      - "80:80"
+      - "443:443"
+    volumes:
+      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+      - ./nginx/ssl:/etc/nginx/ssl:ro
+    depends_on:
+      - sparknet-api
+      - sparknet-demo
+    networks:
+      - sparknet-network
+    restart: unless-stopped
+    profiles:
+      - production
+
+# ============== Volumes ==============
+volumes:
+  ollama_data:
+    driver: local
+  chroma_data:
+    driver: local
+  redis_data:
+    driver: local
+
+# ============== Networks ==============
+networks:
+  sparknet-network:
+    driver: bridge
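One gap worth noting: the `ollama` service starts with an empty model store, so a model has to be pulled once after `docker compose up`. A sketch using Ollama's HTTP pull API on the published port; the model name is an example, not something this file installs:

```python
# One-time model pull against the ollama service published on 11434.
import json
import requests

resp = requests.post(
    "http://localhost:11434/api/pull",
    json={"name": "llama3"},  # example model, adjust as needed
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(json.loads(line).get("status", ""))
```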
docs/CLOUD_ARCHITECTURE.md
ADDED
@@ -0,0 +1,398 @@
+# SPARKNET Cloud Architecture
+
+This document outlines the cloud-ready architecture for deploying SPARKNET on AWS.
+
+## Overview
+
+SPARKNET is designed with a modular architecture that supports both local development and cloud deployment. The system can scale from a single developer machine to enterprise-grade cloud infrastructure.
+
+## Local Development Stack
+
+```
+┌─────────────────────────────────────────────────────┐
+│                   Local Machine                      │
+├─────────────────────────────────────────────────────┤
+│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  │
+│  │   Ollama    │  │  ChromaDB   │  │  File I/O   │  │
+│  │   (LLM)     │  │  (Vector)   │  │  (Storage)  │  │
+│  └─────────────┘  └─────────────┘  └─────────────┘  │
+│         │                │                │          │
+│         └────────────────┼────────────────┘          │
+│                          │                           │
+│                 ┌────────┴────────┐                  │
+│                 │    SPARKNET     │                  │
+│                 │   Application   │                  │
+│                 └─────────────────┘                  │
+└─────────────────────────────────────────────────────┘
+```
+
+## AWS Cloud Architecture
+
+### Target Architecture
+
+```
+┌────────────────────────────────────────────────────────────────────┐
+│                             AWS Cloud                              │
+├────────────────────────────────────────────────────────────────────┤
+│                                                                    │
+│  ┌─────────────┐      ┌─────────────┐      ┌─────────────────────┐ │
+│  │   API GW    │──────│   Lambda    │──────│   Step Functions    │ │
+│  │   (REST)    │      │  (Compute)  │      │   (Orchestration)   │ │
+│  └─────────────┘      └─────────────┘      └─────────────────────┘ │
+│         │                    │                        │            │
+│         │                    │                        │            │
+│         ▼                    ▼                        ▼            │
+│  ┌─────────────┐      ┌─────────────┐      ┌─────────────────────┐ │
+│  │     S3      │      │   Bedrock   │      │     OpenSearch      │ │
+│  │  (Storage)  │      │    (LLM)    │      │   (Vector Store)    │ │
+│  └─────────────┘      └─────────────┘      └─────────────────────┘ │
+│                                                                    │
+│  ┌─────────────┐      ┌─────────────┐      ┌─────────────────────┐ │
+│  │  Textract   │      │    Titan    │      │      DynamoDB       │ │
+│  │    (OCR)    │      │ (Embeddings)│      │     (Metadata)      │ │
+│  └─────────────┘      └─────────────┘      └─────────────────────┘ │
+│                                                                    │
+└────────────────────────────────────────────────────────────────────┘
+```
+
+### Component Mapping
+
+| Local Component | AWS Service | Purpose |
+|----------------|-------------|---------|
+| File I/O | S3 | Document storage |
+| PaddleOCR/Tesseract | Textract | OCR extraction |
+| Ollama LLM | Bedrock (Claude/Titan) | Text generation |
+| Ollama Embeddings | Titan Embeddings | Vector embeddings |
+| ChromaDB | OpenSearch Serverless | Vector search |
+| SQLite (optional) | DynamoDB | Metadata storage |
+| Python Process | Lambda | Compute |
+| CLI | API Gateway | HTTP interface |
+
+## Migration Strategy
+
+### Phase 1: Storage Migration
+
+```python
+# Abstract storage interface
+class StorageAdapter:
+    def put(self, key: str, data: bytes) -> str: ...
+    def get(self, key: str) -> bytes: ...
+    def delete(self, key: str) -> bool: ...
+
+# Local implementation
+class LocalStorageAdapter(StorageAdapter):
+    def __init__(self, base_path: str):
+        self.base_path = Path(base_path)
+
+# S3 implementation
+class S3StorageAdapter(StorageAdapter):
+    def __init__(self, bucket: str):
+        self.client = boto3.client('s3')
+        self.bucket = bucket
+```
+
+### Phase 2: OCR Migration
+
+```python
+# Abstract OCR interface
+class OCREngine:
+    def recognize(self, image: np.ndarray) -> OCRResult: ...
+
+# Local: PaddleOCR
+class PaddleOCREngine(OCREngine): ...
+
+# Cloud: Textract
+class TextractEngine(OCREngine):
+    def __init__(self):
+        self.client = boto3.client('textract')
+
+    def recognize(self, image: np.ndarray) -> OCRResult:
+        # Textract expects encoded image bytes, not a raw array
+        buffer = io.BytesIO()
+        Image.fromarray(image).save(buffer, format="PNG")
+        response = self.client.detect_document_text(
+            Document={'Bytes': buffer.getvalue()}
+        )
+        return self._convert_response(response)
+```
+
+### Phase 3: LLM Migration
+
+```python
+# Abstract LLM interface
+class LLMAdapter:
+    def generate(self, prompt: str) -> str: ...
+
+# Local: Ollama
+class OllamaAdapter(LLMAdapter): ...
+
+# Cloud: Bedrock
+class BedrockAdapter(LLMAdapter):
+    def __init__(self, model_id: str = "anthropic.claude-3-sonnet"):
+        self.client = boto3.client('bedrock-runtime')
+        self.model_id = model_id
+
+    def generate(self, prompt: str) -> str:
+        response = self.client.invoke_model(
+            modelId=self.model_id,
+            body=json.dumps({"prompt": prompt})  # payload shape is model-specific
+        )
+        body = json.loads(response['body'].read())  # the body arrives as a stream
+        return body.get('completion', '')  # response key varies by model family
+```
+
+### Phase 4: Vector Store Migration
+
+```python
+# Abstract vector store interface (already implemented)
+class VectorStore:
+    def add_chunks(self, chunks, embeddings): ...
+    def search(self, query_embedding, top_k): ...
+
+# Local: ChromaDB (already implemented)
+class ChromaVectorStore(VectorStore): ...
+
+# Cloud: OpenSearch
+class OpenSearchVectorStore(VectorStore):
+    def __init__(self, endpoint: str, index: str):
+        self.client = OpenSearch(hosts=[endpoint])
+        self.index = index
+
+    def search(self, query_embedding, top_k):
+        response = self.client.search(
+            index=self.index,
+            body={
+                "size": top_k,
+                "query": {  # k-NN queries sit under "query" in OpenSearch
+                    "knn": {
+                        "embedding": {
+                            "vector": query_embedding,
+                            "k": top_k
+                        }
+                    }
+                }
+            }
+        )
+        return self._convert_results(response)
+```
+
+## AWS Services Deep Dive
+
+### Amazon S3
+
+- **Purpose**: Document storage and processed results
+- **Structure**:
+  ```
+  s3://sparknet-documents/
+  ├── raw/                  # Original documents
+  │   └── {doc_id}/
+  │       └── document.pdf
+  ├── processed/            # Processed results
+  │   └── {doc_id}/
+  │       ├── metadata.json
+  │       ├── chunks.json
+  │       └── pages/
+  │           ├── page_0.png
+  │           └── page_1.png
+  └── cache/                # Processing cache
+  ```
+
+### Amazon Textract
+
+- **Purpose**: OCR extraction with layout analysis
+- **Features**:
+  - Document text detection
+  - Table extraction
+  - Form extraction
+  - Handwriting recognition
+
+### Amazon Bedrock
+
+- **Purpose**: LLM inference
+- **Models**:
+  - Claude 3.5 Sonnet (primary)
+  - Titan Text (cost-effective)
+  - Titan Embeddings (vectors)
+
+### Amazon OpenSearch Serverless
+
+- **Purpose**: Vector search and retrieval
+- **Configuration**:
+  ```json
+  {
+    "index": "sparknet-vectors",
+    "settings": {
+      "index.knn": true,
+      "index.knn.space_type": "cosinesimil"
+    },
+    "mappings": {
+      "properties": {
+        "embedding": {
+          "type": "knn_vector",
+          "dimension": 1024
+        }
+      }
+    }
+  }
+  ```
+
+### AWS Lambda
+
+- **Purpose**: Serverless compute
+- **Functions**:
+  - `process-document`: Document processing pipeline
+  - `extract-fields`: Field extraction
+  - `rag-query`: RAG query handling
+  - `index-document`: Vector indexing
+
+### AWS Step Functions
+
+- **Purpose**: Workflow orchestration
+- **Workflow**:
+  ```json
+  {
+    "StartAt": "ProcessDocument",
+    "States": {
+      "ProcessDocument": {
+        "Type": "Task",
+        "Resource": "arn:aws:lambda:process-document",
+        "Next": "IndexChunks"
+      },
+      "IndexChunks": {
+        "Type": "Task",
+        "Resource": "arn:aws:lambda:index-document",
+        "End": true
+      }
+    }
+  }
+  ```
+
+## Cost Optimization
+
+### Tiered Processing
+
+| Tier | Use Case | Services | Cost |
+|------|----------|----------|------|
+| Basic | Simple OCR | Textract + Titan | $ |
+| Standard | Full pipeline | + Claude Haiku | $$ |
+| Premium | Complex analysis | + Claude Sonnet | $$$ |
+
+### Caching Strategy
+
+1. **Document Cache**: S3 with lifecycle policies
+2. **Embedding Cache**: ElastiCache (Redis)
+3. **Query Cache**: Lambda@Edge
+
+## Security
+
+### IAM Policies
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "s3:GetObject",
+        "s3:PutObject"
+      ],
+      "Resource": "arn:aws:s3:::sparknet-documents/*"
+    },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "textract:DetectDocumentText",
+        "textract:AnalyzeDocument"
+      ],
+      "Resource": "*"
+    },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "bedrock:InvokeModel"
+      ],
+      "Resource": "arn:aws:bedrock:*::foundation-model/*"
+    }
+  ]
+}
+```
+
+### Data Encryption
+
+- S3: Server-side encryption (SSE-S3 or SSE-KMS)
+- OpenSearch: Encryption at rest
+- Lambda: Environment variable encryption
+
+## Deployment
+
+### Infrastructure as Code (Terraform)
+
+```hcl
+# S3 Bucket
+resource "aws_s3_bucket" "documents" {
+  bucket = "sparknet-documents"
+}
+
+# Lambda Function
+resource "aws_lambda_function" "processor" {
+  function_name = "sparknet-processor"
+  runtime       = "python3.11"
+  handler       = "handler.process"
+  memory_size   = 1024
+  timeout       = 300
+}
+
+# OpenSearch Serverless
+resource "aws_opensearchserverless_collection" "vectors" {
+  name = "sparknet-vectors"
+  type = "VECTORSEARCH"
+}
+```
+
+### CI/CD Pipeline
+
+```yaml
+# GitHub Actions
+name: Deploy SPARKNET
+
+on:
+  push:
+    branches: [main]
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Deploy Lambda
+        run: |
+          aws lambda update-function-code \
+            --function-name sparknet-processor \
+            --zip-file fileb://package.zip
+```
+
+## Monitoring
+
+### CloudWatch Metrics
+
+- Lambda invocations and duration
+- S3 request counts
+- OpenSearch query latency
+- Bedrock token usage
+
+### Dashboards
+
+- Processing throughput
+- Error rates
+- Cost tracking
+- Vector store statistics
+
+## Next Steps
+
+1. **Implement Storage Abstraction**: Create S3 adapter
+2. **Add Textract Engine**: Implement AWS OCR
+3. **Create Bedrock Adapter**: LLM migration
+4. **Deploy OpenSearch**: Vector store setup
+5. **Build Lambda Functions**: Serverless compute
+6. **Setup Step Functions**: Workflow orchestration
+7. **Configure CI/CD**: Automated deployment
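Phase 1 in the document above shows only the adapter interface; a minimal completion under the same design, with method bodies that are assumptions rather than code from this commit:

```python
# Sketch completing the Phase 1 storage adapters; error handling omitted.
from pathlib import Path
import boto3

class LocalStorageAdapter:
    def __init__(self, base_path: str):
        self.base_path = Path(base_path)

    def put(self, key: str, data: bytes) -> str:
        path = self.base_path / key
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(data)
        return str(path)

    def get(self, key: str) -> bytes:
        return (self.base_path / key).read_bytes()

    def delete(self, key: str) -> bool:
        path = self.base_path / key
        if path.exists():
            path.unlink()
            return True
        return False

class S3StorageAdapter:
    def __init__(self, bucket: str):
        self.client = boto3.client('s3')
        self.bucket = bucket

    def put(self, key: str, data: bytes) -> str:
        self.client.put_object(Bucket=self.bucket, Key=key, Body=data)
        return f"s3://{self.bucket}/{key}"

    def get(self, key: str) -> bytes:
        return self.client.get_object(Bucket=self.bucket, Key=key)['Body'].read()

    def delete(self, key: str) -> bool:
        self.client.delete_object(Bucket=self.bucket, Key=key)
        return True
```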
docs/DOCUMENT_INTELLIGENCE.md
ADDED
@@ -0,0 +1,470 @@
+# SPARKNET Document Intelligence
+
+A vision-first agentic document understanding platform that goes beyond OCR, supports complex layouts, and produces LLM-ready, visually grounded outputs suitable for RAG and field extraction at scale.
+
+## Overview
+
+The Document Intelligence subsystem provides:
+
+- **Vision-First Understanding**: Treats documents as visual objects, not just text
+- **Semantic Chunking**: Classifies regions by type (text, table, figure, chart, form, etc.)
+- **Visual Grounding**: Every extraction includes evidence (page, bbox, snippet, confidence)
+- **Zero-Shot Capability**: Works across diverse document formats without training
+- **Schema-Driven Extraction**: Define fields using JSON Schema or Pydantic models
+- **Abstention Policy**: Never guesses; abstains when confidence is low
+- **Local-First**: All processing happens locally for privacy
+
+## Quick Start
+
+### Basic Parsing
+
+```python
+from src.document_intelligence import DocumentParser, ParserConfig
+
+# Configure parser
+config = ParserConfig(
+    render_dpi=200,
+    max_pages=10,
+    include_markdown=True,
+)
+
+parser = DocumentParser(config=config)
+result = parser.parse("document.pdf")
+
+print(f"Parsed {len(result.chunks)} chunks from {result.num_pages} pages")
+
+# Access chunks
+for chunk in result.chunks:
+    print(f"[Page {chunk.page}] {chunk.chunk_type.value}: {chunk.text[:100]}...")
+```
+
+### Field Extraction
+
+```python
+from src.document_intelligence import (
+    FieldExtractor,
+    ExtractionSchema,
+    create_invoice_schema,
+)
+
+# Use preset schema
+schema = create_invoice_schema()
+
+# Or create custom schema
+schema = ExtractionSchema(name="CustomSchema")
+schema.add_string_field("company_name", "Name of the company", required=True)
+schema.add_date_field("document_date", "Date on document")
+schema.add_currency_field("total_amount", "Total amount")
+
+# Extract fields
+extractor = FieldExtractor()
+extraction = extractor.extract(parse_result, schema)
+
+print("Extracted Data:")
+for key, value in extraction.data.items():
+    if key in extraction.abstained_fields:
+        print(f"  {key}: [ABSTAINED]")
+    else:
+        print(f"  {key}: {value}")
+
+print(f"Confidence: {extraction.overall_confidence:.2f}")
+```
+
+### Visual Grounding
+
+```python
+from src.document_intelligence import (
+    load_document,
+    RenderOptions,
+)
+from src.document_intelligence.grounding import (
+    crop_region,
+    create_annotated_image,
+    EvidenceBuilder,
+)
+
+# Load and render page
+loader, renderer = load_document("document.pdf")
+page_image = renderer.render_page(1, RenderOptions(dpi=200))
+
+# Create annotated visualization
+bboxes = [chunk.bbox for chunk in result.chunks if chunk.page == 1]
+labels = [chunk.chunk_type.value for chunk in result.chunks if chunk.page == 1]
+annotated = create_annotated_image(page_image, bboxes, labels)
+
+# Crop a specific region (e.g., the first chunk's bbox)
+crop = crop_region(page_image, result.chunks[0].bbox, padding_percent=0.02)
+```
+
+### Question Answering
+
+```python
+from src.document_intelligence.tools import get_tool
+
+qa_tool = get_tool("answer_question")
+result = qa_tool.execute(
+    parse_result=parse_result,
+    question="What is the total amount due?",
+)
+
+if result.success:
+    print(f"Answer: {result.data['answer']}")
+    print(f"Confidence: {result.data['confidence']:.2f}")
+
+    for ev in result.evidence:
+        print(f"  Evidence: Page {ev['page']}, {ev['snippet'][:50]}...")
+```
+
+## Architecture
+
+### Module Structure
+
+```
+src/document_intelligence/
+├── __init__.py              # Main exports
+├── chunks/                  # Core data models
+│   ├── models.py            # BoundingBox, DocumentChunk, TableChunk, etc.
+│   └── __init__.py
+├── io/                      # Document loading
+│   ├── base.py              # Abstract interfaces
+│   ├── pdf.py               # PDF loading (PyMuPDF)
+│   ├── image.py             # Image loading (PIL)
+│   ├── cache.py             # Page caching
+│   └── __init__.py
+├── models/                  # Model interfaces
+│   ├── base.py              # BaseModel, BatchableModel
+│   ├── ocr.py               # OCRModel interface
+│   ├── layout.py            # LayoutModel interface
+│   ├── table.py             # TableModel interface
+│   ├── chart.py             # ChartModel interface
+│   ├── vlm.py               # VisionLanguageModel interface
+│   └── __init__.py
+├── parsing/                 # Document parsing
+│   ├── parser.py            # DocumentParser orchestrator
+│   ├── chunking.py          # Semantic chunking utilities
+│   └── __init__.py
+├── grounding/               # Visual evidence
+│   ├── evidence.py          # EvidenceBuilder, EvidenceTracker
+│   ├── crops.py             # Image cropping utilities
+│   └── __init__.py
+├── extraction/              # Field extraction
+│   ├── schema.py            # ExtractionSchema, FieldSpec
+│   ├── extractor.py         # FieldExtractor
+│   ├── validator.py         # ExtractionValidator
+│   └── __init__.py
+├── tools/                   # Agent tools
+│   ├── document_tools.py    # Tool implementations
+│   └── __init__.py
+├── validation/              # Result validation
+│   └── __init__.py
+└── agent_adapter.py         # Agent integration
+```
+
+### Data Models
+
+#### BoundingBox
+
+Represents a rectangular region in XYXY format:
+
+```python
+from src.document_intelligence.chunks import BoundingBox
+
+# Normalized coordinates (0-1)
+bbox = BoundingBox(
+    x_min=0.1, y_min=0.2,
+    x_max=0.9, y_max=0.3,
+    normalized=True
+)
+
+# Convert to pixels
+pixel_bbox = bbox.to_pixel(width=1000, height=800)
+
+# Calculate IoU
+overlap = bbox1.iou(bbox2)
+
+# Check containment
+is_inside = bbox.contains((0.5, 0.25))
+```
+
+#### DocumentChunk
+
+Base semantic chunk:
+
+```python
+from src.document_intelligence.chunks import DocumentChunk, ChunkType
+
+chunk = DocumentChunk(
+    chunk_id="abc123",
+    doc_id="doc001",
+    chunk_type=ChunkType.PARAGRAPH,
+    text="Content...",
+    page=1,
+    bbox=bbox,
+    confidence=0.95,
+    sequence_index=0,
+)
+```
+
+#### TableChunk
+
+Table with cell structure:
+
+```python
+from src.document_intelligence.chunks import TableChunk, TableCell
+
+# Access cells
+cell = table.get_cell(row=0, col=1)
+
+# Export formats
+csv_data = table.to_csv()
+markdown = table.to_markdown()
+json_data = table.to_structured_json()
+```
+
+#### EvidenceRef
+
+Links extractions to visual sources:
+
+```python
+from src.document_intelligence.chunks import EvidenceRef
+
+evidence = EvidenceRef(
+    chunk_id="chunk_001",
+    doc_id="doc_001",
+    page=1,
+    bbox=bbox,
+    source_type="text",
+    snippet="The total is $500",
+    confidence=0.9,
+    cell_id=None,    # For table cells
+    crop_path=None,  # Path to cropped image
+)
+```
+
+## CLI Commands
+
+```bash
+# Parse document
+sparknet docint parse document.pdf -o result.json
+sparknet docint parse document.pdf --format markdown
+
+# Extract fields
+sparknet docint extract invoice.pdf --preset invoice
+sparknet docint extract doc.pdf -f vendor_name -f total_amount
+sparknet docint extract doc.pdf --schema my_schema.json
+
+# Ask questions
+sparknet docint ask document.pdf "What is the contract value?"
+
+# Classify document
+sparknet docint classify document.pdf
+
+# Search content
+sparknet docint search document.pdf -q "payment terms"
+sparknet docint search document.pdf --type table
+
+# Visualize regions
+sparknet docint visualize document.pdf --page 1 --annotate
+```
+
+## Configuration
+
+### Parser Configuration
+
+```python
+from src.document_intelligence import ParserConfig
+
+config = ParserConfig(
+    # Rendering
+    render_dpi=200,        # DPI for page rasterization
+    max_pages=None,        # Limit pages (None = all)
+
+    # OCR
+    ocr_enabled=True,
+    ocr_languages=["en"],
+    ocr_min_confidence=0.5,
+
+    # Layout
+    layout_enabled=True,
+    reading_order_enabled=True,
+
+    # Specialized extraction
+    table_extraction_enabled=True,
+    chart_extraction_enabled=True,
+
+    # Chunking
+    merge_adjacent_text=True,
+    min_chunk_chars=10,
+    max_chunk_chars=4000,
+
+    # Output
+    include_markdown=True,
+    cache_enabled=True,
+)
+```
+
+### Extraction Configuration
+
+```python
+from src.document_intelligence import ExtractionConfig
+
+config = ExtractionConfig(
+    # Confidence
+    min_field_confidence=0.5,
+    min_overall_confidence=0.5,
+
+    # Abstention
+    abstain_on_low_confidence=True,
+    abstain_threshold=0.3,
+
+    # Search
+    search_all_chunks=True,
+    prefer_structured_sources=True,
+
+    # Validation
+    validate_extracted_values=True,
+    normalize_values=True,
+)
+```
+
+## Preset Schemas
+
+### Invoice
+
+```python
+from src.document_intelligence import create_invoice_schema
+
+schema = create_invoice_schema()
+# Fields: invoice_number, invoice_date, due_date, vendor_name, vendor_address,
+#         customer_name, customer_address, subtotal, tax_amount, total_amount,
+#         currency, payment_terms
+```
+
+### Receipt
+
+```python
+from src.document_intelligence import create_receipt_schema
+
+schema = create_receipt_schema()
+# Fields: merchant_name, merchant_address, transaction_date, transaction_time,
+#         subtotal, tax_amount, total_amount, payment_method, last_four_digits
+```
+
+### Contract
+
+```python
+from src.document_intelligence import create_contract_schema
+
+schema = create_contract_schema()
+# Fields: contract_title, effective_date, expiration_date, party_a_name,
+#         party_b_name, contract_value, governing_law, termination_clause
+```
+
+## Agent Integration
+
+```python
+from src.document_intelligence.agent_adapter import (
+    DocumentIntelligenceAdapter,
+    EnhancedDocumentAgent,
+    AgentConfig,
+)
+
+# Create adapter
+config = AgentConfig(
+    render_dpi=200,
+    min_confidence=0.5,
+    max_iterations=10,
+)
+
+# With existing LLM client
+agent = EnhancedDocumentAgent(
+    llm_client=ollama_client,
+    config=config,
+)
+
+# Load document
+await agent.load_document("document.pdf")
+
+# Extract with schema
+result = await agent.extract_fields(schema)
+
+# Answer questions
+answer, evidence = await agent.answer_question("What is the total?")
+
+# Classify
+classification = await agent.classify()
+```
+
+## Available Tools
+
+| Tool | Description |
+|------|-------------|
+| `parse_document` | Parse document into semantic chunks |
+| `extract_fields` | Schema-driven field extraction |
+| `search_chunks` | Search document content |
+| `get_chunk_details` | Get detailed chunk information |
+| `get_table_data` | Extract structured table data |
+| `answer_question` | Document Q&A |
+| `crop_region` | Extract visual regions |
+
+## Best Practices
+
+### 1. Always Check Confidence
+
+```python
+if extraction.overall_confidence < 0.7:
+    print("Low confidence - manual review recommended")
+
+for field, value in extraction.data.items():
+    if field in extraction.abstained_fields:
+        print(f"{field}: Needs manual verification")
+```
+
+### 2. Use Evidence for Verification
+
+```python
+for evidence in extraction.evidence:
+    print(f"Found on page {evidence.page}")
+    print(f"Location: {evidence.bbox.xyxy}")
+    print(f"Source text: {evidence.snippet}")
+```
+
+### 3. Handle Abstention Gracefully
+
+```python
+result = extractor.extract(parse_result, schema)
+
+for field in schema.get_required_fields():
+    if field.name in result.abstained_fields:
+        # Request human review
+        flag_for_review(field.name, parse_result.doc_id)
+```
+
+### 4. Validate Before Use
+
+```python
+from src.document_intelligence import ExtractionValidator
+
+validator = ExtractionValidator(min_confidence=0.7)
+validation = validator.validate(result, schema)
+
+if not validation.is_valid:
+    for issue in validation.issues:
+        print(f"[{issue.severity}] {issue.field_name}: {issue.message}")
+```
+
+## Dependencies
+
+- `pymupdf` - PDF loading and rendering
+- `pillow` - Image processing
+- `numpy` - Array operations
+- `pydantic` - Data validation
+
+Optional:
+- `paddleocr` - OCR engine
+- `tesseract` - Alternative OCR
+- `chromadb` - Vector storage for RAG
+
+## License
+
+MIT License - see LICENSE file for details.
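Pulling the snippets from this document together, an end-to-end pass (parse, extract, validate) looks roughly like this; it uses only names the document itself exports, so treat it as a sketch of the intended flow rather than a tested program:

```python
from src.document_intelligence import (
    DocumentParser, ParserConfig, FieldExtractor,
    ExtractionValidator, create_invoice_schema,
)

# Parse, extract against the invoice preset, then gate on validation.
parser = DocumentParser(config=ParserConfig(render_dpi=200, include_markdown=True))
parse_result = parser.parse("invoice.pdf")

schema = create_invoice_schema()
extraction = FieldExtractor().extract(parse_result, schema)

validation = ExtractionValidator(min_confidence=0.7).validate(extraction, schema)
if not validation.is_valid or extraction.abstained_fields:
    print("Manual review recommended")
```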
docs/SPARKNET_Progress_Report.py
ADDED
@@ -0,0 +1,1432 @@
+#!/usr/bin/env python3
+"""
+SPARKNET Progress Report & Future Work PDF Generator
+Generates a comprehensive stakeholder presentation document.
+"""
+
+from reportlab.lib import colors
+from reportlab.lib.pagesizes import A4, landscape
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.units import inch, cm
+from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY, TA_RIGHT
+from reportlab.platypus import (
+    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
+    PageBreak, Image, ListFlowable, ListItem, KeepTogether,
+    Flowable, HRFlowable
+)
+from reportlab.graphics.shapes import Drawing, Rect, String, Line, Polygon
+from reportlab.graphics.charts.barcharts import VerticalBarChart
+from reportlab.graphics.charts.piecharts import Pie
+from reportlab.graphics import renderPDF
+from reportlab.pdfgen import canvas
+from datetime import datetime
+import os
+
+# Color Scheme - Professional Blue Theme
+PRIMARY_BLUE = colors.HexColor('#1e3a5f')
+SECONDARY_BLUE = colors.HexColor('#2d5a87')
+ACCENT_BLUE = colors.HexColor('#4a90d9')
+LIGHT_BLUE = colors.HexColor('#e8f4fc')
+SUCCESS_GREEN = colors.HexColor('#28a745')
+WARNING_ORANGE = colors.HexColor('#fd7e14')
+DANGER_RED = colors.HexColor('#dc3545')
+GRAY_DARK = colors.HexColor('#343a40')
+GRAY_LIGHT = colors.HexColor('#f8f9fa')
+WHITE = colors.white
+
+
+class DiagramFlowable(Flowable):
+    """Custom flowable for drawing architecture diagrams."""
+
+    def __init__(self, width, height, diagram_type='architecture'):
+        Flowable.__init__(self)
+        self.width = width
+        self.height = height
+        self.diagram_type = diagram_type
+
+    def draw(self):
+        if self.diagram_type == 'architecture':
+            self._draw_architecture()
+        elif self.diagram_type == 'rag_pipeline':
+            self._draw_rag_pipeline()
+        elif self.diagram_type == 'document_pipeline':
+            self._draw_document_pipeline()
+        elif self.diagram_type == 'agent_interaction':
+            self._draw_agent_interaction()
+        elif self.diagram_type == 'data_flow':
+            self._draw_data_flow()
+
+    def _draw_box(self, x, y, w, h, text, fill_color, text_color=WHITE, font_size=9):
+        """Draw a rounded box with text."""
+        self.canv.setFillColor(fill_color)
+        self.canv.roundRect(x, y, w, h, 5, fill=1, stroke=0)
+        self.canv.setFillColor(text_color)
+        self.canv.setFont('Helvetica-Bold', font_size)
+        # Center text
+        text_width = self.canv.stringWidth(text, 'Helvetica-Bold', font_size)
+        self.canv.drawString(x + (w - text_width) / 2, y + h/2 - 3, text)
+
+    def _draw_arrow(self, x1, y1, x2, y2, color=GRAY_DARK):
+        """Draw an arrow from (x1,y1) to (x2,y2)."""
+        self.canv.setStrokeColor(color)
+        self.canv.setLineWidth(2)
+        self.canv.line(x1, y1, x2, y2)
+        # Arrow head
+        import math
+        angle = math.atan2(y2-y1, x2-x1)
+        arrow_len = 8
+        self.canv.line(x2, y2, x2 - arrow_len * math.cos(angle - 0.4), y2 - arrow_len * math.sin(angle - 0.4))
+        self.canv.line(x2, y2, x2 - arrow_len * math.cos(angle + 0.4), y2 - arrow_len * math.sin(angle + 0.4))
+
+    def _draw_architecture(self):
+        """Draw the high-level SPARKNET architecture."""
+        # Title
+        self.canv.setFillColor(PRIMARY_BLUE)
+        self.canv.setFont('Helvetica-Bold', 12)
+        self.canv.drawCentredString(self.width/2, self.height - 20, 'SPARKNET Architecture Overview')
+
+        # User Layer
+        self._draw_box(self.width/2 - 60, self.height - 70, 120, 35, 'User Interface', ACCENT_BLUE)
+
+        # Demo Layer
+        self.canv.setFillColor(LIGHT_BLUE)
+        self.canv.roundRect(30, self.height - 160, self.width - 60, 70, 8, fill=1, stroke=0)
+        self.canv.setFillColor(PRIMARY_BLUE)
+        self.canv.setFont('Helvetica-Bold', 10)
+        self.canv.drawString(40, self.height - 100, 'Streamlit Demo Application')
+
+        # Demo pages
+        pages = ['Live\nProcessing', 'Interactive\nRAG', 'Doc\nComparison', 'Evidence\nViewer', 'Doc\nViewer']
+        page_width = (self.width - 100) / 5
+        for i, page in enumerate(pages):
+            x = 45 + i * page_width
+            self._draw_box(x, self.height - 150, page_width - 10, 35, page.replace('\n', ' '), SECONDARY_BLUE, font_size=7)
+
+        # Arrow from UI to Demo
+        self._draw_arrow(self.width/2, self.height - 70, self.width/2, self.height - 90, ACCENT_BLUE)
+
+        # Core Services Layer
+        self.canv.setFillColor(LIGHT_BLUE)
+        self.canv.roundRect(30, self.height - 280, self.width - 60, 100, 8, fill=1, stroke=0)
+        self.canv.setFillColor(PRIMARY_BLUE)
+        self.canv.setFont('Helvetica-Bold', 10)
+        self.canv.drawString(40, self.height - 190, 'Core Services')
+
+        # Core boxes
+        self._draw_box(50, self.height - 230, 100, 30, 'Document Intel', PRIMARY_BLUE, font_size=8)
+        self._draw_box(170, self.height - 230, 100, 30, 'Multi-Agent RAG', PRIMARY_BLUE, font_size=8)
+        self._draw_box(290, self.height - 230, 100, 30, 'Vector Store', PRIMARY_BLUE, font_size=8)
+        self._draw_box(410, self.height - 230, 80, 30, 'LLM Layer', PRIMARY_BLUE, font_size=8)
+
+        # Sub-components
+        self._draw_box(50, self.height - 270, 100, 30, 'OCR + Layout', SECONDARY_BLUE, font_size=7)
+        self._draw_box(170, self.height - 270, 100, 30, '5 Agents', SECONDARY_BLUE, font_size=7)
+        self._draw_box(290, self.height - 270, 100, 30, 'ChromaDB', SECONDARY_BLUE, font_size=7)
+        self._draw_box(410, self.height - 270, 80, 30, 'Ollama', SECONDARY_BLUE, font_size=7)
+
+        # Arrow from Demo to Core
+        self._draw_arrow(self.width/2, self.height - 160, self.width/2, self.height - 180, ACCENT_BLUE)
+
+        # Storage Layer
+        self.canv.setFillColor(GRAY_LIGHT)
+        self.canv.roundRect(30, self.height - 340, self.width - 60, 45, 8, fill=1, stroke=0)
+        self.canv.setFillColor(GRAY_DARK)
+        self.canv.setFont('Helvetica-Bold', 10)
+        self.canv.drawString(40, self.height - 310, 'Persistent Storage')
+
+        self._draw_box(150, self.height - 335, 80, 25, 'Embeddings', GRAY_DARK, font_size=7)
+        self._draw_box(250, self.height - 335, 80, 25, 'Documents', GRAY_DARK, font_size=7)
+        self._draw_box(350, self.height - 335, 80, 25, 'Cache', GRAY_DARK, font_size=7)
+
+        # Arrow
+        self._draw_arrow(self.width/2, self.height - 280, self.width/2, self.height - 295, GRAY_DARK)
+
+    def _draw_rag_pipeline(self):
+        """Draw the Multi-Agent RAG Pipeline."""
+        self.canv.setFillColor(PRIMARY_BLUE)
+        self.canv.setFont('Helvetica-Bold', 12)
+        self.canv.drawCentredString(self.width/2, self.height - 20, 'Multi-Agent RAG Pipeline')
+
+        # Query input
+        self._draw_box(20, self.height - 70, 80, 30, 'User Query', ACCENT_BLUE, font_size=8)
+
+        # Agents in sequence
+        agents = [
+            ('QueryPlanner', PRIMARY_BLUE, 'Intent Classification\nQuery Decomposition'),
+            ('Retriever', SECONDARY_BLUE, 'Hybrid Search\nDense + Sparse'),
+            ('Reranker', SECONDARY_BLUE, 'Cross-Encoder\nMMR Diversity'),
+            ('Synthesizer', PRIMARY_BLUE, 'Answer Generation\nCitation Tracking'),
+            ('Critic', WARNING_ORANGE, 'Hallucination Check\nValidation'),
+        ]
+
+        x_start = 120
+        box_width = 80
+        spacing = 10
+
+        for i, (name, color, desc) in enumerate(agents):
+            x = x_start + i * (box_width + spacing)
+            self._draw_box(x, self.height - 70, box_width, 30, name, color, font_size=7)
+            # Description below
+            self.canv.setFillColor(GRAY_DARK)
+            self.canv.setFont('Helvetica', 6)
+            lines = desc.split('\n')
+            for j, line in enumerate(lines):
+                self.canv.drawCentredString(x + box_width/2, self.height - 85 - j*8, line)
+
+            # Arrow to next
+            if i < len(agents) - 1:
+                self._draw_arrow(x + box_width, self.height - 55, x + box_width + spacing, self.height - 55, GRAY_DARK)
+
+        # Arrow from query to first agent
+        self._draw_arrow(100, self.height - 55, 120, self.height - 55, ACCENT_BLUE)
+
+        # Revision loop
+        self.canv.setStrokeColor(WARNING_ORANGE)
+        self.canv.setLineWidth(1.5)
+        self.canv.setDash(3, 3)
+        # Draw curved line for revision
+        critic_x = x_start + 4 * (box_width + spacing) + box_width
+        synth_x = x_start + 3 * (box_width + spacing)
+        self.canv.line(critic_x - 40, self.height - 100, synth_x + 40, self.height - 100)
+        self.canv.setDash()
+
+        self.canv.setFillColor(WARNING_ORANGE)
+        self.canv.setFont('Helvetica-Oblique', 7)
|
| 195 |
+
self.canv.drawCentredString((critic_x + synth_x)/2, self.height - 115, 'Revision Loop (if validation fails)')
|
| 196 |
+
|
| 197 |
+
# Final output
|
| 198 |
+
self._draw_box(critic_x + 20, self.height - 70, 80, 30, 'Response', SUCCESS_GREEN, font_size=8)
|
| 199 |
+
self._draw_arrow(critic_x, self.height - 55, critic_x + 20, self.height - 55, SUCCESS_GREEN)
|
| 200 |
+
|
| 201 |
+
# State tracking bar
|
| 202 |
+
self.canv.setFillColor(LIGHT_BLUE)
|
| 203 |
+
self.canv.roundRect(20, self.height - 160, self.width - 40, 35, 5, fill=1, stroke=0)
|
| 204 |
+
self.canv.setFillColor(PRIMARY_BLUE)
|
| 205 |
+
self.canv.setFont('Helvetica-Bold', 8)
|
| 206 |
+
self.canv.drawString(30, self.height - 145, 'RAGState: Query → Plan → Retrieved Chunks → Reranked → Answer → Validation → Citations')
|
| 207 |
+
|
| 208 |
+
def _draw_document_pipeline(self):
|
| 209 |
+
"""Draw Document Processing Pipeline."""
|
| 210 |
+
self.canv.setFillColor(PRIMARY_BLUE)
|
| 211 |
+
self.canv.setFont('Helvetica-Bold', 12)
|
| 212 |
+
self.canv.drawCentredString(self.width/2, self.height - 20, 'Document Processing Pipeline')
|
| 213 |
+
|
| 214 |
+
stages = [
|
| 215 |
+
('Input', 'PDF/Image\nUpload', ACCENT_BLUE),
|
| 216 |
+
('OCR', 'PaddleOCR\nTesseract', PRIMARY_BLUE),
|
| 217 |
+
('Layout', 'Region\nDetection', PRIMARY_BLUE),
|
| 218 |
+
('Reading\nOrder', 'Sequence\nReconstruction', SECONDARY_BLUE),
|
| 219 |
+
('Chunking', 'Semantic\nSplitting', SECONDARY_BLUE),
|
| 220 |
+
('Indexing', 'ChromaDB\nEmbedding', SUCCESS_GREEN),
|
| 221 |
+
]
|
| 222 |
+
|
| 223 |
+
box_width = 70
|
| 224 |
+
box_height = 45
|
| 225 |
+
spacing = 15
|
| 226 |
+
total_width = len(stages) * box_width + (len(stages) - 1) * spacing
|
| 227 |
+
x_start = (self.width - total_width) / 2
|
| 228 |
+
y_pos = self.height - 90
|
| 229 |
+
|
| 230 |
+
for i, (name, desc, color) in enumerate(stages):
|
| 231 |
+
x = x_start + i * (box_width + spacing)
|
| 232 |
+
# Main box
|
| 233 |
+
self._draw_box(x, y_pos, box_width, box_height, name.replace('\n', ' '), color, font_size=8)
|
| 234 |
+
# Description
|
| 235 |
+
self.canv.setFillColor(GRAY_DARK)
|
| 236 |
+
self.canv.setFont('Helvetica', 6)
|
| 237 |
+
lines = desc.split('\n')
|
| 238 |
+
for j, line in enumerate(lines):
|
| 239 |
+
self.canv.drawCentredString(x + box_width/2, y_pos - 15 - j*8, line)
|
| 240 |
+
|
| 241 |
+
# Arrow
|
| 242 |
+
if i < len(stages) - 1:
|
| 243 |
+
self._draw_arrow(x + box_width, y_pos + box_height/2, x + box_width + spacing, y_pos + box_height/2)
|
| 244 |
+
|
| 245 |
+
# Output description
|
| 246 |
+
self.canv.setFillColor(PRIMARY_BLUE)
|
| 247 |
+
self.canv.setFont('Helvetica-Bold', 9)
|
| 248 |
+
self.canv.drawCentredString(self.width/2, self.height - 160, 'Output: ProcessedDocument with chunks, OCR regions, layout data, bounding boxes')
|
| 249 |
+
|
| 250 |
+
def _draw_agent_interaction(self):
|
| 251 |
+
"""Draw Agent Interaction Diagram."""
|
| 252 |
+
self.canv.setFillColor(PRIMARY_BLUE)
|
| 253 |
+
self.canv.setFont('Helvetica-Bold', 12)
|
| 254 |
+
self.canv.drawCentredString(self.width/2, self.height - 20, 'Agent Interaction & Data Flow')
|
| 255 |
+
|
| 256 |
+
# Central orchestrator
|
| 257 |
+
center_x, center_y = self.width/2, self.height/2 - 20
|
| 258 |
+
self._draw_box(center_x - 50, center_y - 20, 100, 40, 'Orchestrator', PRIMARY_BLUE, font_size=9)
|
| 259 |
+
|
| 260 |
+
# Surrounding agents
|
| 261 |
+
import math
|
| 262 |
+
agents = [
|
| 263 |
+
('QueryPlanner', -120, 60),
|
| 264 |
+
('Retriever', 0, 90),
|
| 265 |
+
('Reranker', 120, 60),
|
| 266 |
+
('Synthesizer', 120, -60),
|
| 267 |
+
('Critic', 0, -90),
|
| 268 |
+
]
|
| 269 |
+
|
| 270 |
+
for name, dx, dy in agents:
|
| 271 |
+
x = center_x + dx - 45
|
| 272 |
+
y = center_y + dy - 15
|
| 273 |
+
self._draw_box(x, y, 90, 30, name, SECONDARY_BLUE, font_size=8)
|
| 274 |
+
# Arrow to/from orchestrator
|
| 275 |
+
if dy > 0:
|
| 276 |
+
self._draw_arrow(center_x, center_y + 20, center_x + dx*0.3, center_y + dy - 15, ACCENT_BLUE)
|
| 277 |
+
else:
|
| 278 |
+
self._draw_arrow(center_x + dx*0.3, center_y + dy + 15, center_x, center_y - 20, ACCENT_BLUE)
|
| 279 |
+
|
| 280 |
+
# External connections
|
| 281 |
+
# Vector Store
|
| 282 |
+
self._draw_box(30, center_y - 15, 70, 30, 'ChromaDB', SUCCESS_GREEN, font_size=8)
|
| 283 |
+
self._draw_arrow(100, center_y, center_x - 50, center_y, SUCCESS_GREEN)
|
| 284 |
+
|
| 285 |
+
# LLM
|
| 286 |
+
self._draw_box(self.width - 100, center_y - 15, 70, 30, 'Ollama LLM', WARNING_ORANGE, font_size=8)
|
| 287 |
+
self._draw_arrow(self.width - 100, center_y, center_x + 50, center_y, WARNING_ORANGE)
|
| 288 |
+
|
| 289 |
+
def _draw_data_flow(self):
|
| 290 |
+
"""Draw Data Flow Diagram."""
|
| 291 |
+
self.canv.setFillColor(PRIMARY_BLUE)
|
| 292 |
+
self.canv.setFont('Helvetica-Bold', 12)
|
| 293 |
+
self.canv.drawCentredString(self.width/2, self.height - 20, 'End-to-End Data Flow')
|
| 294 |
+
|
| 295 |
+
# Vertical flow
|
| 296 |
+
items = [
|
| 297 |
+
('Document Upload', ACCENT_BLUE, 'PDF, Images, Text files'),
|
| 298 |
+
('Document Processor', PRIMARY_BLUE, 'OCR → Layout → Chunking'),
|
| 299 |
+
('State Manager', SECONDARY_BLUE, 'ProcessedDocument storage'),
|
| 300 |
+
('Embedder', SECONDARY_BLUE, 'mxbai-embed-large (1024d)'),
|
| 301 |
+
('ChromaDB', SUCCESS_GREEN, 'Vector indexing & storage'),
|
| 302 |
+
('RAG Query', WARNING_ORANGE, 'User question processing'),
|
| 303 |
+
('Multi-Agent Pipeline', PRIMARY_BLUE, '5-agent collaboration'),
|
| 304 |
+
('Response', SUCCESS_GREEN, 'Answer with citations'),
|
| 305 |
+
]
|
| 306 |
+
|
| 307 |
+
box_height = 28
|
| 308 |
+
spacing = 8
|
| 309 |
+
total_height = len(items) * box_height + (len(items) - 1) * spacing
|
| 310 |
+
y_start = self.height - 50
|
| 311 |
+
box_width = 160
|
| 312 |
+
x_center = self.width / 2 - box_width / 2
|
| 313 |
+
|
| 314 |
+
for i, (name, color, desc) in enumerate(items):
|
| 315 |
+
y = y_start - i * (box_height + spacing)
|
| 316 |
+
self._draw_box(x_center, y - box_height, box_width, box_height, name, color, font_size=8)
|
| 317 |
+
# Description on right
|
| 318 |
+
self.canv.setFillColor(GRAY_DARK)
|
| 319 |
+
self.canv.setFont('Helvetica', 7)
|
| 320 |
+
self.canv.drawString(x_center + box_width + 15, y - box_height/2 - 3, desc)
|
| 321 |
+
|
| 322 |
+
# Arrow
|
| 323 |
+
if i < len(items) - 1:
|
| 324 |
+
self._draw_arrow(x_center + box_width/2, y - box_height, x_center + box_width/2, y - box_height - spacing + 2)
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def create_styles():
|
| 328 |
+
"""Create custom paragraph styles."""
|
| 329 |
+
styles = getSampleStyleSheet()
|
| 330 |
+
|
| 331 |
+
# Title style
|
| 332 |
+
styles.add(ParagraphStyle(
|
| 333 |
+
name='MainTitle',
|
| 334 |
+
parent=styles['Title'],
|
| 335 |
+
fontSize=28,
|
| 336 |
+
textColor=PRIMARY_BLUE,
|
| 337 |
+
spaceAfter=30,
|
| 338 |
+
alignment=TA_CENTER,
|
| 339 |
+
fontName='Helvetica-Bold'
|
| 340 |
+
))
|
| 341 |
+
|
| 342 |
+
# Subtitle
|
| 343 |
+
styles.add(ParagraphStyle(
|
| 344 |
+
name='Subtitle',
|
| 345 |
+
parent=styles['Normal'],
|
| 346 |
+
fontSize=16,
|
| 347 |
+
textColor=SECONDARY_BLUE,
|
| 348 |
+
spaceAfter=20,
|
| 349 |
+
alignment=TA_CENTER,
|
| 350 |
+
fontName='Helvetica'
|
| 351 |
+
))
|
| 352 |
+
|
| 353 |
+
# Section Header
|
| 354 |
+
styles.add(ParagraphStyle(
|
| 355 |
+
name='SectionHeader',
|
| 356 |
+
parent=styles['Heading1'],
|
| 357 |
+
fontSize=18,
|
| 358 |
+
textColor=PRIMARY_BLUE,
|
| 359 |
+
spaceBefore=25,
|
| 360 |
+
spaceAfter=15,
|
| 361 |
+
fontName='Helvetica-Bold',
|
| 362 |
+
borderColor=ACCENT_BLUE,
|
| 363 |
+
borderWidth=2,
|
| 364 |
+
borderPadding=5,
|
| 365 |
+
))
|
| 366 |
+
|
| 367 |
+
# Subsection Header
|
| 368 |
+
styles.add(ParagraphStyle(
|
| 369 |
+
name='SubsectionHeader',
|
| 370 |
+
parent=styles['Heading2'],
|
| 371 |
+
fontSize=14,
|
| 372 |
+
textColor=SECONDARY_BLUE,
|
| 373 |
+
spaceBefore=15,
|
| 374 |
+
spaceAfter=10,
|
| 375 |
+
fontName='Helvetica-Bold'
|
| 376 |
+
))
|
| 377 |
+
|
| 378 |
+
# Body text
|
| 379 |
+
styles.add(ParagraphStyle(
|
| 380 |
+
name='CustomBody',
|
| 381 |
+
parent=styles['Normal'],
|
| 382 |
+
fontSize=10,
|
| 383 |
+
textColor=GRAY_DARK,
|
| 384 |
+
spaceAfter=8,
|
| 385 |
+
alignment=TA_JUSTIFY,
|
| 386 |
+
leading=14
|
| 387 |
+
))
|
| 388 |
+
|
| 389 |
+
# Bullet style
|
| 390 |
+
styles.add(ParagraphStyle(
|
| 391 |
+
name='BulletText',
|
| 392 |
+
parent=styles['Normal'],
|
| 393 |
+
fontSize=10,
|
| 394 |
+
textColor=GRAY_DARK,
|
| 395 |
+
leftIndent=20,
|
| 396 |
+
spaceAfter=5,
|
| 397 |
+
leading=13
|
| 398 |
+
))
|
| 399 |
+
|
| 400 |
+
# Caption
|
| 401 |
+
styles.add(ParagraphStyle(
|
| 402 |
+
name='Caption',
|
| 403 |
+
parent=styles['Normal'],
|
| 404 |
+
fontSize=9,
|
| 405 |
+
textColor=GRAY_DARK,
|
| 406 |
+
alignment=TA_CENTER,
|
| 407 |
+
spaceAfter=15,
|
| 408 |
+
fontName='Helvetica-Oblique'
|
| 409 |
+
))
|
| 410 |
+
|
| 411 |
+
# Highlight box text
|
| 412 |
+
styles.add(ParagraphStyle(
|
| 413 |
+
name='HighlightText',
|
| 414 |
+
parent=styles['Normal'],
|
| 415 |
+
fontSize=10,
|
| 416 |
+
textColor=PRIMARY_BLUE,
|
| 417 |
+
spaceAfter=5,
|
| 418 |
+
fontName='Helvetica-Bold'
|
| 419 |
+
))
|
| 420 |
+
|
| 421 |
+
return styles
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
def create_highlight_box(text, styles, color=LIGHT_BLUE):
|
| 425 |
+
"""Create a highlighted text box."""
|
| 426 |
+
data = [[Paragraph(text, styles['HighlightText'])]]
|
| 427 |
+
table = Table(data, colWidths=[450])
|
| 428 |
+
table.setStyle(TableStyle([
|
| 429 |
+
('BACKGROUND', (0, 0), (-1, -1), color),
|
| 430 |
+
('BOX', (0, 0), (-1, -1), 1, ACCENT_BLUE),
|
| 431 |
+
('PADDING', (0, 0), (-1, -1), 12),
|
| 432 |
+
('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
|
| 433 |
+
]))
|
| 434 |
+
return table
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def create_status_table(items, styles):
|
| 438 |
+
"""Create a status table with colored indicators."""
|
| 439 |
+
data = [['Component', 'Status', 'Completion']]
|
| 440 |
+
for item, status, completion in items:
|
| 441 |
+
if status == 'Complete':
|
| 442 |
+
status_color = SUCCESS_GREEN
|
| 443 |
+
elif status == 'In Progress':
|
| 444 |
+
status_color = WARNING_ORANGE
|
| 445 |
+
else:
|
| 446 |
+
status_color = DANGER_RED
|
| 447 |
+
data.append([item, status, completion])
|
| 448 |
+
|
| 449 |
+
table = Table(data, colWidths=[250, 100, 100])
|
| 450 |
+
table.setStyle(TableStyle([
|
| 451 |
+
('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
|
| 452 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 453 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 454 |
+
('FONTSIZE', (0, 0), (-1, -1), 10),
|
| 455 |
+
('ALIGN', (1, 0), (-1, -1), 'CENTER'),
|
| 456 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 457 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 458 |
+
('PADDING', (0, 0), (-1, -1), 8),
|
| 459 |
+
]))
|
| 460 |
+
return table
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def create_metrics_table(metrics, styles):
|
| 464 |
+
"""Create a metrics display table."""
|
| 465 |
+
data = []
|
| 466 |
+
for metric, value, change in metrics:
|
| 467 |
+
data.append([metric, value, change])
|
| 468 |
+
|
| 469 |
+
table = Table(data, colWidths=[200, 150, 100])
|
| 470 |
+
table.setStyle(TableStyle([
|
| 471 |
+
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
| 472 |
+
('FONTSIZE', (0, 0), (-1, -1), 11),
|
| 473 |
+
('TEXTCOLOR', (1, 0), (1, -1), PRIMARY_BLUE),
|
| 474 |
+
('ALIGN', (1, 0), (-1, -1), 'CENTER'),
|
| 475 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 476 |
+
('PADDING', (0, 0), (-1, -1), 10),
|
| 477 |
+
('ROWBACKGROUNDS', (0, 0), (-1, -1), [LIGHT_BLUE, WHITE]),
|
| 478 |
+
]))
|
| 479 |
+
return table
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
def generate_report():
|
| 483 |
+
"""Generate the complete SPARKNET progress report PDF."""
|
| 484 |
+
|
| 485 |
+
filename = '/home/mhamdan/SPARKNET/docs/SPARKNET_Progress_Report.pdf'
|
| 486 |
+
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
| 487 |
+
|
| 488 |
+
doc = SimpleDocTemplate(
|
| 489 |
+
filename,
|
| 490 |
+
pagesize=A4,
|
| 491 |
+
rightMargin=50,
|
| 492 |
+
leftMargin=50,
|
| 493 |
+
topMargin=60,
|
| 494 |
+
bottomMargin=60
|
| 495 |
+
)
|
| 496 |
+
|
| 497 |
+
styles = create_styles()
|
| 498 |
+
story = []
|
| 499 |
+
|
| 500 |
+
# ========== TITLE PAGE ==========
|
| 501 |
+
story.append(Spacer(1, 100))
|
| 502 |
+
story.append(Paragraph('SPARKNET', styles['MainTitle']))
|
| 503 |
+
story.append(Paragraph('Multi-Agentic Document Intelligence Framework', styles['Subtitle']))
|
| 504 |
+
story.append(Spacer(1, 30))
|
| 505 |
+
story.append(Paragraph('Progress Report & Future Roadmap', styles['Subtitle']))
|
| 506 |
+
story.append(Spacer(1, 50))
|
| 507 |
+
|
| 508 |
+
# Version info box
|
| 509 |
+
version_data = [
|
| 510 |
+
['Version', '1.0.0-beta'],
|
| 511 |
+
['Report Date', datetime.now().strftime('%B %d, %Y')],
|
| 512 |
+
['Document Type', 'Stakeholder Progress Report'],
|
| 513 |
+
['Classification', 'Internal / Confidential'],
|
| 514 |
+
]
|
| 515 |
+
version_table = Table(version_data, colWidths=[150, 200])
|
| 516 |
+
version_table.setStyle(TableStyle([
|
| 517 |
+
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
| 518 |
+
('FONTSIZE', (0, 0), (-1, -1), 10),
|
| 519 |
+
('TEXTCOLOR', (0, 0), (-1, -1), GRAY_DARK),
|
| 520 |
+
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
| 521 |
+
('GRID', (0, 0), (-1, -1), 0.5, ACCENT_BLUE),
|
| 522 |
+
('PADDING', (0, 0), (-1, -1), 8),
|
| 523 |
+
('BACKGROUND', (0, 0), (-1, -1), LIGHT_BLUE),
|
| 524 |
+
]))
|
| 525 |
+
story.append(version_table)
|
| 526 |
+
|
| 527 |
+
story.append(PageBreak())
|
| 528 |
+
|
| 529 |
+
# ========== TABLE OF CONTENTS ==========
|
| 530 |
+
story.append(Paragraph('Table of Contents', styles['SectionHeader']))
|
| 531 |
+
story.append(Spacer(1, 20))
|
| 532 |
+
|
| 533 |
+
toc_items = [
|
| 534 |
+
('1. Executive Summary', '3'),
|
| 535 |
+
('2. Project Overview', '4'),
|
| 536 |
+
('3. Technical Architecture', '5'),
|
| 537 |
+
('4. Component Deep Dive', '8'),
|
| 538 |
+
('5. Current Progress & Achievements', '12'),
|
| 539 |
+
('6. Gap Analysis', '14'),
|
| 540 |
+
('7. Future Work & Roadmap', '17'),
|
| 541 |
+
('8. Risk Assessment', '20'),
|
| 542 |
+
('9. Resource Requirements', '21'),
|
| 543 |
+
('10. Conclusion & Recommendations', '22'),
|
| 544 |
+
]
|
| 545 |
+
|
| 546 |
+
toc_data = [[Paragraph(f'<b>{item}</b>', styles['CustomBody']), page] for item, page in toc_items]
|
| 547 |
+
toc_table = Table(toc_data, colWidths=[400, 50])
|
| 548 |
+
toc_table.setStyle(TableStyle([
|
| 549 |
+
('FONTSIZE', (0, 0), (-1, -1), 11),
|
| 550 |
+
('ALIGN', (1, 0), (1, -1), 'RIGHT'),
|
| 551 |
+
('BOTTOMPADDING', (0, 0), (-1, -1), 8),
|
| 552 |
+
('LINEBELOW', (0, 0), (-1, -2), 0.5, colors.lightgrey),
|
| 553 |
+
]))
|
| 554 |
+
story.append(toc_table)
|
| 555 |
+
|
| 556 |
+
story.append(PageBreak())
|
| 557 |
+
|
| 558 |
+
# ========== 1. EXECUTIVE SUMMARY ==========
|
| 559 |
+
story.append(Paragraph('1. Executive Summary', styles['SectionHeader']))
|
| 560 |
+
|
| 561 |
+
story.append(Paragraph(
|
| 562 |
+
'''SPARKNET represents a next-generation document intelligence platform that combines
|
| 563 |
+
advanced OCR capabilities, sophisticated layout analysis, and a state-of-the-art
|
| 564 |
+
Multi-Agent Retrieval-Augmented Generation (RAG) system. This report provides a
|
| 565 |
+
comprehensive overview of the project's current state, technical achievements,
|
| 566 |
+
identified gaps, and the strategic roadmap for future development.''',
|
| 567 |
+
styles['CustomBody']
|
| 568 |
+
))
|
| 569 |
+
|
| 570 |
+
story.append(Spacer(1, 15))
|
| 571 |
+
story.append(Paragraph('<b>Key Highlights</b>', styles['SubsectionHeader']))
|
| 572 |
+
|
| 573 |
+
highlights = [
|
| 574 |
+
'<b>Multi-Agent RAG Architecture:</b> Successfully implemented a 5-agent pipeline (QueryPlanner, Retriever, Reranker, Synthesizer, Critic) with self-correction capabilities.',
|
| 575 |
+
'<b>Document Processing Pipeline:</b> Complete end-to-end document processing with OCR, layout detection, and semantic chunking.',
|
| 576 |
+
'<b>Production-Ready Demo:</b> Fully functional Streamlit application with 5 interactive modules for document intelligence workflows.',
|
| 577 |
+
'<b>Hallucination Detection:</b> Built-in validation and criticism system to ensure factual accuracy of generated responses.',
|
| 578 |
+
'<b>Unified State Management:</b> Cross-module communication enabling seamless user experience across all application components.',
|
| 579 |
+
]
|
| 580 |
+
|
| 581 |
+
for h in highlights:
|
| 582 |
+
story.append(Paragraph(f'• {h}', styles['BulletText']))
|
| 583 |
+
|
| 584 |
+
story.append(Spacer(1, 20))
|
| 585 |
+
|
| 586 |
+
# Key Metrics
|
| 587 |
+
story.append(Paragraph('<b>Current System Metrics</b>', styles['SubsectionHeader']))
|
| 588 |
+
metrics = [
|
| 589 |
+
('RAG Pipeline Agents', '5 Specialized Agents', '✓ Complete'),
|
| 590 |
+
('Document Formats Supported', 'PDF, Images', '2 formats'),
|
| 591 |
+
('Vector Dimensions', '1024 (mxbai-embed-large)', 'Production'),
|
| 592 |
+
('Demo Application Pages', '5 Interactive Modules', '✓ Complete'),
|
| 593 |
+
('LLM Integration', 'Ollama (Local)', 'Self-hosted'),
|
| 594 |
+
]
|
| 595 |
+
story.append(create_metrics_table(metrics, styles))
|
| 596 |
+
|
| 597 |
+
story.append(PageBreak())
|
| 598 |
+
|
| 599 |
+
# ========== 2. PROJECT OVERVIEW ==========
|
| 600 |
+
story.append(Paragraph('2. Project Overview', styles['SectionHeader']))
|
| 601 |
+
|
| 602 |
+
story.append(Paragraph('<b>2.1 Vision & Objectives</b>', styles['SubsectionHeader']))
|
| 603 |
+
story.append(Paragraph(
|
| 604 |
+
'''SPARKNET aims to revolutionize document intelligence by providing an integrated
|
| 605 |
+
platform that can understand, process, and intelligently query complex documents.
|
| 606 |
+
The system leverages cutting-edge AI techniques including multi-agent collaboration,
|
| 607 |
+
hybrid retrieval, and sophisticated answer synthesis with built-in validation.''',
|
| 608 |
+
styles['CustomBody']
|
| 609 |
+
))
|
| 610 |
+
|
| 611 |
+
story.append(Spacer(1, 10))
|
| 612 |
+
story.append(Paragraph('<b>Core Objectives:</b>', styles['CustomBody']))
|
| 613 |
+
|
| 614 |
+
objectives = [
|
| 615 |
+
'<b>Intelligent Document Understanding:</b> Extract and structure information from diverse document formats with high accuracy.',
|
| 616 |
+
'<b>Conversational Intelligence:</b> Enable natural language querying over document collections with citation-backed responses.',
|
| 617 |
+
'<b>Reliability & Trust:</b> Implement hallucination detection and self-correction to ensure factual accuracy.',
|
| 618 |
+
'<b>Scalability:</b> Design for enterprise-scale document processing and retrieval workloads.',
|
| 619 |
+
'<b>Extensibility:</b> Modular architecture allowing easy integration of new capabilities and models.',
|
| 620 |
+
]
|
| 621 |
+
|
| 622 |
+
for obj in objectives:
|
| 623 |
+
story.append(Paragraph(f'• {obj}', styles['BulletText']))
|
| 624 |
+
|
| 625 |
+
story.append(Spacer(1, 15))
|
| 626 |
+
story.append(Paragraph('<b>2.2 Target Use Cases</b>', styles['SubsectionHeader']))
|
| 627 |
+
|
| 628 |
+
use_cases = [
|
| 629 |
+
['Use Case', 'Description', 'Status'],
|
| 630 |
+
['Legal Document Analysis', 'Contract review, clause extraction, compliance checking', 'Supported'],
|
| 631 |
+
['Research Paper Synthesis', 'Multi-paper querying, citation tracking, summary generation', 'Supported'],
|
| 632 |
+
['Technical Documentation', 'API docs, manuals, knowledge base querying', 'Supported'],
|
| 633 |
+
['Financial Reports', 'Annual reports, SEC filings, financial data extraction', 'Planned'],
|
| 634 |
+
['Medical Records', 'Clinical notes, diagnostic reports (HIPAA compliance needed)', 'Future'],
|
| 635 |
+
]
|
| 636 |
+
|
| 637 |
+
uc_table = Table(use_cases, colWidths=[130, 230, 90])
|
| 638 |
+
uc_table.setStyle(TableStyle([
|
| 639 |
+
('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
|
| 640 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 641 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 642 |
+
('FONTSIZE', (0, 0), (-1, -1), 9),
|
| 643 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 644 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 645 |
+
('PADDING', (0, 0), (-1, -1), 6),
|
| 646 |
+
('ALIGN', (2, 0), (2, -1), 'CENTER'),
|
| 647 |
+
]))
|
| 648 |
+
story.append(uc_table)
|
| 649 |
+
|
| 650 |
+
story.append(PageBreak())
|
| 651 |
+
|
| 652 |
+
# ========== 3. TECHNICAL ARCHITECTURE ==========
|
| 653 |
+
story.append(Paragraph('3. Technical Architecture', styles['SectionHeader']))
|
| 654 |
+
|
| 655 |
+
story.append(Paragraph('<b>3.1 High-Level Architecture</b>', styles['SubsectionHeader']))
|
| 656 |
+
story.append(Paragraph(
|
| 657 |
+
'''SPARKNET follows a layered microservices-inspired architecture with clear separation
|
| 658 |
+
of concerns. The system is organized into presentation, service, and persistence layers,
|
| 659 |
+
with a central orchestration mechanism coordinating multi-agent workflows.''',
|
| 660 |
+
styles['CustomBody']
|
| 661 |
+
))
|
| 662 |
+
|
| 663 |
+
story.append(Spacer(1, 10))
|
| 664 |
+
|
| 665 |
+
# Architecture Diagram
|
| 666 |
+
arch_diagram = DiagramFlowable(500, 350, 'architecture')
|
| 667 |
+
story.append(arch_diagram)
|
| 668 |
+
story.append(Paragraph('Figure 1: SPARKNET High-Level Architecture', styles['Caption']))
|
| 669 |
+
|
| 670 |
+
story.append(Spacer(1, 15))
|
| 671 |
+
story.append(Paragraph('<b>3.2 Multi-Agent RAG Pipeline</b>', styles['SubsectionHeader']))
|
| 672 |
+
story.append(Paragraph(
|
| 673 |
+
'''The heart of SPARKNET is its Multi-Agent RAG system, which orchestrates five
|
| 674 |
+
specialized agents in a sophisticated pipeline with self-correction capabilities.''',
|
| 675 |
+
styles['CustomBody']
|
| 676 |
+
))
|
| 677 |
+
|
| 678 |
+
story.append(Spacer(1, 10))
|
| 679 |
+
|
| 680 |
+
# RAG Pipeline Diagram
|
| 681 |
+
rag_diagram = DiagramFlowable(500, 180, 'rag_pipeline')
|
| 682 |
+
story.append(rag_diagram)
|
| 683 |
+
story.append(Paragraph('Figure 2: Multi-Agent RAG Pipeline with Revision Loop', styles['Caption']))
|
| 684 |
+
|
| 685 |
+
story.append(PageBreak())
|
| 686 |
+
|
| 687 |
+
story.append(Paragraph('<b>3.3 Document Processing Pipeline</b>', styles['SubsectionHeader']))
|
| 688 |
+
story.append(Paragraph(
|
| 689 |
+
'''Documents undergo a multi-stage processing pipeline that extracts text, identifies
|
| 690 |
+
layout structure, establishes reading order, and creates semantically coherent chunks
|
| 691 |
+
optimized for retrieval.''',
|
| 692 |
+
styles['CustomBody']
|
| 693 |
+
))
|
| 694 |
+
|
| 695 |
+
story.append(Spacer(1, 10))
|
| 696 |
+
|
| 697 |
+
# Document Pipeline Diagram
|
| 698 |
+
doc_diagram = DiagramFlowable(500, 180, 'document_pipeline')
|
| 699 |
+
story.append(doc_diagram)
|
| 700 |
+
story.append(Paragraph('Figure 3: Document Processing Pipeline', styles['Caption']))
|
| 701 |
+
|
| 702 |
+
story.append(Spacer(1, 15))
|
| 703 |
+
story.append(Paragraph('<b>3.4 Agent Interaction Model</b>', styles['SubsectionHeader']))
|
| 704 |
+
story.append(Paragraph(
|
| 705 |
+
'''The orchestrator coordinates all agents, managing state transitions and ensuring
|
| 706 |
+
proper data flow between components. External services (Vector Store, LLM) are
|
| 707 |
+
accessed through well-defined interfaces.''',
|
| 708 |
+
styles['CustomBody']
|
| 709 |
+
))
|
| 710 |
+
|
| 711 |
+
story.append(Spacer(1, 10))
|
| 712 |
+
|
| 713 |
+
# Agent Interaction Diagram
|
| 714 |
+
agent_diagram = DiagramFlowable(500, 250, 'agent_interaction')
|
| 715 |
+
story.append(agent_diagram)
|
| 716 |
+
story.append(Paragraph('Figure 4: Agent Interaction Model', styles['Caption']))
|
| 717 |
+
|
| 718 |
+
story.append(PageBreak())
|
| 719 |
+
|
| 720 |
+
story.append(Paragraph('<b>3.5 Data Flow Architecture</b>', styles['SubsectionHeader']))
|
| 721 |
+
story.append(Paragraph(
|
| 722 |
+
'''The end-to-end data flow illustrates how documents are processed from upload
|
| 723 |
+
through indexing, and how queries are handled through the multi-agent pipeline
|
| 724 |
+
to produce validated, citation-backed responses.''',
|
| 725 |
+
styles['CustomBody']
|
| 726 |
+
))
|
| 727 |
+
|
| 728 |
+
story.append(Spacer(1, 10))
|
| 729 |
+
|
| 730 |
+
# Data Flow Diagram
|
| 731 |
+
flow_diagram = DiagramFlowable(500, 320, 'data_flow')
|
| 732 |
+
story.append(flow_diagram)
|
| 733 |
+
story.append(Paragraph('Figure 5: End-to-End Data Flow', styles['Caption']))
|
| 734 |
+
|
| 735 |
+
story.append(PageBreak())
|
| 736 |
+
|
| 737 |
+
# ========== 4. COMPONENT DEEP DIVE ==========
|
| 738 |
+
story.append(Paragraph('4. Component Deep Dive', styles['SectionHeader']))
|
| 739 |
+
|
| 740 |
+
story.append(Paragraph('<b>4.1 Query Planning Agent</b>', styles['SubsectionHeader']))
|
| 741 |
+
story.append(Paragraph(
|
| 742 |
+
'''The QueryPlannerAgent is responsible for understanding user intent, classifying
|
| 743 |
+
query types, and decomposing complex queries into manageable sub-queries.''',
|
| 744 |
+
styles['CustomBody']
|
| 745 |
+
))
|
| 746 |
+
|
| 747 |
+
# Query types table
|
| 748 |
+
query_types = [
|
| 749 |
+
['Intent Type', 'Description', 'Example'],
|
| 750 |
+
['FACTOID', 'Simple fact lookup', '"What is the revenue for Q4?"'],
|
| 751 |
+
['COMPARISON', 'Multi-entity comparison', '"Compare product A vs B features"'],
|
| 752 |
+
['AGGREGATION', 'Cross-document summary', '"Summarize all quarterly reports"'],
|
| 753 |
+
['CAUSAL', 'Why/how explanations', '"Why did revenue decline?"'],
|
| 754 |
+
['PROCEDURAL', 'Step-by-step instructions', '"How to configure the system?"'],
|
| 755 |
+
['MULTI_HOP', 'Multi-step reasoning', '"Which supplier has the lowest cost for product X?"'],
|
| 756 |
+
]
|
| 757 |
+
|
| 758 |
+
qt_table = Table(query_types, colWidths=[90, 180, 180])
|
| 759 |
+
qt_table.setStyle(TableStyle([
|
| 760 |
+
('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
|
| 761 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 762 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 763 |
+
('FONTSIZE', (0, 0), (-1, -1), 8),
|
| 764 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 765 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 766 |
+
('PADDING', (0, 0), (-1, -1), 5),
|
| 767 |
+
]))
|
| 768 |
+
story.append(qt_table)
|
| 769 |
+
story.append(Paragraph('Table 1: Supported Query Intent Types', styles['Caption']))
|
| 770 |
+
|
| 771 |
+
story.append(Spacer(1, 10))
|
| 772 |
+
story.append(Paragraph('<b>4.2 Hybrid Retrieval System</b>', styles['SubsectionHeader']))
|
| 773 |
+
story.append(Paragraph(
|
| 774 |
+
'''The RetrieverAgent implements a sophisticated hybrid search combining dense
|
| 775 |
+
semantic retrieval with sparse keyword matching, using Reciprocal Rank Fusion (RRF)
|
| 776 |
+
to merge results optimally.''',
|
| 777 |
+
styles['CustomBody']
|
| 778 |
+
))
|
| 779 |
+
|
| 780 |
+
retrieval_features = [
|
| 781 |
+
'<b>Dense Retrieval:</b> Embedding-based semantic search using mxbai-embed-large (1024 dimensions)',
|
| 782 |
+
'<b>Sparse Retrieval:</b> BM25-style keyword matching for precise term matching',
|
| 783 |
+
'<b>RRF Fusion:</b> Combines rankings using formula: RRF = Σ(1 / (k + rank))',
|
| 784 |
+
'<b>Intent-Adaptive Weights:</b> Adjusts dense/sparse balance based on query type (e.g., 80/20 for definitions, 50/50 for comparisons)',
|
| 785 |
+
]
|
| 786 |
+
|
| 787 |
+
for feat in retrieval_features:
|
| 788 |
+
story.append(Paragraph(f'• {feat}', styles['BulletText']))
|
| 789 |
+
|
| 790 |
+
story.append(Spacer(1, 10))
|
| 791 |
+
story.append(Paragraph('<b>4.3 Cross-Encoder Reranking</b>', styles['SubsectionHeader']))
|
| 792 |
+
story.append(Paragraph(
|
| 793 |
+
'''The RerankerAgent applies LLM-based cross-encoder scoring to refine retrieval
|
| 794 |
+
results, implementing deduplication and Maximal Marginal Relevance (MMR) for
|
| 795 |
+
diversity promotion.''',
|
| 796 |
+
styles['CustomBody']
|
| 797 |
+
))
|
| 798 |
+
|
| 799 |
+
reranker_config = [
|
| 800 |
+
['Parameter', 'Value', 'Purpose'],
|
| 801 |
+
['top_k', '5', 'Final result count'],
|
| 802 |
+
['min_relevance_score', '0.3', 'Quality threshold'],
|
| 803 |
+
['dedup_threshold', '0.9', 'Similarity for duplicate detection'],
|
| 804 |
+
['MMR lambda', '0.7', 'Relevance vs diversity balance'],
|
| 805 |
+
]
|
| 806 |
+
|
| 807 |
+
rr_table = Table(reranker_config, colWidths=[140, 80, 230])
|
| 808 |
+
rr_table.setStyle(TableStyle([
|
| 809 |
+
('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
|
| 810 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 811 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 812 |
+
('FONTSIZE', (0, 0), (-1, -1), 9),
|
| 813 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 814 |
+
('PADDING', (0, 0), (-1, -1), 6),
|
| 815 |
+
]))
|
| 816 |
+
story.append(rr_table)
|
| 817 |
+
story.append(Paragraph('Table 2: Reranker Configuration', styles['Caption']))
|
| 818 |
+
|
| 819 |
+
story.append(PageBreak())
|
| 820 |
+
|
| 821 |
+
story.append(Paragraph('<b>4.4 Answer Synthesis</b>', styles['SubsectionHeader']))
|
| 822 |
+
story.append(Paragraph(
|
| 823 |
+
'''The SynthesizerAgent generates comprehensive answers with automatic citation
|
| 824 |
+
tracking, supporting multiple output formats and implementing intelligent abstention
|
| 825 |
+
when evidence is insufficient.''',
|
| 826 |
+
styles['CustomBody']
|
| 827 |
+
))
|
| 828 |
+
|
| 829 |
+
story.append(Paragraph('<b>Supported Answer Formats:</b>', styles['CustomBody']))
|
| 830 |
+
formats = ['PROSE - Flowing paragraph narrative', 'BULLET_POINTS - Enumerated key points',
|
| 831 |
+
'TABLE - Comparative tabular format', 'STEP_BY_STEP - Procedural instructions']
|
| 832 |
+
for fmt in formats:
|
| 833 |
+
story.append(Paragraph(f'• {fmt}', styles['BulletText']))
|
| 834 |
+
|
| 835 |
+
story.append(Paragraph('<b>Confidence Calculation:</b>', styles['CustomBody']))
|
| 836 |
+
story.append(Paragraph('confidence = 0.5 × source_relevance + 0.3 × source_count_factor + 0.2 × consistency', styles['BulletText']))
|
| 837 |
+
|
| 838 |
+
story.append(Spacer(1, 10))
|
| 839 |
+
story.append(Paragraph('<b>4.5 Validation & Hallucination Detection</b>', styles['SubsectionHeader']))
|
| 840 |
+
story.append(Paragraph(
|
| 841 |
+
'''The CriticAgent performs comprehensive validation including hallucination detection,
|
| 842 |
+
citation verification, and factual consistency checking. It can trigger revision
|
| 843 |
+
cycles when issues are detected.''',
|
| 844 |
+
styles['CustomBody']
|
| 845 |
+
))
|
| 846 |
+
|
| 847 |
+
issue_types = [
|
| 848 |
+
['Issue Type', 'Description', 'Severity'],
|
| 849 |
+
['HALLUCINATION', 'Information not supported by sources', 'Critical'],
|
| 850 |
+
['UNSUPPORTED_CLAIM', 'Statement without citation', 'High'],
|
| 851 |
+
['INCORRECT_CITATION', 'Citation references wrong source', 'High'],
|
| 852 |
+
['CONTRADICTION', 'Internal inconsistency in answer', 'Medium'],
|
| 853 |
+
['INCOMPLETE', 'Missing important information', 'Medium'],
|
| 854 |
+
['FACTUAL_ERROR', 'Verifiable factual mistake', 'Critical'],
|
| 855 |
+
]
|
| 856 |
+
|
| 857 |
+
it_table = Table(issue_types, colWidths=[130, 230, 90])
|
| 858 |
+
it_table.setStyle(TableStyle([
|
| 859 |
+
('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
|
| 860 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 861 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 862 |
+
('FONTSIZE', (0, 0), (-1, -1), 9),
|
| 863 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 864 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 865 |
+
('PADDING', (0, 0), (-1, -1), 5),
|
| 866 |
+
]))
|
| 867 |
+
story.append(it_table)
|
| 868 |
+
story.append(Paragraph('Table 3: Validation Issue Types', styles['Caption']))
|
| 869 |
+
|
| 870 |
+
story.append(PageBreak())
|
| 871 |
+
|
| 872 |
+
story.append(Paragraph('<b>4.6 Document Processing Components</b>', styles['SubsectionHeader']))
|
| 873 |
+
|
| 874 |
+
story.append(Paragraph('<b>OCR Engines:</b>', styles['CustomBody']))
|
| 875 |
+
ocr_comparison = [
|
| 876 |
+
['Feature', 'PaddleOCR', 'Tesseract'],
|
| 877 |
+
['GPU Acceleration', '✓ Yes', '✗ No'],
|
| 878 |
+
['Multi-language', '✓ 80+ languages', '✓ 100+ languages'],
|
| 879 |
+
['Accuracy (Clean)', '~95%', '~90%'],
|
| 880 |
+
['Accuracy (Complex)', '~85%', '~75%'],
|
| 881 |
+
['Speed', 'Fast', 'Moderate'],
|
| 882 |
+
['Confidence Scores', '✓ Per-region', '✓ Per-word'],
|
| 883 |
+
]
|
| 884 |
+
|
| 885 |
+
ocr_table = Table(ocr_comparison, colWidths=[130, 160, 160])
|
| 886 |
+
ocr_table.setStyle(TableStyle([
|
| 887 |
+
('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
|
| 888 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 889 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 890 |
+
('FONTSIZE', (0, 0), (-1, -1), 9),
|
| 891 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 892 |
+
('PADDING', (0, 0), (-1, -1), 5),
|
| 893 |
+
]))
|
| 894 |
+
story.append(ocr_table)
|
| 895 |
+
story.append(Paragraph('Table 4: OCR Engine Comparison', styles['Caption']))
|
| 896 |
+
|
| 897 |
+
story.append(Spacer(1, 10))
|
| 898 |
+
story.append(Paragraph('<b>Layout Detection:</b>', styles['CustomBody']))
|
| 899 |
+
layout_types = ['TEXT, TITLE, HEADING, PARAGRAPH - Text regions',
|
| 900 |
+
'TABLE, FIGURE, CHART - Visual elements',
|
| 901 |
+
'CAPTION, FOOTNOTE - Supplementary text',
|
| 902 |
+
'HEADER, FOOTER - Page elements',
|
| 903 |
+
'FORMULA - Mathematical expressions']
|
| 904 |
+
for lt in layout_types:
|
| 905 |
+
story.append(Paragraph(f'• {lt}', styles['BulletText']))
|
| 906 |
+
|
| 907 |
+
story.append(Spacer(1, 10))
|
| 908 |
+
story.append(Paragraph('<b>Chunking Configuration:</b>', styles['CustomBody']))
|
| 909 |
+
chunk_config = [
|
| 910 |
+
['Parameter', 'Default', 'Description'],
|
| 911 |
+
['max_chunk_chars', '1000', 'Maximum characters per chunk'],
|
| 912 |
+
['min_chunk_chars', '50', 'Minimum viable chunk size'],
|
| 913 |
+
['overlap_chars', '100', 'Overlap between consecutive chunks'],
|
| 914 |
+
['Strategy', 'Semantic', 'Respects layout boundaries'],
|
| 915 |
+
]
|
| 916 |
+
|
| 917 |
+
cc_table = Table(chunk_config, colWidths=[120, 80, 250])
|
| 918 |
+
cc_table.setStyle(TableStyle([
|
| 919 |
+
('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
|
| 920 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 921 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 922 |
+
('FONTSIZE', (0, 0), (-1, -1), 9),
|
| 923 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 924 |
+
('PADDING', (0, 0), (-1, -1), 5),
|
| 925 |
+
]))
|
| 926 |
+
story.append(cc_table)
|
| 927 |
+
story.append(Paragraph('Table 5: Chunking Configuration', styles['Caption']))
|
| 928 |
+
|
| 929 |
+
story.append(PageBreak())
|
| 930 |
+
|
| 931 |
+
# ========== 5. CURRENT PROGRESS ==========
|
| 932 |
+
story.append(Paragraph('5. Current Progress & Achievements', styles['SectionHeader']))
|
| 933 |
+
|
| 934 |
+
story.append(Paragraph('<b>5.1 Development Milestones</b>', styles['SubsectionHeader']))
|
| 935 |
+
|
| 936 |
+
milestones = [
|
| 937 |
+
['Milestone', 'Status', 'Completion'],
|
| 938 |
+
['Core RAG Pipeline', 'Complete', '100%'],
|
| 939 |
+
['5-Agent Architecture', 'Complete', '100%'],
|
| 940 |
+
['Document Processing Pipeline', 'Complete', '100%'],
|
| 941 |
+
['ChromaDB Integration', 'Complete', '100%'],
|
| 942 |
+
['Ollama LLM Integration', 'Complete', '100%'],
|
| 943 |
+
['Streamlit Demo Application', 'Complete', '100%'],
|
| 944 |
+
['State Management System', 'Complete', '100%'],
|
| 945 |
+
['Hallucination Detection', 'Complete', '100%'],
|
| 946 |
+
['PDF Processing', 'Complete', '100%'],
|
| 947 |
+
['Self-Correction Loop', 'Complete', '100%'],
|
| 948 |
+
]
|
| 949 |
+
|
| 950 |
+
ms_table = Table(milestones, colWidths=[220, 120, 110])
|
| 951 |
+
ms_table.setStyle(TableStyle([
|
| 952 |
+
('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
|
| 953 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 954 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 955 |
+
('FONTSIZE', (0, 0), (-1, -1), 9),
|
| 956 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 957 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 958 |
+
('PADDING', (0, 0), (-1, -1), 6),
|
| 959 |
+
('ALIGN', (1, 0), (-1, -1), 'CENTER'),
|
| 960 |
+
]))
|
| 961 |
+
story.append(ms_table)
|
| 962 |
+
story.append(Paragraph('Table 6: Development Milestones', styles['Caption']))
|
| 963 |
+
|
| 964 |
+
story.append(Spacer(1, 15))
|
| 965 |
+
story.append(Paragraph('<b>5.2 Demo Application Features</b>', styles['SubsectionHeader']))
|
| 966 |
+
|
| 967 |
+
demo_features = [
|
| 968 |
+
['Page', 'Features', 'Status'],
|
| 969 |
+
['Live Processing', 'Real-time document processing, progress tracking, auto-indexing', '✓ Complete'],
|
| 970 |
+
['Interactive RAG', 'Query interface, document filtering, chunk preview, citations', '✓ Complete'],
|
| 971 |
+
['Document Comparison', 'Semantic similarity, structure analysis, content diff', '✓ Complete'],
|
| 972 |
+
['Evidence Viewer', 'Confidence coloring, bounding boxes, OCR regions, export', '✓ Complete'],
|
| 973 |
+
['Document Viewer', 'Multi-tab view, chunk display, layout visualization', '✓ Complete'],
|
| 974 |
+
]
|
| 975 |
+
|
| 976 |
+
df_table = Table(demo_features, colWidths=[110, 270, 70])
|
| 977 |
+
df_table.setStyle(TableStyle([
|
| 978 |
+
('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
|
| 979 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 980 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 981 |
+
('FONTSIZE', (0, 0), (-1, -1), 9),
|
| 982 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 983 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 984 |
+
('PADDING', (0, 0), (-1, -1), 5),
|
| 985 |
+
('ALIGN', (2, 0), (2, -1), 'CENTER'),
|
| 986 |
+
]))
|
| 987 |
+
story.append(df_table)
|
| 988 |
+
story.append(Paragraph('Table 7: Demo Application Features', styles['Caption']))
|
| 989 |
+
|
| 990 |
+
story.append(Spacer(1, 15))
|
| 991 |
+
story.append(Paragraph('<b>5.3 Technical Achievements</b>', styles['SubsectionHeader']))
|
| 992 |
+
|
| 993 |
+
achievements = [
|
| 994 |
+
'<b>Hybrid Retrieval:</b> Successfully combined dense and sparse retrieval with RRF fusion, achieving better recall than either method alone.',
|
| 995 |
+
'<b>Self-Correction:</b> Implemented revision loop allowing the system to automatically fix issues detected by the Critic agent.',
|
| 996 |
+
'<b>Citation Tracking:</b> Automatic citation generation with [N] notation linking answers to source documents.',
|
| 997 |
+
'<b>Confidence Scoring:</b> Multi-factor confidence calculation providing transparency into answer reliability.',
|
| 998 |
+
'<b>Streaming Support:</b> Real-time response streaming for improved user experience during long generations.',
|
| 999 |
+
'<b>Cross-Module Communication:</b> Unified state manager enabling seamless navigation between application modules.',
|
| 1000 |
+
]
|
| 1001 |
+
|
| 1002 |
+
for ach in achievements:
|
| 1003 |
+
story.append(Paragraph(f'• {ach}', styles['BulletText']))
|
| 1004 |
+
|
| 1005 |
+
story.append(PageBreak())
|
| 1006 |
+
|
| 1007 |
+
# ========== 6. GAP ANALYSIS ==========
|
| 1008 |
+
story.append(Paragraph('6. Gap Analysis', styles['SectionHeader']))
|
| 1009 |
+
|
| 1010 |
+
story.append(Paragraph(
|
| 1011 |
+
'''This section identifies current limitations and gaps in the SPARKNET system
|
| 1012 |
+
that represent opportunities for improvement and future development.''',
|
| 1013 |
+
styles['CustomBody']
|
| 1014 |
+
))
|
| 1015 |
+
|
| 1016 |
+
story.append(Spacer(1, 10))
|
| 1017 |
+
story.append(Paragraph('<b>6.1 Functional Gaps</b>', styles['SubsectionHeader']))
|
| 1018 |
+
|
| 1019 |
+
functional_gaps = [
|
| 1020 |
+
['Gap ID', 'Category', 'Description', 'Impact', 'Priority'],
|
| 1021 |
+
['FG-001', 'Document Support', 'Limited to PDF and images; no Word, Excel, PowerPoint support', 'High', 'P1'],
|
| 1022 |
+
['FG-002', 'Table Extraction', 'Table structure not preserved during chunking', 'High', 'P1'],
|
| 1023 |
+
['FG-003', 'Multi-modal', 'No image/chart understanding within documents', 'Medium', 'P2'],
|
| 1024 |
+
['FG-004', 'Languages', 'Primarily English; limited multi-language support', 'Medium', 'P2'],
|
| 1025 |
+
['FG-005', 'Batch Processing', 'No bulk document upload/processing capability', 'Medium', 'P2'],
|
| 1026 |
+
['FG-006', 'Document Updates', 'No incremental update; full reprocessing required', 'Medium', 'P2'],
|
| 1027 |
+
['FG-007', 'User Feedback', 'No mechanism to learn from user corrections', 'Low', 'P3'],
|
| 1028 |
+
]
|
| 1029 |
+
|
| 1030 |
+
fg_table = Table(functional_gaps, colWidths=[50, 85, 200, 55, 55])
|
| 1031 |
+
fg_table.setStyle(TableStyle([
|
| 1032 |
+
('BACKGROUND', (0, 0), (-1, 0), DANGER_RED),
|
| 1033 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 1034 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 1035 |
+
('FONTSIZE', (0, 0), (-1, -1), 8),
|
| 1036 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 1037 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 1038 |
+
('PADDING', (0, 0), (-1, -1), 4),
|
| 1039 |
+
('ALIGN', (0, 0), (0, -1), 'CENTER'),
|
| 1040 |
+
('ALIGN', (3, 0), (-1, -1), 'CENTER'),
|
| 1041 |
+
]))
|
| 1042 |
+
story.append(fg_table)
|
| 1043 |
+
story.append(Paragraph('Table 8: Functional Gaps', styles['Caption']))
|
| 1044 |
+
|
| 1045 |
+
story.append(Spacer(1, 15))
|
| 1046 |
+
story.append(Paragraph('<b>6.2 Technical Gaps</b>', styles['SubsectionHeader']))
|
| 1047 |
+
|
| 1048 |
+
technical_gaps = [
|
| 1049 |
+
['Gap ID', 'Category', 'Description', 'Impact', 'Priority'],
|
| 1050 |
+
['TG-001', 'Scalability', 'Single-node architecture; no distributed processing', 'High', 'P1'],
|
| 1051 |
+
['TG-002', 'Authentication', 'No user authentication or access control', 'High', 'P1'],
|
| 1052 |
+
['TG-003', 'API', 'No REST API for external integration', 'High', 'P1'],
|
| 1053 |
+
['TG-004', 'Caching', 'Limited query result caching; redundant LLM calls', 'Medium', 'P2'],
|
| 1054 |
+
['TG-005', 'Monitoring', 'Basic logging only; no metrics/alerting system', 'Medium', 'P2'],
|
| 1055 |
+
['TG-006', 'Testing', 'Limited test coverage; no integration tests', 'Medium', 'P2'],
|
| 1056 |
+
['TG-007', 'Cloud Deploy', 'Not containerized; no Kubernetes manifests', 'Medium', 'P2'],
|
| 1057 |
+
['TG-008', 'GPU Sharing', 'Single GPU utilization; no multi-GPU support', 'Low', 'P3'],
|
| 1058 |
+
]
|
| 1059 |
+
|
| 1060 |
+
tg_table = Table(technical_gaps, colWidths=[50, 80, 205, 55, 55])
|
| 1061 |
+
tg_table.setStyle(TableStyle([
|
| 1062 |
+
('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
|
| 1063 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 1064 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 1065 |
+
('FONTSIZE', (0, 0), (-1, -1), 8),
|
| 1066 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 1067 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 1068 |
+
('PADDING', (0, 0), (-1, -1), 4),
|
| 1069 |
+
('ALIGN', (0, 0), (0, -1), 'CENTER'),
|
| 1070 |
+
('ALIGN', (3, 0), (-1, -1), 'CENTER'),
|
| 1071 |
+
]))
|
| 1072 |
+
story.append(tg_table)
|
| 1073 |
+
story.append(Paragraph('Table 9: Technical Gaps', styles['Caption']))
|
| 1074 |
+
|
| 1075 |
+
story.append(PageBreak())
|
| 1076 |
+
|
| 1077 |
+
story.append(Paragraph('<b>6.3 Performance Gaps</b>', styles['SubsectionHeader']))
|
| 1078 |
+
|
| 1079 |
+
perf_gaps = [
|
| 1080 |
+
['Gap ID', 'Metric', 'Current', 'Target', 'Gap'],
|
| 1081 |
+
['PG-001', 'Query Latency (simple)', '3-5 seconds', '<2 seconds', '~2x improvement needed'],
|
| 1082 |
+
['PG-002', 'Query Latency (complex)', '10-20 seconds', '<5 seconds', '~3x improvement needed'],
|
| 1083 |
+
['PG-003', 'Document Processing', '30-60 sec/page', '<10 sec/page', '~4x improvement needed'],
|
| 1084 |
+
['PG-004', 'Concurrent Users', '1-5', '50+', 'Major scaling required'],
|
| 1085 |
+
['PG-005', 'Index Size', '10K chunks', '1M+ chunks', 'Architecture redesign'],
|
| 1086 |
+
['PG-006', 'Accuracy (hallucination)', '~85%', '>95%', '~10% improvement'],
|
| 1087 |
+
]
|
| 1088 |
+
|
| 1089 |
+
pg_table = Table(perf_gaps, colWidths=[50, 120, 90, 90, 100])
|
| 1090 |
+
pg_table.setStyle(TableStyle([
|
| 1091 |
+
('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
|
| 1092 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 1093 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 1094 |
+
('FONTSIZE', (0, 0), (-1, -1), 8),
|
| 1095 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 1096 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 1097 |
+
('PADDING', (0, 0), (-1, -1), 4),
|
| 1098 |
+
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
| 1099 |
+
]))
|
| 1100 |
+
story.append(pg_table)
|
| 1101 |
+
story.append(Paragraph('Table 10: Performance Gaps', styles['Caption']))
|
| 1102 |
+
|
| 1103 |
+
story.append(Spacer(1, 15))
|
| 1104 |
+
story.append(Paragraph('<b>6.4 Security & Compliance Gaps</b>', styles['SubsectionHeader']))
|
| 1105 |
+
|
| 1106 |
+
security_gaps = [
|
| 1107 |
+
'<b>No Authentication:</b> Currently no user login or session management',
|
| 1108 |
+
'<b>No Authorization:</b> Missing role-based access control (RBAC) for documents',
|
| 1109 |
+
'<b>Data Encryption:</b> Documents and embeddings stored unencrypted at rest',
|
| 1110 |
+
'<b>Audit Logging:</b> No comprehensive audit trail for compliance requirements',
|
| 1111 |
+
'<b>PII Detection:</b> No automatic detection/redaction of personally identifiable information',
|
| 1112 |
+
'<b>GDPR/HIPAA:</b> Not compliant with major data protection regulations',
|
| 1113 |
+
]
|
| 1114 |
+
|
| 1115 |
+
for sg in security_gaps:
|
| 1116 |
+
story.append(Paragraph(f'• {sg}', styles['BulletText']))
|
| 1117 |
+
|
| 1118 |
+
story.append(PageBreak())
|
| 1119 |
+
|
| 1120 |
+
# ========== 7. FUTURE WORK & ROADMAP ==========
|
| 1121 |
+
story.append(Paragraph('7. Future Work & Roadmap', styles['SectionHeader']))
|
| 1122 |
+
|
| 1123 |
+
story.append(Paragraph('<b>7.1 Strategic Roadmap Overview</b>', styles['SubsectionHeader']))
|
| 1124 |
+
story.append(Paragraph(
|
| 1125 |
+
'''The SPARKNET roadmap is organized into three phases, each building upon the
|
| 1126 |
+
previous to transform the current prototype into a production-ready enterprise
|
| 1127 |
+
solution.''',
|
| 1128 |
+
styles['CustomBody']
|
| 1129 |
+
))
|
| 1130 |
+
|
| 1131 |
+
story.append(Spacer(1, 10))
|
| 1132 |
+
|
| 1133 |
+
# Roadmap phases
|
| 1134 |
+
roadmap = [
|
| 1135 |
+
['Phase', 'Timeline', 'Focus Areas', 'Key Deliverables'],
|
| 1136 |
+
['Phase 1:\nFoundation', 'Q1-Q2 2026',
|
| 1137 |
+
'Stability, Core Features,\nBasic Security',
|
| 1138 |
+
'• REST API\n• Authentication\n• Extended document formats\n• Basic containerization'],
|
| 1139 |
+
['Phase 2:\nScale', 'Q3-Q4 2026',
|
| 1140 |
+
'Performance, Scalability,\nEnterprise Features',
|
| 1141 |
+
'• Distributed processing\n• Advanced caching\n• Multi-tenancy\n• Monitoring & alerting'],
|
| 1142 |
+
['Phase 3:\nInnovation', 'Q1-Q2 2027',
|
| 1143 |
+
'Advanced AI, Compliance,\nEcosystem',
|
| 1144 |
+
'• Multi-modal understanding\n• Compliance frameworks\n• Plugin architecture\n• Advanced analytics'],
|
| 1145 |
+
]
|
| 1146 |
+
|
| 1147 |
+
rm_table = Table(roadmap, colWidths=[70, 80, 130, 170])
|
| 1148 |
+
rm_table.setStyle(TableStyle([
|
| 1149 |
+
('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
|
| 1150 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 1151 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 1152 |
+
('FONTSIZE', (0, 0), (-1, -1), 8),
|
| 1153 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 1154 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [LIGHT_BLUE, WHITE]),
|
| 1155 |
+
('PADDING', (0, 0), (-1, -1), 6),
|
| 1156 |
+
('VALIGN', (0, 0), (-1, -1), 'TOP'),
|
| 1157 |
+
]))
|
| 1158 |
+
story.append(rm_table)
|
| 1159 |
+
story.append(Paragraph('Table 11: Strategic Roadmap', styles['Caption']))
|
| 1160 |
+
|
| 1161 |
+
story.append(Spacer(1, 15))
|
| 1162 |
+
story.append(Paragraph('<b>7.2 Phase 1: Foundation (Q1-Q2 2026)</b>', styles['SubsectionHeader']))
|
| 1163 |
+
|
| 1164 |
+
phase1_items = [
|
| 1165 |
+
['Item', 'Description', 'Effort', 'Dependencies'],
|
| 1166 |
+
['REST API Development', 'FastAPI-based API for all core functions', '4 weeks', 'None'],
|
| 1167 |
+
['User Authentication', 'JWT-based auth with OAuth2 support', '3 weeks', 'API'],
|
| 1168 |
+
['Document Format Extension', 'Add Word, Excel, PowerPoint support', '4 weeks', 'None'],
|
| 1169 |
+
['Table Extraction', 'Preserve table structure in processing', '3 weeks', 'None'],
|
| 1170 |
+
['Docker Containerization', 'Production-ready Docker images', '2 weeks', 'None'],
|
| 1171 |
+
['Basic CI/CD Pipeline', 'Automated testing and deployment', '2 weeks', 'Docker'],
|
| 1172 |
+
['Query Result Caching', 'Redis-based caching layer', '2 weeks', 'API'],
|
| 1173 |
+
['Unit Test Coverage', 'Achieve 80% code coverage', '3 weeks', 'Ongoing'],
|
| 1174 |
+
]
|
| 1175 |
+
|
| 1176 |
+
p1_table = Table(phase1_items, colWidths=[130, 180, 60, 80])
|
| 1177 |
+
p1_table.setStyle(TableStyle([
|
| 1178 |
+
('BACKGROUND', (0, 0), (-1, 0), SUCCESS_GREEN),
|
| 1179 |
+
('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
|
| 1180 |
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 1181 |
+
('FONTSIZE', (0, 0), (-1, -1), 8),
|
| 1182 |
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
| 1183 |
+
('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
|
| 1184 |
+
('PADDING', (0, 0), (-1, -1), 4),
|
| 1185 |
+
]))
|
| 1186 |
+
story.append(p1_table)
|
| 1187 |
+
    story.append(Paragraph('Table 12: Phase 1 Deliverables', styles['Caption']))

    story.append(PageBreak())

    story.append(Paragraph('<b>7.3 Phase 2: Scale (Q3-Q4 2026)</b>', styles['SubsectionHeader']))

    phase2_items = [
        ['Item', 'Description', 'Effort', 'Dependencies'],
        ['Distributed Processing', 'Celery/Ray for parallel document processing', '6 weeks', 'Phase 1'],
        ['Vector Store Scaling', 'Milvus/Pinecone for large-scale indices', '4 weeks', 'Phase 1'],
        ['Multi-tenancy', 'Organization-based data isolation', '4 weeks', 'Auth'],
        ['Kubernetes Deployment', 'Full K8s manifests and Helm charts', '3 weeks', 'Docker'],
        ['Monitoring Stack', 'Prometheus, Grafana, ELK integration', '3 weeks', 'K8s'],
        ['Batch Processing', 'Bulk document upload and processing', '3 weeks', 'Distributed'],
        ['Advanced Caching', 'Semantic caching for similar queries', '3 weeks', 'Cache'],
        ['Performance Optimization', 'Achieve <2s simple query latency', '4 weeks', 'Caching'],
    ]

    p2_table = Table(phase2_items, colWidths=[130, 180, 60, 80])
    p2_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
        ('PADDING', (0, 0), (-1, -1), 4),
    ]))
    story.append(p2_table)
    story.append(Paragraph('Table 13: Phase 2 Deliverables', styles['Caption']))

    story.append(Spacer(1, 15))
    story.append(Paragraph('<b>7.4 Phase 3: Innovation (Q1-Q2 2027)</b>', styles['SubsectionHeader']))

    phase3_items = [
        ['Item', 'Description', 'Effort', 'Dependencies'],
        ['Multi-modal Understanding', 'GPT-4V/Claude Vision for image analysis', '6 weeks', 'Phase 2'],
        ['Advanced Table QA', 'SQL-like queries over extracted tables', '4 weeks', 'Table Extract'],
        ['PII Detection/Redaction', 'Automatic sensitive data handling', '4 weeks', 'None'],
        ['Compliance Framework', 'GDPR, HIPAA, SOC2 compliance', '8 weeks', 'PII'],
        ['Plugin Architecture', 'Extensible agent and tool system', '4 weeks', 'Phase 2'],
        ['Analytics Dashboard', 'Usage analytics and insights', '3 weeks', 'Monitoring'],
        ['Multi-language Support', 'Full support for top 10 languages', '4 weeks', 'None'],
        ['Feedback Learning', 'Learn from user corrections', '4 weeks', 'Analytics'],
    ]

    p3_table = Table(phase3_items, colWidths=[130, 180, 60, 80])
    p3_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), ACCENT_BLUE),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
        ('PADDING', (0, 0), (-1, -1), 4),
    ]))
    story.append(p3_table)
    story.append(Paragraph('Table 14: Phase 3 Deliverables', styles['Caption']))

    story.append(PageBreak())

    # ========== 8. RISK ASSESSMENT ==========
    story.append(Paragraph('8. Risk Assessment', styles['SectionHeader']))

    story.append(Paragraph('<b>8.1 Technical Risks</b>', styles['SubsectionHeader']))

    tech_risks = [
        ['Risk', 'Probability', 'Impact', 'Mitigation'],
        ['LLM API Changes', 'Medium', 'High', 'Abstract LLM interface; support multiple providers'],
        ['Scaling Bottlenecks', 'High', 'High', 'Early load testing; phased rollout'],
        ['Model Accuracy Plateau', 'Medium', 'Medium', 'Ensemble approaches; fine-tuning capability'],
        ['Dependency Vulnerabilities', 'Medium', 'Medium', 'Regular dependency audits; Dependabot'],
        ['Data Loss', 'Low', 'Critical', 'Automated backups; disaster recovery plan'],
    ]

    tr_table = Table(tech_risks, colWidths=[120, 70, 70, 190])
    tr_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), DANGER_RED),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
        ('PADDING', (0, 0), (-1, -1), 5),
        ('ALIGN', (1, 0), (2, -1), 'CENTER'),
    ]))
    story.append(tr_table)
    story.append(Paragraph('Table 15: Technical Risks', styles['Caption']))

    story.append(Spacer(1, 15))
    story.append(Paragraph('<b>8.2 Project Risks</b>', styles['SubsectionHeader']))

    proj_risks = [
        ['Risk', 'Probability', 'Impact', 'Mitigation'],
        ['Scope Creep', 'High', 'Medium', 'Strict phase gates; change control process'],
        ['Resource Constraints', 'Medium', 'High', 'Prioritized backlog; MVP focus'],
        ['Timeline Slippage', 'Medium', 'Medium', 'Buffer time; parallel workstreams'],
        ['Knowledge Silos', 'Medium', 'Medium', 'Documentation; pair programming; code reviews'],
        ['Stakeholder Alignment', 'Low', 'High', 'Regular demos; feedback cycles'],
    ]

    pr_table = Table(proj_risks, colWidths=[120, 70, 70, 190])
    pr_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
        ('PADDING', (0, 0), (-1, -1), 5),
        ('ALIGN', (1, 0), (2, -1), 'CENTER'),
    ]))
    story.append(pr_table)
    story.append(Paragraph('Table 16: Project Risks', styles['Caption']))

    story.append(PageBreak())

    # ========== 9. RESOURCE REQUIREMENTS ==========
    story.append(Paragraph('9. Resource Requirements', styles['SectionHeader']))

    story.append(Paragraph('<b>9.1 Team Structure (Recommended)</b>', styles['SubsectionHeader']))

    team = [
        ['Role', 'Count', 'Phase 1', 'Phase 2', 'Phase 3'],
        ['Senior ML Engineer', '2', '✓', '✓', '✓'],
        ['Backend Developer', '2', '✓', '✓', '✓'],
        ['Frontend Developer', '1', '✓', '✓', '✓'],
        ['DevOps Engineer', '1', '✓', '✓', '✓'],
        ['QA Engineer', '1', '—', '✓', '✓'],
        ['Technical Lead', '1', '✓', '✓', '✓'],
        ['Product Manager', '1', '✓', '✓', '✓'],
    ]

    team_table = Table(team, colWidths=[130, 60, 70, 70, 70])
    team_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 9),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
        ('PADDING', (0, 0), (-1, -1), 6),
        ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
    ]))
    story.append(team_table)
    story.append(Paragraph('Table 17: Team Structure', styles['Caption']))

    story.append(Spacer(1, 15))
    story.append(Paragraph('<b>9.2 Infrastructure Requirements</b>', styles['SubsectionHeader']))

    infra = [
        ['Component', 'Development', 'Staging', 'Production'],
        ['GPU Servers', '1x A100 40GB', '2x A100 40GB', '4x A100 80GB'],
        ['CPU Servers', '4 vCPU, 16GB', '8 vCPU, 32GB', '16 vCPU, 64GB x3'],
        ['Storage', '500GB SSD', '2TB SSD', '10TB SSD + S3'],
        ['Vector DB', 'ChromaDB local', 'Milvus single', 'Milvus cluster'],
        ['Cache', 'In-memory', 'Redis single', 'Redis cluster'],
        ['Load Balancer', 'None', 'Nginx', 'AWS ALB / GCP LB'],
    ]

    infra_table = Table(infra, colWidths=[100, 120, 120, 110])
    infra_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
        ('PADDING', (0, 0), (-1, -1), 5),
    ]))
    story.append(infra_table)
    story.append(Paragraph('Table 18: Infrastructure Requirements', styles['Caption']))

    story.append(PageBreak())

    # ========== 10. CONCLUSION ==========
    story.append(Paragraph('10. Conclusion & Recommendations', styles['SectionHeader']))

    story.append(Paragraph('<b>10.1 Summary</b>', styles['SubsectionHeader']))
    story.append(Paragraph(
        '''SPARKNET has achieved significant progress as a proof-of-concept for multi-agentic
        document intelligence. The core RAG pipeline is functional, demonstrating the viability
        of the 5-agent architecture with self-correction capabilities. The system successfully
        processes documents, performs hybrid retrieval, and generates citation-backed responses.''',
        styles['CustomBody']
    ))

    story.append(Spacer(1, 10))
    story.append(Paragraph('<b>10.2 Key Recommendations</b>', styles['SubsectionHeader']))

    recommendations = [
        '<b>Prioritize API Development:</b> Enable external integrations and unlock enterprise adoption.',
        '<b>Invest in Security:</b> Authentication and authorization are prerequisites for any production deployment.',
        '<b>Focus on Performance:</b> Current latency is acceptable for demos but needs significant improvement for production use.',
        '<b>Expand Document Support:</b> Office formats (Word, Excel, PowerPoint) are critical for enterprise adoption.',
        '<b>Implement Monitoring:</b> Observability is essential for maintaining and scaling the system.',
        '<b>Plan for Scale Early:</b> Architectural decisions made now will impact scalability; consider distributed architecture.',
    ]

    for rec in recommendations:
        story.append(Paragraph(f'• {rec}', styles['BulletText']))

    story.append(Spacer(1, 15))
    story.append(Paragraph('<b>10.3 Immediate Next Steps</b>', styles['SubsectionHeader']))

    next_steps = [
        '1. Finalize Phase 1 scope and create detailed sprint plans',
        '2. Set up development infrastructure and CI/CD pipeline',
        '3. Begin REST API development (target: 4 weeks)',
        '4. Initiate security assessment and authentication design',
        '5. Start documentation and knowledge transfer activities',
        '6. Schedule bi-weekly stakeholder demos for continuous feedback',
    ]

    for step in next_steps:
        story.append(Paragraph(step, styles['BulletText']))

    story.append(Spacer(1, 30))

    # Final signature block
    story.append(HRFlowable(width='100%', thickness=1, color=PRIMARY_BLUE))
    story.append(Spacer(1, 15))

    story.append(Paragraph(
        f'''<b>Document prepared by:</b> SPARKNET Development Team<br/>
        <b>Report Date:</b> {datetime.now().strftime('%B %d, %Y')}<br/>
        <b>Version:</b> 1.0<br/>
        <b>Classification:</b> Internal / Confidential''',
        styles['CustomBody']
    ))

    story.append(Spacer(1, 20))
    story.append(Paragraph(
        '<i>This document contains confidential information intended for stakeholder review. '
        'Please do not distribute without authorization.</i>',
        styles['Caption']
    ))

    # Build PDF
    doc.build(story)
    print(f"Report generated: {filename}")
    return filename


if __name__ == '__main__':
    generate_report()
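For reviewers who want to regenerate the PDF: the module's __main__ block above calls generate_report() directly, so a minimal invocation sketch (illustrative only, not part of the commit; assumes ReportLab is installed and the command runs from the repository root) is:

# Hypothetical usage sketch -- not committed code.
import subprocess

# Builds the PDF and prints "Report generated: <filename>" on success.
subprocess.run(["python", "docs/SPARKNET_Progress_Report.py"], check=True)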
examples/document_agent.py
ADDED
@@ -0,0 +1,240 @@
"""
Example: DocumentAgent with ReAct-style Processing

Demonstrates:
1. Loading and processing documents
2. Field extraction with evidence
3. Document classification
4. Question answering with grounding
"""

import asyncio
from pathlib import Path
from loguru import logger

# Import DocumentAgent
from src.agents.document_agent import (
    DocumentAgent,
    AgentConfig,
)
from src.document.schemas.extraction import (
    ExtractionSchema,
    FieldDefinition,
)


async def example_basic_agent():
    """Basic agent usage."""
    print("=" * 50)
    print("Basic DocumentAgent Usage")
    print("=" * 50)

    # Create agent with custom config
    config = AgentConfig(
        default_model="llama3.2:3b",
        max_iterations=10,
        temperature=0.1,
    )
    agent = DocumentAgent(config)

    # Load document
    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf")
        return

    print(f"\nLoading document: {sample_doc}")
    await agent.load_document(str(sample_doc))

    print(f"Document loaded: {agent.document.metadata.filename}")
    print(f"Pages: {agent.document.metadata.num_pages}")
    print(f"Chunks: {len(agent.document.chunks)}")


async def example_field_extraction():
    """Extract structured fields with evidence."""
    print("\n" + "=" * 50)
    print("Field Extraction with Evidence")
    print("=" * 50)

    agent = DocumentAgent()

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    await agent.load_document(str(sample_doc))

    # Define extraction schema
    schema = ExtractionSchema(
        name="document_info",
        description="Extract key document information",
        fields=[
            FieldDefinition(
                name="title",
                field_type="string",
                description="Document title",
                required=True,
            ),
            FieldDefinition(
                name="author",
                field_type="string",
                description="Document author or organization",
                required=False,
            ),
            FieldDefinition(
                name="date",
                field_type="string",
                description="Document date",
                required=False,
            ),
            FieldDefinition(
                name="summary",
                field_type="string",
                description="Brief summary of document content",
                required=True,
            ),
        ],
    )

    # Extract fields
    print("\nExtracting fields...")
    result = await agent.extract_fields(schema)

    print(f"\nExtracted Fields:")
    for field, value in result.fields.items():
        print(f"  {field}: {value}")

    print(f"\nConfidence: {result.confidence:.2f}")

    if result.evidence:
        print(f"\nEvidence ({len(result.evidence)} sources):")
        for ev in result.evidence[:3]:
            print(f"  - Page {ev.page + 1}: {ev.snippet[:80]}...")


async def example_classification():
    """Classify document type."""
    print("\n" + "=" * 50)
    print("Document Classification")
    print("=" * 50)

    agent = DocumentAgent()

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    await agent.load_document(str(sample_doc))

    # Classify
    print("\nClassifying document...")
    classification = await agent.classify()

    print(f"\nDocument Type: {classification.document_type.value}")
    print(f"Confidence: {classification.confidence:.2f}")
    print(f"Reasoning: {classification.reasoning}")

    if classification.metadata:
        print(f"\nAdditional metadata:")
        for key, value in classification.metadata.items():
            print(f"  {key}: {value}")


async def example_question_answering():
    """Answer questions about document with evidence."""
    print("\n" + "=" * 50)
    print("Question Answering with Evidence")
    print("=" * 50)

    agent = DocumentAgent()

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    await agent.load_document(str(sample_doc))

    # Questions to ask
    questions = [
        "What is this document about?",
        "What are the main findings or conclusions?",
        "Are there any tables or figures? What do they show?",
    ]

    for question in questions:
        print(f"\nQ: {question}")
        print("-" * 40)

        answer, evidence = await agent.answer_question(question)

        print(f"A: {answer}")

        if evidence:
            print(f"\nEvidence:")
            for ev in evidence[:2]:
                print(f"  - Page {ev.page + 1} ({ev.source_type}): {ev.snippet[:60]}...")


async def example_react_task():
    """Run a complex task with ReAct-style reasoning."""
    print("\n" + "=" * 50)
    print("ReAct-style Task Execution")
    print("=" * 50)

    agent = DocumentAgent()

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    await agent.load_document(str(sample_doc))

    # Complex task
    task = """
    Analyze this document and provide:
    1. A brief summary of the content
    2. The document type and purpose
    3. Any key data points or figures mentioned
    4. Your confidence in the analysis
    """

    print(f"\nTask: {task}")
    print("-" * 40)

    # Run with trace
    result, trace = await agent.run(task)

    print(f"\nResult:\n{result}")

    print(f"\n--- Agent Trace ---")
    print(f"Steps: {len(trace.steps)}")
    print(f"Tools used: {trace.tools_used}")
    print(f"Total time: {trace.total_time:.2f}s")

    # Show thinking process
    print(f"\nReasoning trace:")
    for i, step in enumerate(trace.steps[:5], 1):
        print(f"\n[Step {i}] {step.action}")
        if step.thought:
            print(f"  Thought: {step.thought[:100]}...")
        if step.observation:
            print(f"  Observation: {step.observation[:100]}...")


async def main():
    """Run all examples."""
    await example_basic_agent()
    await example_field_extraction()
    await example_classification()
    await example_question_answering()
    await example_react_task()


if __name__ == "__main__":
    asyncio.run(main())
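Each example above is an independent coroutine, so a single one can be exercised on its own; a minimal sketch (illustrative only, not part of the commit; assumes the repository root is on sys.path so that examples.document_agent is importable):

# Hypothetical usage sketch -- not committed code.
import asyncio
from examples.document_agent import example_classification  # assumed import path

asyncio.run(example_classification())  # runs just the classification demo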
examples/document_intelligence_demo.py
ADDED
@@ -0,0 +1,314 @@
#!/usr/bin/env python3
"""
Document Intelligence Demo

Demonstrates the capabilities of the SPARKNET document_intelligence subsystem:
- Document parsing with OCR and layout detection
- Schema-driven field extraction
- Visual grounding with evidence
- Question answering
- Document classification
"""

import asyncio
import json
from pathlib import Path

# Add project root to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))


def demo_parse_document(doc_path: str):
    """Demo: Parse a document into semantic chunks."""
    print("\n" + "=" * 60)
    print("1. DOCUMENT PARSING")
    print("=" * 60)

    from src.document_intelligence import (
        DocumentParser,
        ParserConfig,
    )

    # Configure parser
    config = ParserConfig(
        render_dpi=200,
        max_pages=5,  # Limit for demo
        include_markdown=True,
    )

    parser = DocumentParser(config=config)

    print(f"\nParsing: {doc_path}")
    result = parser.parse(doc_path)

    print(f"\nDocument ID: {result.doc_id}")
    print(f"Filename: {result.filename}")
    print(f"Pages: {result.num_pages}")
    print(f"Chunks: {len(result.chunks)}")
    print(f"Processing time: {result.processing_time_ms:.0f}ms")

    # Show chunk summary by type
    print("\nChunk types:")
    by_type = {}
    for chunk in result.chunks:
        t = chunk.chunk_type.value
        by_type[t] = by_type.get(t, 0) + 1

    for t, count in sorted(by_type.items()):
        print(f"  - {t}: {count}")

    # Show first few chunks
    print("\nFirst 3 chunks:")
    for i, chunk in enumerate(result.chunks[:3]):
        print(f"\n  [{i+1}] Type: {chunk.chunk_type.value}, Page: {chunk.page}")
        print(f"      ID: {chunk.chunk_id}")
        print(f"      Text: {chunk.text[:100]}...")
        print(f"      BBox: {chunk.bbox.xyxy}")
        print(f"      Confidence: {chunk.confidence:.2f}")

    return result


def demo_extract_fields(parse_result):
    """Demo: Extract fields using a schema."""
    print("\n" + "=" * 60)
    print("2. SCHEMA-DRIVEN EXTRACTION")
    print("=" * 60)

    from src.document_intelligence import (
        FieldExtractor,
        ExtractionSchema,
        FieldType,
        ExtractionValidator,
    )

    # Create a custom schema
    schema = ExtractionSchema(
        name="DocumentInfo",
        description="Basic document information",
    )

    schema.add_string_field("title", "Document title or heading", required=True)
    schema.add_string_field("date", "Document date", required=False)
    schema.add_string_field("author", "Author or organization name", required=False)
    schema.add_string_field("reference_number", "Reference or ID number", required=False)

    print(f"\nExtraction schema: {schema.name}")
    print("Fields:")
    for field in schema.fields:
        req = "required" if field.required else "optional"
        print(f"  - {field.name} ({field.field_type.value}, {req})")

    # Extract fields
    extractor = FieldExtractor()
    result = extractor.extract(parse_result, schema)

    print("\nExtracted data:")
    for key, value in result.data.items():
        status = " [ABSTAINED]" if key in result.abstained_fields else ""
        print(f"  {key}: {value}{status}")

    print(f"\nOverall confidence: {result.overall_confidence:.2f}")

    # Show evidence
    if result.evidence:
        print("\nEvidence:")
        for ev in result.evidence[:3]:
            print(f"  - Page {ev.page}, Chunk {ev.chunk_id[:12]}...")
            print(f"    Snippet: {ev.snippet[:80]}...")

    # Validate
    validator = ExtractionValidator()
    validation = validator.validate(result, schema)

    print(f"\nValidation: {'PASSED' if validation.is_valid else 'FAILED'}")
    if validation.issues:
        print("Issues:")
        for issue in validation.issues[:3]:
            print(f"  - [{issue.severity}] {issue.field_name}: {issue.message}")

    return result


def demo_search_and_qa(parse_result):
    """Demo: Search and question answering."""
    print("\n" + "=" * 60)
    print("3. SEARCH AND Q&A")
    print("=" * 60)

    from src.document_intelligence.tools import get_tool

    # Search demo
    print("\nSearching for 'document'...")
    search_tool = get_tool("search_chunks")
    search_result = search_tool.execute(
        parse_result=parse_result,
        query="document",
        top_k=5,
    )

    if search_result.success:
        matches = search_result.data.get("results", [])
        print(f"Found {len(matches)} matches:")
        for i, match in enumerate(matches[:3], 1):
            print(f"  {i}. Page {match['page']}, Type: {match['type']}")
            print(f"     Score: {match['score']:.2f}")
            print(f"     Text: {match['text'][:80]}...")

    # Q&A demo
    print("\nAsking: 'What is this document about?'")
    qa_tool = get_tool("answer_question")
    qa_result = qa_tool.execute(
        parse_result=parse_result,
        question="What is this document about?",
    )

    if qa_result.success:
        print(f"Answer: {qa_result.data.get('answer', 'No answer')}")
        print(f"Confidence: {qa_result.data.get('confidence', 0):.2f}")


def demo_grounding(parse_result, doc_path: str):
    """Demo: Visual grounding with crops."""
    print("\n" + "=" * 60)
    print("4. VISUAL GROUNDING")
    print("=" * 60)

    from src.document_intelligence import (
        load_document,
        RenderOptions,
    )
    from src.document_intelligence.grounding import (
        EvidenceBuilder,
        crop_region,
        create_annotated_image,
    )

    # Load page image
    loader, renderer = load_document(doc_path)
    page_image = renderer.render_page(1, RenderOptions(dpi=200))
    loader.close()

    print(f"\nPage 1 image size: {page_image.shape}")

    # Get chunks from page 1
    page_chunks = [c for c in parse_result.chunks if c.page == 1]
    print(f"Page 1 chunks: {len(page_chunks)}")

    # Create evidence for first chunk
    if page_chunks:
        chunk = page_chunks[0]
        evidence_builder = EvidenceBuilder()

        evidence = evidence_builder.create_evidence(
            chunk=chunk,
            value=chunk.text[:50],
            field_name="example_field",
        )

        print(f"\nEvidence created:")
        print(f"  Chunk ID: {evidence.chunk_id}")
        print(f"  Page: {evidence.page}")
        print(f"  BBox: {evidence.bbox.xyxy}")
        print(f"  Snippet: {evidence.snippet[:80]}...")

        # Crop region
        crop = crop_region(page_image, chunk.bbox)
        print(f"  Crop size: {crop.shape}")

    # Create annotated image (preview)
    print("\nAnnotated image would include bounding boxes for all chunks.")
    print("Use the CLI 'sparknet docint visualize' command to generate.")


def demo_classification(parse_result):
    """Demo: Document classification."""
    print("\n" + "=" * 60)
    print("5. DOCUMENT CLASSIFICATION")
    print("=" * 60)

    from src.document_intelligence.chunks import DocumentType

    # Simple keyword-based classification
    first_page = [c for c in parse_result.chunks if c.page == 1][:5]
    content = " ".join(c.text for c in first_page).lower()

    type_keywords = {
        "invoice": ["invoice", "bill", "payment due", "amount due"],
        "contract": ["agreement", "contract", "party", "whereas"],
        "receipt": ["receipt", "paid", "transaction"],
        "patent": ["patent", "claims", "invention"],
        "report": ["report", "findings", "summary"],
    }

    detected_type = "other"
    confidence = 0.3

    for doc_type, keywords in type_keywords.items():
        matches = sum(1 for k in keywords if k in content)
        if matches >= 2:
            detected_type = doc_type
            confidence = min(0.95, 0.5 + matches * 0.15)
            break

    print(f"\nDetected type: {detected_type}")
    print(f"Confidence: {confidence:.2f}")


def main():
    """Run all demos."""
    print("=" * 60)
    print("SPARKNET Document Intelligence Demo")
    print("=" * 60)

    # Check for sample document
    sample_paths = [
        Path("Dataset/Patent_1.pdf"),
        Path("data/sample.pdf"),
        Path("tests/fixtures/sample.pdf"),
    ]

    doc_path = None
    for path in sample_paths:
        if path.exists():
            doc_path = str(path)
            break

    if not doc_path:
        print("\nNo sample document found.")
        print("Please provide a PDF file path as argument.")
        print("\nUsage: python document_intelligence_demo.py [path/to/document.pdf]")

        if len(sys.argv) > 1:
            doc_path = sys.argv[1]
        else:
            return

    print(f"\nUsing document: {doc_path}")

    try:
        # Run demos
        parse_result = demo_parse_document(doc_path)
        demo_extract_fields(parse_result)
        demo_search_and_qa(parse_result)
        demo_grounding(parse_result, doc_path)
        demo_classification(parse_result)

        print("\n" + "=" * 60)
        print("Demo complete!")
        print("=" * 60)

    except ImportError as e:
        print(f"\nImport error: {e}")
        print("Make sure all dependencies are installed:")
        print("  pip install pymupdf pillow numpy pydantic")

    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
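The keyword classifier in demo_classification is self-contained enough to lift out. A minimal standalone sketch (illustrative only, not committed code; it mirrors the demo's two-hit threshold and its 0.5 + 0.15-per-match confidence, capped at 0.95, with a trimmed keyword table):

# Hypothetical standalone version of the demo's keyword classifier.
def classify_text(content: str) -> tuple:
    type_keywords = {
        "invoice": ["invoice", "bill", "payment due", "amount due"],
        "report": ["report", "findings", "summary"],
    }
    content = content.lower()
    for doc_type, keywords in type_keywords.items():
        matches = sum(1 for k in keywords if k in content)
        if matches >= 2:
            # at least two keyword hits required; each hit adds 0.15 over a 0.5 base
            return doc_type, min(0.95, 0.5 + matches * 0.15)
    return "other", 0.3

print(classify_text("Quarterly report: summary of findings"))  # ('report', 0.95)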
examples/document_processing.py
ADDED
@@ -0,0 +1,133 @@
"""
Example: Document Processing Pipeline

Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
"""

import asyncio
from pathlib import Path
from loguru import logger

# Import document processing components
from src.document.pipeline import (
    PipelineConfig,
    DocumentProcessor,
    process_document,
)
from src.document.ocr import OCRConfig


def example_basic_processing():
    """Basic document processing example."""
    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Configure pipeline
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,  # Limit for demo
    )

    # Create processor
    processor = DocumentProcessor(config)

    # Process a sample document
    # NOTE: Replace with actual document path
    sample_doc = Path("./data/sample.pdf")

    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf to run this example")
        return

    # Process
    result = processor.process(sample_doc)

    # Display results
    print(f"\nDocument: {result.metadata.filename}")
    print(f"Pages: {result.metadata.num_pages}")
    print(f"Chunks: {result.metadata.total_chunks}")
    print(f"Characters: {result.metadata.total_characters}")
    print(f"OCR Confidence: {result.metadata.ocr_confidence_avg:.2%}")

    print("\n--- Sample Chunks ---")
    for i, chunk in enumerate(result.chunks[:3]):
        print(f"\n[Chunk {i+1}] Type: {chunk.chunk_type.value}, Page: {chunk.page+1}")
        print(f"Text: {chunk.text[:200]}...")
        print(f"BBox: ({chunk.bbox.x_min:.0f}, {chunk.bbox.y_min:.0f}) -> ({chunk.bbox.x_max:.0f}, {chunk.bbox.y_max:.0f})")


def example_with_layout():
    """Document processing with layout analysis."""
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    from src.document.layout import LayoutConfig, LayoutType

    # Configure with layout detection
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )

    processor = DocumentProcessor(config)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = processor.process(sample_doc)

    # Count layout types
    layout_counts = {}
    for region in result.layout_regions:
        layout_type = region.layout_type.value
        layout_counts[layout_type] = layout_counts.get(layout_type, 0) + 1

    print(f"\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f"  {layout_type}: {count} regions")

    # Show tables if found
    tables = [r for r in result.layout_regions if r.layout_type == LayoutType.TABLE]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        for i, table in enumerate(tables[:2]):
            print(f"\nTable {i+1}: Page {table.page+1}")
            print(f"  Position: ({table.bbox.x_min:.0f}, {table.bbox.y_min:.0f})")
            print(f"  Size: {table.bbox.width:.0f} x {table.bbox.height:.0f}")


def example_convenience_function():
    """Using the convenience function."""
    print("\n" + "=" * 50)
    print("Using Convenience Function")
    print("=" * 50)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    # Simple one-liner
    result = process_document(sample_doc)

    print(f"Processed: {result.metadata.filename}")
    print(f"Chunks: {len(result.chunks)}")
    print(f"\nFull text preview:")
    print(result.full_text[:500] + "..." if len(result.full_text) > 500 else result.full_text)


if __name__ == "__main__":
    example_basic_processing()
    example_with_layout()
    example_convenience_function()
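One thing these examples print but never aggregate is the per-page chunk breakdown; a small helper sketch (illustrative only, not committed code; it assumes only the chunk fields already used above, .page and .chunk_type.value):

# Hypothetical helper -- not committed code.
from collections import defaultdict

def summarize_by_page(chunks):
    """Count chunk types per page for a processed document."""
    by_page = defaultdict(list)
    for chunk in chunks:
        by_page[chunk.page].append(chunk.chunk_type.value)
    for page, types in sorted(by_page.items()):
        print(f"page {page + 1}: {len(types)} chunks ({', '.join(sorted(set(types)))})")

# e.g. summarize_by_page(process_document("./data/sample.pdf").chunks)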
examples/document_rag_end_to_end.py
ADDED
@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Document Intelligence RAG End-to-End Example

Demonstrates the complete RAG workflow:
1. Parse documents into semantic chunks
2. Index chunks into vector store
3. Semantic retrieval with filters
4. Grounded question answering with evidence
5. Evidence visualization

Requirements:
- ChromaDB: pip install chromadb
- Ollama running with nomic-embed-text model: ollama pull nomic-embed-text
- PyMuPDF: pip install pymupdf
"""

import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))


def check_dependencies():
    """Check that required dependencies are available."""
    missing = []

    try:
        import chromadb
    except ImportError:
        missing.append("chromadb")

    try:
        import fitz  # PyMuPDF
    except ImportError:
        missing.append("pymupdf")

    if missing:
        print("Missing dependencies:")
        for dep in missing:
            print(f"  - {dep}")
        print("\nInstall with: pip install " + " ".join(missing))
        return False

    # Check Ollama
    try:
        import requests
        response = requests.get("http://localhost:11434/api/tags", timeout=2)
        if response.status_code != 200:
            print("Warning: Ollama server not responding")
            print("Start Ollama with: ollama serve")
            print("Then pull the embedding model: ollama pull nomic-embed-text")
    except Exception:
        print("Warning: Could not connect to Ollama server")
        print("The example will still work but with mock embeddings")

    return True


def demo_parse_and_index(doc_paths: list):
    """
    Demo: Parse documents and index into vector store.

    Args:
        doc_paths: List of document file paths
    """
    print("\n" + "=" * 60)
    print("STEP 1: PARSE AND INDEX DOCUMENTS")
    print("=" * 60)

    from src.document_intelligence import DocumentParser, ParserConfig
    from src.document_intelligence.tools import get_rag_tool

    # Get the index tool
    index_tool = get_rag_tool("index_document")

    results = []
    for doc_path in doc_paths:
        print(f"\nProcessing: {doc_path}")

        # Parse document first (optional - tool can do this)
        config = ParserConfig(render_dpi=200, max_pages=10)
        parser = DocumentParser(config=config)

        try:
            parse_result = parser.parse(doc_path)
            print(f"  Parsed: {len(parse_result.chunks)} chunks, {parse_result.num_pages} pages")

            # Index the parse result
            result = index_tool.execute(parse_result=parse_result)

            if result.success:
                print(f"  Indexed: {result.data['chunks_indexed']} chunks")
                print(f"  Document ID: {result.data['document_id']}")
                results.append({
                    "path": doc_path,
                    "doc_id": result.data['document_id'],
                    "chunks": result.data['chunks_indexed'],
                })
            else:
                print(f"  Error: {result.error}")

        except Exception as e:
            print(f"  Failed: {e}")

    return results


def demo_semantic_retrieval(query: str, document_id: str = None):
    """
    Demo: Semantic retrieval from vector store.

    Args:
        query: Search query
        document_id: Optional document filter
    """
    print("\n" + "=" * 60)
    print("STEP 2: SEMANTIC RETRIEVAL")
    print("=" * 60)

    from src.document_intelligence.tools import get_rag_tool

    retrieve_tool = get_rag_tool("retrieve_chunks")

    print(f"\nQuery: \"{query}\"")
    if document_id:
        print(f"Document filter: {document_id}")

    result = retrieve_tool.execute(
        query=query,
        top_k=5,
        document_id=document_id,
        include_evidence=True,
    )

    if result.success:
        chunks = result.data.get("chunks", [])
        print(f"\nFound {len(chunks)} relevant chunks:\n")

        for i, chunk in enumerate(chunks, 1):
            print(f"{i}. [similarity={chunk['similarity']:.3f}]")
            print(f"   Page {chunk.get('page', '?')}, Type: {chunk.get('chunk_type', 'unknown')}")
            print(f"   Text: {chunk['text'][:150]}...")
            print()

        # Show evidence
        if result.evidence:
            print("Evidence references:")
            for ev in result.evidence[:3]:
                print(f"  - Chunk {ev['chunk_id'][:12]}... Page {ev.get('page', '?')}")

        return chunks
    else:
        print(f"Error: {result.error}")
        return []


def demo_grounded_qa(question: str, document_id: str = None):
    """
    Demo: Grounded question answering with evidence.

    Args:
        question: Question to answer
        document_id: Optional document filter
    """
    print("\n" + "=" * 60)
    print("STEP 3: GROUNDED QUESTION ANSWERING")
    print("=" * 60)

    from src.document_intelligence.tools import get_rag_tool

    qa_tool = get_rag_tool("rag_answer")

    print(f"\nQuestion: \"{question}\"")

    result = qa_tool.execute(
        question=question,
        document_id=document_id,
        top_k=5,
    )

    if result.success:
        data = result.data
        print(f"\nAnswer: {data.get('answer', 'No answer')}")
        print(f"Confidence: {data.get('confidence', 0):.2f}")

        if data.get('abstained'):
            print("Note: System abstained due to low confidence")

        # Show citations if any
        citations = data.get('citations', [])
        if citations:
            print("\nCitations:")
            for cit in citations:
                print(f"  [{cit['index']}] {cit.get('text', '')[:80]}...")

        # Show evidence
        if result.evidence:
            print("\nEvidence locations:")
            for ev in result.evidence:
                print(f"  - Page {ev.get('page', '?')}: {ev.get('snippet', '')[:60]}...")

        return data
    else:
        print(f"Error: {result.error}")
        return None


def demo_filtered_retrieval():
    """
    Demo: Retrieval with various filters.
    """
    print("\n" + "=" * 60)
    print("STEP 4: FILTERED RETRIEVAL")
    print("=" * 60)

    from src.document_intelligence.tools import get_rag_tool

    retrieve_tool = get_rag_tool("retrieve_chunks")

    # Filter by chunk type
    print("\n--- Retrieving only table chunks ---")
    result = retrieve_tool.execute(
        query="data values",
        top_k=3,
        chunk_types=["table"],
    )

    if result.success:
        chunks = result.data.get("chunks", [])
        print(f"Found {len(chunks)} table chunks")
        for chunk in chunks:
            print(f"  - Page {chunk.get('page', '?')}: {chunk['text'][:80]}...")

    # Filter by page range
    print("\n--- Retrieving from pages 1-3 only ---")
    result = retrieve_tool.execute(
        query="content",
        top_k=3,
        page_range=(1, 3),
    )

    if result.success:
        chunks = result.data.get("chunks", [])
        print(f"Found {len(chunks)} chunks from pages 1-3")
        for chunk in chunks:
            print(f"  - Page {chunk.get('page', '?')}: {chunk['text'][:80]}...")


def demo_index_stats():
    """
    Demo: Show index statistics.
    """
    print("\n" + "=" * 60)
    print("INDEX STATISTICS")
    print("=" * 60)

    from src.document_intelligence.tools import get_rag_tool

    stats_tool = get_rag_tool("get_index_stats")
    result = stats_tool.execute()

    if result.success:
        data = result.data
        print(f"\nTotal chunks indexed: {data.get('total_chunks', 0)}")
        print(f"Embedding model: {data.get('embedding_model', 'unknown')}")
        print(f"Embedding dimension: {data.get('embedding_dimension', 'unknown')}")
    else:
        print(f"Error: {result.error}")


def main():
    """Run the complete RAG demo."""
    print("=" * 60)
    print("SPARKNET Document Intelligence RAG Demo")
    print("=" * 60)

    # Check dependencies
    if not check_dependencies():
        print("\nPlease install missing dependencies and try again.")
        return

    # Find sample documents
    sample_paths = [
        Path("Dataset/Patent_1.pdf"),
        Path("data/sample.pdf"),
        Path("tests/fixtures/sample.pdf"),
    ]

    doc_paths = []
    for path in sample_paths:
        if path.exists():
            doc_paths.append(str(path))
            break

    if not doc_paths:
        print("\nNo sample documents found.")
        print("Please provide a PDF file path as argument.")
        print("\nUsage: python document_rag_end_to_end.py [path/to/document.pdf]")

        if len(sys.argv) > 1:
            doc_paths = sys.argv[1:]
        else:
            return

    print(f"\nUsing documents: {doc_paths}")

    try:
        # Step 1: Parse and index
        indexed_docs = demo_parse_and_index(doc_paths)

        if not indexed_docs:
            print("\nNo documents were indexed. Exiting.")
            return

        # Get first document ID for filtering
        first_doc_id = indexed_docs[0]["doc_id"]

        # Step 2: Semantic retrieval
        demo_semantic_retrieval(
            query="main topic content",
            document_id=first_doc_id,
        )

        # Step 3: Grounded Q&A
        demo_grounded_qa(
            question="What is this document about?",
            document_id=first_doc_id,
        )

        # Step 4: Filtered retrieval
        demo_filtered_retrieval()

        # Show stats
        demo_index_stats()

        print("\n" + "=" * 60)
        print("Demo complete!")
        print("=" * 60)

        print("\nNext steps:")
        print("  1. Try the CLI: sparknet docint index your_document.pdf")
        print("  2. Query the index: sparknet docint retrieve 'your query'")
        print("  3. Ask questions: sparknet docint ask doc.pdf 'question' --use-rag")

    except ImportError as e:
        print(f"\nImport error: {e}")
        print("Make sure all dependencies are installed:")
        print("  pip install pymupdf pillow numpy pydantic chromadb")

    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
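The inline Ollama probe in check_dependencies can be reused elsewhere; a factored-out sketch (illustrative only, not committed code; same /api/tags endpoint and 2-second timeout as the check above):

# Hypothetical helper -- not committed code.
import requests

def ollama_available(base_url: str = "http://localhost:11434") -> bool:
    """Return True if the local Ollama server answers /api/tags."""
    try:
        return requests.get(f"{base_url}/api/tags", timeout=2).status_code == 200
    except requests.RequestException:
        return False

if not ollama_available():
    print("Warning: Ollama unreachable; embeddings will be mocked")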
examples/rag_pipeline.py
ADDED
@@ -0,0 +1,192 @@
"""
Example: RAG Pipeline

Demonstrates:
1. Indexing documents into vector store
2. Semantic search
3. Question answering with citations
"""

from pathlib import Path

# Import RAG components
from src.rag import (
    VectorStoreConfig,
    EmbeddingConfig,
    RetrieverConfig,
    GeneratorConfig,
    get_document_indexer,
    get_document_retriever,
    get_grounded_generator,
)


def example_indexing():
    """Index documents into vector store."""
    print("=" * 50)
    print("Document Indexing")
    print("=" * 50)

    # Get indexer
    indexer = get_document_indexer()

    # Index a document
    sample_doc = Path("./data/sample.pdf")

    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf")
        return False

    # Index
    result = indexer.index_document(sample_doc)

    if result.success:
        print(f"\nIndexed: {result.source_path}")
        print(f"  Document ID: {result.document_id}")
        print(f"  Chunks indexed: {result.num_chunks_indexed}")
        print(f"  Chunks skipped: {result.num_chunks_skipped}")
    else:
        print(f"Indexing failed: {result.error}")
        return False

    # Show stats
    stats = indexer.get_index_stats()
    print("\nIndex Stats:")
    print(f"  Total chunks: {stats['total_chunks']}")
    print(f"  Documents: {stats['num_documents']}")
    print(f"  Embedding model: {stats['embedding_model']}")

    return True


def example_search():
    """Search indexed documents."""
    print("\n" + "=" * 50)
    print("Semantic Search")
    print("=" * 50)

    # Get retriever
    retriever = get_document_retriever()

    # Search queries
    queries = [
        "What is the main topic?",
        "key findings",
        "conclusions and recommendations",
    ]

    for query in queries:
        print(f"\nQuery: '{query}'")

        chunks = retriever.retrieve(query, top_k=3)

        if not chunks:
            print("  No results found")
            continue

        for i, chunk in enumerate(chunks, 1):
            print(f"\n  [{i}] Similarity: {chunk.similarity:.3f}")
            if chunk.page is not None:
                print(f"      Page: {chunk.page + 1}")
            print(f"      Text: {chunk.text[:150]}...")


def example_question_answering():
    """Answer questions using RAG."""
    print("\n" + "=" * 50)
    print("Question Answering with Citations")
    print("=" * 50)

    # Get generator
    generator = get_grounded_generator()

    # Questions
    questions = [
        "What is the main purpose of this document?",
        "What are the key findings?",
        "What recommendations are made?",
    ]

    for question in questions:
        print(f"\nQuestion: {question}")
        print("-" * 40)

        result = generator.answer_question(question, top_k=5)

        print(f"\nAnswer: {result.answer}")
        print(f"\nConfidence: {result.confidence:.2f}")

        if result.abstained:
            print(f"Note: {result.abstain_reason}")

        if result.citations:
            print(f"\nCitations ({len(result.citations)}):")
            for citation in result.citations:
                page = f"Page {citation.page + 1}" if citation.page is not None else ""
                print(f"  [{citation.index}] {page}: {citation.text_snippet[:60]}...")


def example_filtered_search():
    """Search with metadata filters."""
    print("\n" + "=" * 50)
    print("Filtered Search")
    print("=" * 50)

    retriever = get_document_retriever()

    # Search only in tables
    print("\nSearching for tables only...")
    table_chunks = retriever.retrieve_tables("data values", top_k=3)

    if table_chunks:
        print(f"Found {len(table_chunks)} table chunks:")
        for chunk in table_chunks:
            print(f"  - Page {chunk.page + 1}: {chunk.text[:100]}...")
    else:
        print("No table chunks found")

    # Search a specific page range (0-indexed: pages 1-3)
    print("\nSearching pages 1-3...")
    page_chunks = retriever.retrieve_by_page(
        "introduction",
        page_range=(0, 2),
        top_k=3,
    )

    if page_chunks:
        print(f"Found {len(page_chunks)} chunks in pages 1-3:")
        for chunk in page_chunks:
            print(f"  - Page {chunk.page + 1}: {chunk.text[:100]}...")
    else:
        print("No chunks found in specified pages")


def example_full_pipeline():
    """Complete RAG pipeline demo."""
    print("\n" + "=" * 50)
    print("Full RAG Pipeline Demo")
    print("=" * 50)

    # Step 1: Index
    print("\n[Step 1] Indexing documents...")
    if not example_indexing():
        return

    # Step 2: Search
    print("\n[Step 2] Testing search...")
    example_search()

    # Step 3: Q&A
    print("\n[Step 3] Question answering...")
    example_question_answering()

    print("\n" + "=" * 50)
    print("Pipeline demo complete!")
    print("=" * 50)


if __name__ == "__main__":
    # Run full pipeline
    example_full_pipeline()
nginx/nginx.conf
ADDED
@@ -0,0 +1,254 @@
# SPARKNET Production Nginx Configuration
# Reverse proxy for API and Demo services

user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log warn;
pid /var/run/nginx.pid;

events {
    worker_connections 1024;
    use epoll;
    multi_accept on;
}

http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    # Logging format
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for" '
                    'rt=$request_time uct="$upstream_connect_time" '
                    'uht="$upstream_header_time" urt="$upstream_response_time"';

    access_log /var/log/nginx/access.log main;

    # Performance optimizations
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    types_hash_max_size 2048;

    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_proxied any;
    gzip_comp_level 6;
    gzip_types text/plain text/css text/xml application/json application/javascript
               application/xml application/xml+rss text/javascript application/x-javascript;
    gzip_min_length 1000;

    # Rate limiting zones
    limit_req_zone $binary_remote_addr zone=api_limit:10m rate=30r/s;
    limit_req_zone $binary_remote_addr zone=upload_limit:10m rate=5r/s;
    limit_conn_zone $binary_remote_addr zone=conn_limit:10m;

    # Security headers map
    map $sent_http_content_type $security_headers {
        default "always";
    }

    # Upstream servers
    upstream sparknet_api {
        server sparknet-api:8000;
        keepalive 32;
    }

    upstream sparknet_demo {
        server sparknet-demo:4000;
        keepalive 32;
    }

    # HTTP redirect to HTTPS (uncomment for production with SSL)
    # server {
    #     listen 80;
    #     listen [::]:80;
    #     server_name _;
    #     return 301 https://$host$request_uri;
    # }

    # Main HTTP server (development/internal)
    server {
        listen 80;
        listen [::]:80;
        server_name _;

        # Connection limits
        limit_conn conn_limit 20;

        # Security headers
        add_header X-Frame-Options "SAMEORIGIN" always;
        add_header X-Content-Type-Options "nosniff" always;
        add_header X-XSS-Protection "1; mode=block" always;
        add_header Referrer-Policy "strict-origin-when-cross-origin" always;

        # Client body size for file uploads
        client_max_body_size 100M;
        client_body_buffer_size 128k;
        client_body_timeout 300s;

        # Proxy timeouts
        proxy_connect_timeout 60s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;

        # Health check endpoint (no rate limiting)
        location /api/health {
            proxy_pass http://sparknet_api;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }

        # API endpoints
        location /api/ {
            # Rate limiting
            limit_req zone=api_limit burst=50 nodelay;

            proxy_pass http://sparknet_api;
            proxy_http_version 1.1;

            # Headers
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header Connection "";

            # CORS headers (if not handled by FastAPI)
            # add_header Access-Control-Allow-Origin "*" always;
            # add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS" always;
            # add_header Access-Control-Allow-Headers "Authorization, Content-Type" always;

            # Handle OPTIONS for CORS preflight
            if ($request_method = 'OPTIONS') {
                add_header Access-Control-Allow-Origin "*";
                add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS";
                add_header Access-Control-Allow-Headers "Authorization, Content-Type";
                add_header Access-Control-Max-Age 3600;
                add_header Content-Length 0;
                add_header Content-Type text/plain;
                return 204;
            }
        }

        # Document upload endpoint (lower rate limit)
        location /api/documents/upload {
            limit_req zone=upload_limit burst=10 nodelay;

            proxy_pass http://sparknet_api;
            proxy_http_version 1.1;

            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;

            # Increased timeout for large uploads
            proxy_connect_timeout 120s;
            proxy_send_timeout 600s;
            proxy_read_timeout 600s;
        }

        # RAG streaming endpoint (SSE support)
        location /api/rag/query/stream {
            proxy_pass http://sparknet_api;
            proxy_http_version 1.1;

            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header Connection "";

            # SSE-specific settings
            proxy_buffering off;
            proxy_cache off;
            chunked_transfer_encoding off;
            proxy_read_timeout 3600s;
        }

        # Streamlit Demo (with WebSocket support)
        location / {
            proxy_pass http://sparknet_demo;
            proxy_http_version 1.1;

            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;

            # WebSocket support for Streamlit
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection "upgrade";

            # Streamlit specific
            proxy_read_timeout 86400;
        }

        # Streamlit WebSocket endpoint
        location /_stcore/stream {
            proxy_pass http://sparknet_demo;
            proxy_http_version 1.1;

            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

            # WebSocket
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection "upgrade";

            proxy_read_timeout 86400;
            proxy_buffering off;
        }

        # Streamlit static files
        location /static {
            proxy_pass http://sparknet_demo;
            proxy_http_version 1.1;
            proxy_set_header Host $host;

            # Cache static assets
            expires 1d;
            add_header Cache-Control "public, immutable";
        }

        # Error pages
        error_page 502 503 504 /50x.html;
        location = /50x.html {
            root /usr/share/nginx/html;
            internal;
        }
    }

    # HTTPS server (uncomment and configure for production)
    # server {
    #     listen 443 ssl http2;
    #     listen [::]:443 ssl http2;
    #     server_name sparknet.example.com;
    #
    #     # SSL configuration
    #     ssl_certificate /etc/nginx/ssl/fullchain.pem;
    #     ssl_certificate_key /etc/nginx/ssl/privkey.pem;
    #     ssl_session_timeout 1d;
    #     ssl_session_cache shared:SSL:50m;
    #     ssl_session_tickets off;
    #
    #     # Modern SSL configuration
    #     ssl_protocols TLSv1.2 TLSv1.3;
    #     ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384;
    #     ssl_prefer_server_ciphers off;
    #
    #     # HSTS
    #     add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
    #
    #     # Include same location blocks as HTTP server above
    #     # ...
    # }
}
run_demo.py
ADDED
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""
SPARKNET Demo Launcher

Cross-platform launcher for the Streamlit demo.

Usage:
    python run_demo.py [--port PORT]
"""

import subprocess
import sys
import os
from pathlib import Path


def check_dependencies():
    """Check and install required dependencies."""
    print("📦 Checking dependencies...")

    try:
        import streamlit
        print(f"  ✅ Streamlit {streamlit.__version__}")
    except ImportError:
        print("  📥 Installing Streamlit...")
        subprocess.run([sys.executable, "-m", "pip", "install", "streamlit"], check=True)

    try:
        import pandas
        print(f"  ✅ Pandas {pandas.__version__}")
    except ImportError:
        print("  📥 Installing Pandas...")
        subprocess.run([sys.executable, "-m", "pip", "install", "pandas"], check=True)

    try:
        import httpx
        print(f"  ✅ httpx {httpx.__version__}")
    except ImportError:
        print("  📥 Installing httpx...")
        subprocess.run([sys.executable, "-m", "pip", "install", "httpx"], check=True)


def check_ollama():
    """Check if Ollama is running."""
    print("\n🔍 Checking Ollama status...")

    try:
        import httpx
        with httpx.Client(timeout=2.0) as client:
            response = client.get("http://localhost:11434/api/tags")
            if response.status_code == 200:
                data = response.json()
                models = len(data.get("models", []))
                print(f"  ✅ Ollama is running ({models} models)")
                return True
    except Exception:
        pass

    print("  ⚠️  Ollama not running (demo will use simulated responses)")
    print("     Start with: ollama serve")
    return False


def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="SPARKNET Demo Launcher")
    parser.add_argument("--port", type=int, default=8501, help="Port to run on")
    args = parser.parse_args()

    print("=" * 50)
    print("🔥 SPARKNET Demo Launcher")
    print("=" * 50)
    print()

    # Get project root
    project_root = Path(__file__).parent
    demo_app = project_root / "demo" / "app.py"

    if not demo_app.exists():
        print(f"❌ Demo app not found: {demo_app}")
        sys.exit(1)

    # Check dependencies
    check_dependencies()

    # Check Ollama
    check_ollama()

    # Launch
    print()
    print(f"🚀 Launching SPARKNET Demo on port {args.port}...")
    print(f"   URL: http://localhost:{args.port}")
    print()
    print("Press Ctrl+C to stop")
    print("=" * 50)
    print()

    # Run Streamlit
    os.chdir(project_root)
    subprocess.run([
        sys.executable, "-m", "streamlit", "run",
        str(demo_app),
        "--server.port", str(args.port),
        "--server.headless", "true",
    ])


if __name__ == "__main__":
    main()
run_demo.sh
ADDED
@@ -0,0 +1,52 @@
#!/bin/bash
# SPARKNET Demo Launcher
# Usage: ./run_demo.sh [port]

set -e

PORT=${1:-8501}
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

echo "🔥 SPARKNET Demo Launcher"
echo "========================="
echo ""

# Check Python
if ! command -v python3 &> /dev/null; then
    echo "❌ Python3 not found. Please install Python 3.10+"
    exit 1
fi

# Check Streamlit
if ! python3 -c "import streamlit" &> /dev/null; then
    echo "📦 Installing Streamlit..."
    pip install streamlit
fi

# Check demo dependencies
echo "📦 Checking dependencies..."
pip install -q -r "$SCRIPT_DIR/demo/requirements.txt" 2>/dev/null || true

# Check Ollama status
echo ""
echo "🔍 Checking Ollama status..."
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
    echo "✅ Ollama is running"
    MODELS=$(curl -s http://localhost:11434/api/tags | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d.get('models', [])))" 2>/dev/null || echo "?")
    echo "   Models available: $MODELS"
else
    echo "⚠️  Ollama not running (demo will use simulated responses)"
    echo "   Start with: ollama serve"
fi

# Launch demo
echo ""
echo "🚀 Launching SPARKNET Demo on port $PORT..."
echo "   URL: http://localhost:$PORT"
echo ""
echo "Press Ctrl+C to stop"
echo "========================="
echo ""

cd "$SCRIPT_DIR"
streamlit run demo/app.py --server.port "$PORT" --server.headless true
scripts to get ideas from/ides.txt
ADDED
@@ -0,0 +1,151 @@
This introduces the fundamentals of document processing and how they connect to agentic AI workflows.

The core problem is that modern organizations are overwhelmed with digital documents such as PDFs, scans, receipts, contracts, and reports. These documents are designed for human reading, not machine processing, which makes searching, analyzing, and automating information extremely difficult. Valuable data is often trapped inside unstructured formats, requiring manual reading and re-entry, which does not scale.

The goal of document processing is to convert unstructured documents into structured, machine-readable data. Common output formats include JSON and Markdown or HTML. JSON is well suited for machines, APIs, databases, and analytics pipelines because it is hierarchical and easy to process programmatically. Markdown or HTML preserves layout elements such as headings, tables, and lists, making it ideal for human readers and large language models, especially in chat interfaces and retrieval-augmented generation systems.

When documents are scanned or photographed, the system only sees image pixels. Optical Character Recognition (OCR) is required to convert those pixels into text. OCR typically involves two main steps: image preprocessing, such as deskewing, denoising, and contrast adjustment, followed by text recognition, where visual patterns are matched to characters. The output is editable or searchable text.

However, OCR has important limitations. It does not understand document structure, meaning, or relationships between elements. It often produces a flat block of text and struggles with poor image quality, complex layouts, multi-column text, nested tables, handwriting, stamps, and stylized fonts. These weaknesses can lead to cascading errors during parsing and extraction. OCR provides perception, but not comprehension.

A key distinction introduced in this lesson is that processing is not the same as understanding. OCR can read characters but cannot determine what is a header, a value, a total amount, or a table entry. To move from raw text to meaningful structured data, an additional cognitive layer is required.

Agentic AI provides this missing layer. In document processing, an agent is an autonomous system that can perceive input, reason about goals, decide which tools to use, and act iteratively until the task is complete. In this context, OCR functions as the eyes, while the agent serves as the brain. Unlike rigid rule-based pipelines, agents can adapt to edge cases and unexpected document variations.

An agentic document system is typically composed of three components. The brain, implemented using a large language model, handles reasoning, planning, and decision-making. The eyes, implemented through OCR, convert visual content into text. The hands are the tools the agent can use, such as APIs, database queries, file operations, and function calls. Together, these components allow the system to answer high-level requests, such as identifying the total amount on an invoice, without hardcoding every step.

The lesson also introduces the ReAct framework, which describes how agents reason step by step. The agent alternates between thinking about what to do next, taking an action by calling a tool, observing the result, and then repeating the process. This loop enables adaptability, error correction, and transparency, since the agent's reasoning and tool usage can be inspected.
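A minimal sketch of that loop in Python; every name here (llm_decide, run_ocr, TOOLS) is a hypothetical stand-in, not something from the lesson:

# Minimal ReAct-style loop: Think -> Act -> Observe, repeated until an answer.
def run_ocr(path: str) -> str:
    """Stand-in perception tool: would return OCR text for the document."""
    return "INVOICE ... SUBTOTAL: $130.00 ... TOTAL: $142.50"

TOOLS = {"ocr": run_ocr}

def llm_decide(task: str, history: list) -> dict:
    """Stand-in for the LLM call that produces the next thought/action."""
    if not history:
        return {"thought": "I need the document text first", "action": "ocr", "input": "invoice.png"}
    return {"thought": "The total is visible in the OCR output", "action": "answer", "input": "$142.50"}

def react_loop(task: str, max_steps: int = 5):
    history = []
    for _ in range(max_steps):
        step = llm_decide(task, history)           # THINK and choose an ACTION
        if step["action"] == "answer":             # terminal action: return the answer
            return step["input"]
        tool = TOOLS[step["action"]]
        step["observation"] = tool(step["input"])  # OBSERVE the tool output
        history.append(step)                       # feed it back into the next thought
    return None

print(react_loop("What is the total amount on this invoice?"))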
The lesson concludes with a practical lab. Learners build a simple document agent that combines OCR, parsing, and agentic reasoning to read documents and extract structured information. The lab follows a step-by-step approach, reinforcing the bottom-up journey from pixels, to text, to structure, and finally to reasoning.
=========================================================================================================================
This walkthrough demonstrates how OCR, rule-based methods, and LLM-based agentic reasoning work together in document processing, and where each approach succeeds or fails.

OCR is first applied to extract raw text from documents, which works well for clean, printed invoices but produces unstructured, noisy text with no understanding of meaning. Simple rule-based approaches such as regular expressions are then used to extract values like tax and total, but they fail easily due to small OCR variations, ambiguous wording, or layout differences. This highlights how brittle traditional pipelines are when faced with real-world data.
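A small illustration of that brittleness (both strings are invented for this example):

import re

clean = "Total: $142.50"
noisy = "TOTAL AMOUNT DUE ... $142,50"  # different label, OCR swapped '.' for ','

pattern = re.compile(r"Total: \$(\d+\.\d{2})")
print(pattern.search(clean))  # matches on the document the rule was written for
print(pattern.search(noisy))  # None: the hardcoded rule silently fails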
An agentic approach is then introduced, combining OCR as perception, an LLM as the reasoning component, and tools within a ReAct-style loop. The agent decides when to call OCR, interprets the extracted text semantically, and outputs structured JSON without relying on hardcoded rules. This allows correct extraction of values such as totals even when multiple similar terms (e.g., subtotal vs. total) appear.

More challenging examples show the limits of OCR and the strengths and weaknesses of LLM reasoning. Tables with complex layouts, handwriting, and low-quality receipts produce chaotic OCR outputs. The agent can often infer intent and recover partially correct information, but errors still occur when OCR inaccuracies distort the underlying data. In some cases, the LLM overcorrects or reasons from incorrect inputs, leading to plausible but wrong conclusions.

The key takeaway is that OCR provides reading but not understanding, regex provides rules without meaning, and LLM-based agents introduce semantic reasoning that significantly improves robustness. However, reliable real-world document understanding still requires multiple components working together, including OCR, layout analysis, vision-language models, agentic workflows, and validation mechanisms.
=========================================================================================================================
OCR has evolved from rule-based, procedural computer vision systems to modern deep learning–based approaches. Early OCR systems, represented by Tesseract, relied heavily on handcrafted pipelines such as line detection, character segmentation, and shape matching. These systems work well for clean, printed, black-and-white text with regular layouts and can run efficiently on CPUs, but they struggle with real-world variability such as complex layouts, curved text, images, or noise.

Around 2015, deep learning fundamentally changed OCR by introducing data-driven, end-to-end models. Modern OCR systems separate the problem into two modular stages: text detection (finding text regions) and text recognition (reading the text within those regions). PaddleOCR is a representative system from this era, using neural networks for both stages, specifically DBNet for detection and transformer-based models for recognition. This approach handles irregular layouts, curved or rotated text, and noisy real-world images far better than traditional methods, especially when accelerated with GPUs.

While both Tesseract and PaddleOCR are open source and support many languages, they are best suited to different use cases. Tesseract is ideal for simple document scanning such as books with clean layouts, whereas PaddleOCR performs better on complex, real-world documents like receipts, signage, and mixed-layout content. Overall, these tools illustrate how OCR has shifted from rigid, rule-based pipelines to flexible, learnable systems that can be integrated into larger document intelligence and agentic workflows.

=========================================================================================================================
A modern OCR pipeline is set up using PaddleOCR along with image and visualization tools. PaddleOCR runs an end-to-end process that includes preprocessing and two deep learning stages: text detection, which finds text regions and returns bounding boxes, and text recognition, which reads the text in each region and outputs the text with confidence scores. Compared to earlier OCR, this pipeline provides localization and improved accuracy on messy inputs such as receipts, which makes downstream reasoning tasks like verifying totals more reliable when combined with an LLM agent.
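A minimal sketch of that pipeline; the constructor flags and the exact shape of the result vary between PaddleOCR releases, so treat the details below as assumptions to check against the installed version:

# pip install paddlepaddle paddleocr
from paddleocr import PaddleOCR

ocr = PaddleOCR(lang="en")       # loads both the detection and recognition models
result = ocr.ocr("receipt.jpg")  # end-to-end: detect text boxes, then read each one

# In the classic 2.x API, result[0] is a list of (polygon_box, (text, confidence))
# pairs; newer releases return a result object instead, so adapt accordingly.
for box, (text, conf) in result[0]:
    print(f"{conf:.2f}  {text!r}  box={box}")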
The same approach is tested on harder examples. On a complex table, PaddleOCR still makes errors such as misreading scientific notation, but an LLM agent can sometimes correct these issues using contextual reasoning and domain expectations. On handwriting, recognition improves over older OCR in some places, but key fields like names and several answers can still be misread, and the agent can only be as accurate as the OCR signal.

New document types expose major weaknesses related to layout and reading order. For report pages containing charts, the OCR may extract axis numbers without recognizing the full chart as a unit, losing context. For multi-column articles, the text can be read across columns incorrectly, producing garbled output. To address this, a layout detection model is added to segment the document into labeled regions such as title, abstract, text blocks, table, chart, footer, and numbers. This improves structure and preserves reading order by keeping text within coherent regions, although errors remain, such as splitting a table into multiple parts or failing to separate headers from table content in bank statements.

Overall, PaddleOCR significantly improves real-world OCR accuracy and adds bounding-box structure, and layout detection helps with region-level organization and reading order. However, these tools still fall short of full semantic document understanding, especially for complex layouts, tables, and small but important text.

=========================================================================================================================
Documents often have complex layouts, so extracting text and sending it directly to a language model can destroy structure and mix content such as columns, tables, captions, and figures. Layout detection addresses this by identifying and labeling page regions like paragraphs, tables, figures, headers, footers, and captions so downstream systems keep structure and target the right areas.

Reading order is a separate problem: it determines the sequence a human would read content, especially in multi-column pages and documents with floating elements. Older heuristic methods (top-to-bottom, left-to-right rules) fail on real layouts. LayoutReader replaces rules with a learned model trained on a large reading-order dataset, using OCR bounding boxes and visual-spatial features to reconstruct a human-like token sequence.

Even with correct reading order, OCR-only pipelines remain limited because OCR captures text but misses visual context such as charts, diagrams, and spatial relationships. Forms require linking labels to values and may need key-value models like LayoutLM and vision for elements like checkboxes. Tables need structure-preserving models such as Table Transformer, TableFormer, or TABLET to recover rows and columns and output usable formats like CSV/JSON/HTML. Handwriting often requires specialized ICR models trained on handwritten data. Multilingual documents add challenges like script detection and different reading directions.

Vision-Language Models (VLMs) extend LLMs by adding a vision encoder and projector so they can reason over images plus text, enabling interpretation of visual elements. However, VLMs can still struggle with small text, nested layouts, multi-page structure, hallucinations, and weak grounding unless they are guided by layout structure.

A practical hybrid approach combines layout detection and reading-order models for deterministic structure with VLMs for visually rich regions. An agent can orchestrate this workflow by using OCR plus bounding boxes, reordering text with LayoutReader, detecting regions (tables, charts, text blocks), and selectively sending cropped regions to specialized VLM-based tools for table and chart understanding based on the user's question.

=========================================================================================================================
An agentic document intelligence pipeline combines OCR, reading-order reconstruction, layout detection, and vision-language model analysis for visual regions.

Text extraction uses PaddleOCR to produce, for each detected text region, the recognized string, a confidence score, and polygon bounding boxes. Bounding boxes are visualized for verification and converted into a standardized XYXY format using structured data objects for cleaner downstream processing.

Reading order is reconstructed with a LayoutReader model built on LayoutLMv3. OCR bounding boxes are normalized to the 0–1000 coordinate range expected by LayoutLM-style models, then the model predicts an ordering index for each region. Regions are sorted by this index to create a correctly sequenced text representation that can answer many questions without visual models.
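The normalization step itself is simple arithmetic; a short sketch (the Region type and its field names are illustrative):

from dataclasses import dataclass

@dataclass
class Region:
    text: str
    bbox: tuple  # (x1, y1, x2, y2) in pixel coordinates

def normalize_bbox(bbox, page_width, page_height):
    """Scale a pixel XYXY box into the 0-1000 grid LayoutLM-style models expect."""
    x1, y1, x2, y2 = bbox
    return (
        int(1000 * x1 / page_width),
        int(1000 * y1 / page_height),
        int(1000 * x2 / page_width),
        int(1000 * y2 / page_height),
    )

regions = [Region("Total: $42", (120, 900, 380, 940))]
print([normalize_bbox(r.bbox, page_width=800, page_height=1100) for r in regions])
# -> [(150, 818, 475, 854)]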
Layout detection uses PaddleOCR's layout detector to segment the page into labeled regions such as text blocks, titles, tables, charts, and figures. Each region is assigned a unique ID, stored in structured objects, and visualized with labeled boxes and confidence scores.

For tables and charts, regions are cropped from the original document and encoded in base64 to be sent to vision APIs. Cropping improves focus, reduces noise, and lowers cost, but localization can still be imperfect and requires careful prompt design.
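A sketch of the crop-and-encode step using Pillow (the file name and region coordinates are made up):

import base64
import io
from PIL import Image

def crop_region_to_base64(page_image_path: str, bbox: tuple) -> str:
    """Crop an XYXY region from the page image and return it base64-encoded,
    ready to embed in a vision-API request payload."""
    page = Image.open(page_image_path)
    crop = page.crop(bbox)  # (left, upper, right, lower) in pixels
    buf = io.BytesIO()
    crop.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")

# Send only the table region to the vision model instead of the full page
table_b64 = crop_region_to_base64("page_1.png", (50, 400, 760, 900))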
Two specialized tools are defined for vision-language model calls: one for chart interpretation and one for table extraction. Each tool uses a structured prompt with explicit fields and a JSON output template to produce machine-readable results. A shared multimodal-call utility packages the prompt plus the cropped image, and the tools are exposed to the agent via a tool interface.
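One way such a structured prompt and tool wrapper might look; the field list, wording, and canned return value are invented for illustration:

# Hypothetical JSON-output prompt for the table-extraction tool.
TABLE_PROMPT = """You are given a cropped image of a table.
Extract its contents and respond with ONLY this JSON structure:
{
  "columns": ["<header>", ...],
  "rows": [["<cell>", ...], ...],
  "notes": "<anything ambiguous or unreadable>"
}"""

def call_table_tool(image_b64: str) -> dict:
    """Stand-in for the shared multimodal-call utility: the real pipeline
    sends TABLE_PROMPT plus the cropped image to a vision model and parses
    the JSON reply. Here it returns a canned response for illustration."""
    return {"columns": ["Item", "Amount"], "rows": [["Fee", "42.00"]], "notes": ""}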
A tool-calling agent is created with a system context containing the ordered OCR text plus layout region IDs and types. For a given user question, the agent decides whether text alone is sufficient; if not, it selects the appropriate tool, analyzes the target region, and merges the tool output with the textual context into a final answer.

=========================================================================================================================
Agentic Document Extraction (ADE) is a unified, vision-first document intelligence system exposed through a single API that converts documents, images, presentations, and spreadsheets into structured Markdown and JSON.

The system is designed around three core principles. Vision-first processing treats documents as visual objects where meaning comes from layout, structure, and spatial relationships rather than raw text tokens. A data-centric approach emphasizes training on highly curated, document-specific datasets, prioritizing data quality alongside model design. An agentic architecture enables planning, routing, execution, and verification steps to iteratively reach high-quality outputs.

ADE replaces traditional pipelines built from OCR, layout analysis, and vision-language models with document-native vision transformers called DPTs (DPT-1, DPT-2, and DPT-2-mini). These models natively perform reading order reconstruction, layout detection, text recognition, and figure captioning within a single framework.

The core architecture consists of document-native vision models at the foundation, intelligent parsing and routing agents that handle different content types such as text, tables, and figures through separate paths, and an application layer that delivers user-facing capabilities like key-value (field) extraction, document splitting, and content preparation for retrieval-augmented generation.

Primary use cases include precise field extraction with traceability back to source regions, and preparation of complex documents for RAG systems that must preserve tables, figures, and structural context. ADE achieves state-of-the-art accuracy, exceeding human performance on the DocVQA benchmark, and demonstrates strong performance on real scanned and handwritten documents.

The platform is accessible through a visual interface, REST APIs, and Python or TypeScript libraries, enabling flexible integration into document processing workflows at scale.

=========================================================================================================================
Agentic Document Extraction (ADE) is used through an API-driven workflow to parse complex documents and extract structured, verifiable information using document-native vision models.

The process begins by sending documents to a parsing API powered by Document Pretrained Transformers (DPT-2-latest or DPT-1-latest). The parser converts each document into structured JSON and Markdown, identifying semantically meaningful chunks such as text blocks, tables, figures, charts, logos, margins, and attestations. Each chunk, and even individual table cells, receives a unique identifier and bounding box, enabling precise visual grounding and traceability.

Parsed outputs include:

* Structured chunks with type, coordinates, and page references
* Markdown representations of text, tables, and figures
* Cell-level identifiers for tables, enabling fine-grained referencing
* Rich visual descriptions for figures, charts, flowcharts, and illustrations

A schema-based extraction step is then applied. A user-defined JSON schema specifies the required fields, including nested objects, numeric values, strings, and booleans. The extraction API combines the parsed document representation with this schema to return structured key-value pairs along with metadata linking each extracted value back to its exact source region or table cell.
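For concreteness, a schema of that shape might look like the following, expressed here as a Python dict in JSON-schema style; the field names are invented, and the exact schema dialect the extraction API accepts should be checked against its documentation:

invoice_schema = {
    "type": "object",
    "properties": {
        "invoice_number": {"type": "string"},
        "total_amount": {"type": "number"},
        "is_paid": {"type": "boolean"},
        "vendor": {  # nested object
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "address": {"type": "string"},
            },
        },
    },
    "required": ["invoice_number", "total_amount"],
}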
The system demonstrates robust performance across highly challenging document types:

* Utility bills with mixed text, tables, and usage charts
* Charts and flowcharts with implicit spatial relationships and arrows
* Sparse tables, merged cells, and very large “mega tables” with thousands of values
* Handwritten forms, checkboxes, circled answers, and medical annotations
* Mathematical handwriting with symbols, equations, and square roots
* Purely visual documents such as instruction manuals and infographics
* Official documents containing stamps, curved text, and handwritten signatures

ADE handles all of these cases through a single, consistent API without requiring custom OCR pipelines, layout rules, or manual model orchestration. The output supports downstream applications such as user interfaces, compliance workflows, analytics, and reliable field extraction with full visual traceability, even under extreme document variability and complexity.

=========================================================================================================================
A multi-document financial intake pipeline is built around LandingAI ADE to handle mixed uploads with unknown filenames and unknown document types.

1. Batch parsing and page-level Markdown
Each uploaded file is sent to the Parse API using a DPT model. The response is requested as per-page Markdown so the first page can be used for fast identification while still keeping full parsed output available for extraction and grounding.

2. Automatic document type classification
The Extract API is used to categorize each file by running a lightweight schema over the first-page Markdown. A Pydantic schema defines an enum of expected document types (for example investment statement, pay stub, bank statement, government ID, tax form) with rich descriptions to improve classification reliability. The Pydantic model is converted to a JSON schema internally before extraction; a sketch of such a schema follows.
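A sketch of that classification schema (Pydantic v2; the type names and descriptions are illustrative, not taken from the actual pipeline):

from enum import Enum
from pydantic import BaseModel, Field

class DocType(str, Enum):
    investment_statement = "investment_statement"
    pay_stub = "pay_stub"
    bank_statement = "bank_statement"
    government_id = "government_id"
    tax_form = "tax_form"

class DocClassification(BaseModel):
    doc_type: DocType = Field(
        description="Best-matching category for the first page. A pay stub "
                    "lists employer, pay period, and net pay; a bank statement "
                    "lists an account number and transactions."
    )

# The extraction call would receive this model's JSON schema:
print(DocClassification.model_json_schema())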
3. Type-specific field extraction with dedicated schemas
For each identified document type, a separate Pydantic extraction schema is applied (ID fields, tax form fields, pay stub fields, bank statement fields, investment fields). The pipeline selects the schema dynamically based on the classified type, then calls Extract to return structured key-value pairs plus extraction metadata that links each value to chunk IDs for visual grounding.

4. Grounding-focused visualization for review
Parsed outputs are rendered with bounding boxes to show detected chunks (text, tables, figures) and cell-level structure for tables. A second visualization focuses only on the specific fields requested by each schema, highlighting exactly where each extracted value came from, enabling fast human review.

5. Consolidation into a structured summary table
All extracted fields across documents are aggregated into a single tabular summary (for example a Pandas DataFrame) with columns such as applicant folder, document name, detected type, field name, and field value. This replaces manual opening, searching, and retyping.

6. Validation and consistency checks
Custom validation logic is applied across the extracted results (see the sketch after this list), such as:

* Cross-document name matching to detect inconsistent applicants across uploaded files
* Recency checks by extracting years from dates and flagging outdated documents
* Asset aggregation by summing balances across bank and investment statements, scalable to many accounts
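Illustrative versions of those checks over the consolidated summary table; column names, sample values, and the cutoff year are assumptions for this sketch:

import pandas as pd

summary = pd.DataFrame([
    {"document": "paystub.pdf", "field": "name", "value": "Jane Doe"},
    {"document": "bank.pdf", "field": "name", "value": "Jane A. Doe"},
    {"document": "bank.pdf", "field": "statement_year", "value": "2021"},
    {"document": "bank.pdf", "field": "balance", "value": "1250.00"},
    {"document": "invest.pdf", "field": "balance", "value": "8400.00"},
])

# Cross-document name matching (normalize whitespace and case before comparing)
names = {" ".join(v.split()).lower()
         for v in summary.loc[summary["field"] == "name", "value"]}
if len(names) > 1:
    print("Inconsistent applicant names across documents:", names)

# Recency check: flag statements older than a cutoff year
years = summary.loc[summary["field"] == "statement_year", "value"].astype(int)
print("Outdated documents:", int((years < 2023).sum()))

# Asset aggregation: sum balances across all statements
total = summary.loc[summary["field"] == "balance", "value"].astype(float).sum()
print(f"Total assets: ${total:,.2f}")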
The result is an end-to-end workflow that parses heterogeneous documents, identifies their types, extracts structured fields with traceable grounding, produces a reviewer-friendly summary, and runs automated checks to surface inconsistencies and missing requirements.

=========================================================================================================================
src/agents/document_agent.py
ADDED
@@ -0,0 +1,661 @@
"""
DocumentAgent for SPARKNET

A ReAct-style agent for document intelligence tasks:
- Document parsing and extraction
- Field extraction with grounding
- Table and chart analysis
- Document classification
- Question answering over documents
"""

from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
import json
import time
from loguru import logger

from .base_agent import BaseAgent, Task, Message
from ..llm.langchain_ollama_client import LangChainOllamaClient
from ..document.schemas.core import (
    ProcessedDocument,
    DocumentChunk,
    EvidenceRef,
    ExtractionResult,
)
from ..document.schemas.extraction import ExtractionSchema, ExtractedField
from ..document.schemas.classification import DocumentClassification, DocumentType


class AgentAction(str, Enum):
    """Actions the DocumentAgent can take."""
    THINK = "think"
    USE_TOOL = "use_tool"
    ANSWER = "answer"
    ABSTAIN = "abstain"


@dataclass
class ThoughtAction:
    """A thought-action pair in the ReAct loop."""
    thought: str
    action: AgentAction
    tool_name: Optional[str] = None
    tool_args: Optional[Dict[str, Any]] = None
    observation: Optional[str] = None
    evidence: Optional[List[EvidenceRef]] = None


@dataclass
class AgentTrace:
    """Full trace of agent execution for inspection."""
    task: str
    steps: List[ThoughtAction]
    final_answer: Optional[Any] = None
    confidence: float = 0.0
    total_time_ms: float = 0.0
    success: bool = True
    error: Optional[str] = None


class DocumentAgent:
    """
    ReAct-style agent for document intelligence tasks.

    Implements the Think -> Tool -> Observe -> Refine loop
    with inspectable traces and grounded outputs.
    """

    # System prompt for ReAct reasoning
    SYSTEM_PROMPT = """You are a document intelligence agent that analyzes documents
and extracts information with evidence.

You operate in a Think-Act-Observe loop:
1. THINK: Analyze what you need to do and what information you have
2. ACT: Choose a tool to use or provide an answer
3. OBSERVE: Review the tool output and update your understanding

Available tools:
{tool_descriptions}

CRITICAL RULES:
- Every extraction MUST include evidence (page, bbox, text snippet)
- If you cannot find evidence for a value, ABSTAIN rather than guess
- Always cite the source of information with page numbers
- For tables, analyze structure before extracting data
- For charts, describe what you see before extracting values

Output format for each step:
THOUGHT: <your reasoning>
ACTION: <tool_name or ANSWER or ABSTAIN>
ACTION_INPUT: <JSON arguments for tool, or final answer>
"""

    # Available tools
    TOOLS = {
        "extract_text": {
            "description": "Extract text from specific pages or regions",
            "args": ["page_numbers", "region_bbox"],
        },
        "analyze_table": {
            "description": "Analyze and extract structured data from a table region",
            "args": ["page", "bbox", "expected_columns"],
        },
        "analyze_chart": {
            "description": "Analyze a chart/graph and extract insights",
            "args": ["page", "bbox"],
        },
        "extract_fields": {
            "description": "Extract specific fields using a schema",
            "args": ["schema", "context_chunks"],
        },
        "classify_document": {
            "description": "Classify the document type",
            "args": ["first_page_chunks"],
        },
        "search_text": {
            "description": "Search for text patterns in the document",
            "args": ["query", "page_range"],
        },
    }

    def __init__(
        self,
        llm_client: LangChainOllamaClient,
        memory_agent: Optional[Any] = None,
        max_iterations: int = 10,
        temperature: float = 0.3,
    ):
        """
        Initialize DocumentAgent.

        Args:
            llm_client: LangChain Ollama client
            memory_agent: Optional memory agent for context retrieval
            max_iterations: Maximum ReAct iterations
            temperature: LLM temperature for reasoning
        """
        self.llm_client = llm_client
        self.memory_agent = memory_agent
        self.max_iterations = max_iterations
        self.temperature = temperature

        # Current document context
        self._current_document: Optional[ProcessedDocument] = None
        self._page_images: Dict[int, Any] = {}

        logger.info(f"Initialized DocumentAgent (max_iterations={max_iterations})")

    def set_document(
        self,
        document: ProcessedDocument,
        page_images: Optional[Dict[int, Any]] = None,
    ):
        """
        Set the current document context.

        Args:
            document: Processed document
            page_images: Optional dict of page number -> image array
        """
        self._current_document = document
        self._page_images = page_images or {}
        logger.info(f"Set document context: {document.metadata.document_id}")

    async def run(
        self,
        task_description: str,
        extraction_schema: Optional[ExtractionSchema] = None,
    ) -> Tuple[Any, AgentTrace]:
        """
        Run the agent on a task.

        Args:
            task_description: Natural language task description
            extraction_schema: Optional schema for structured extraction

        Returns:
            Tuple of (result, trace)
        """
        start_time = time.time()

        if not self._current_document:
            raise ValueError("No document set. Call set_document() first.")

        trace = AgentTrace(task=task_description, steps=[])

        try:
            # Build initial context
            context = self._build_context(extraction_schema)

            # ReAct loop
            result = None
            for iteration in range(self.max_iterations):
                logger.debug(f"ReAct iteration {iteration + 1}")

                # Generate thought and action
                step = await self._generate_step(task_description, context, trace.steps)
                trace.steps.append(step)

                # Check for terminal actions
                if step.action == AgentAction.ANSWER:
                    result = self._parse_answer(step.tool_args)
                    trace.final_answer = result
                    trace.confidence = self._calculate_confidence(trace.steps)
                    break

                elif step.action == AgentAction.ABSTAIN:
                    trace.final_answer = {
                        "abstained": True,
                        "reason": step.thought,
                    }
                    trace.confidence = 0.0
                    break

                elif step.action == AgentAction.USE_TOOL:
                    # Execute tool and get observation
                    observation, evidence = await self._execute_tool(
                        step.tool_name, step.tool_args
                    )
                    step.observation = observation
                    step.evidence = evidence

                    # Update context with observation
                    context += f"\n\nObservation from {step.tool_name}:\n{observation}"

            trace.success = True

        except Exception as e:
            logger.error(f"Agent execution failed: {e}")
            trace.success = False
            trace.error = str(e)

        trace.total_time_ms = (time.time() - start_time) * 1000
        return trace.final_answer, trace

    async def extract_fields(
        self,
        schema: ExtractionSchema,
    ) -> ExtractionResult:
        """
        Extract fields from the document using a schema.

        Args:
            schema: Extraction schema defining fields

        Returns:
            ExtractionResult with extracted data and evidence
        """
        task = f"Extract the following fields from this document: {', '.join(f.name for f in schema.fields)}"
        result, trace = await self.run(task, schema)

        # Build extraction result
        data = {}
        evidence = []
        warnings = []
        abstained = []

        if isinstance(result, dict):
            data = result.get("data", result)

        # Collect evidence from trace
        for step in trace.steps:
            if step.evidence:
                evidence.extend(step.evidence)

        # Check for abstained fields
        for field in schema.fields:
            if field.name not in data and field.required:
                abstained.append(field.name)
                warnings.append(
                    f"Required field '{field.name}' not found with sufficient confidence"
                )

        return ExtractionResult(
            data=data,
            evidence=evidence,
            warnings=warnings,
            confidence=trace.confidence,
            abstained_fields=abstained,
        )

    async def classify(self) -> DocumentClassification:
        """
        Classify the document type.

        Returns:
            DocumentClassification with type and confidence
|
| 289 |
+
"""
|
| 290 |
+
task = "Classify this document into one of the standard document types (contract, invoice, patent, research_paper, report, letter, form, etc.)"
|
| 291 |
+
result, trace = await self.run(task)
|
| 292 |
+
|
| 293 |
+
# Parse classification result
|
| 294 |
+
doc_type = DocumentType.UNKNOWN
|
| 295 |
+
confidence = 0.0
|
| 296 |
+
|
| 297 |
+
if isinstance(result, dict):
|
| 298 |
+
type_str = result.get("document_type", "unknown")
|
| 299 |
+
try:
|
| 300 |
+
doc_type = DocumentType(type_str.lower())
|
| 301 |
+
except ValueError:
|
| 302 |
+
doc_type = DocumentType.OTHER
|
| 303 |
+
|
| 304 |
+
confidence = result.get("confidence", trace.confidence)
|
| 305 |
+
|
| 306 |
+
return DocumentClassification(
|
| 307 |
+
document_id=self._current_document.metadata.document_id,
|
| 308 |
+
primary_type=doc_type,
|
| 309 |
+
primary_confidence=confidence,
|
| 310 |
+
evidence=[e for step in trace.steps if step.evidence for e in step.evidence],
|
| 311 |
+
method="llm",
|
| 312 |
+
is_confident=confidence >= 0.7,
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
async def answer_question(self, question: str) -> Tuple[str, List[EvidenceRef]]:
|
| 316 |
+
"""
|
| 317 |
+
Answer a question about the document.
|
| 318 |
+
|
| 319 |
+
Args:
|
| 320 |
+
question: Natural language question
|
| 321 |
+
|
| 322 |
+
Returns:
|
| 323 |
+
Tuple of (answer, evidence)
|
| 324 |
+
"""
|
| 325 |
+
task = f"Answer this question about the document: {question}"
|
| 326 |
+
result, trace = await self.run(task)
|
| 327 |
+
|
| 328 |
+
answer = ""
|
| 329 |
+
evidence = []
|
| 330 |
+
|
| 331 |
+
if isinstance(result, dict):
|
| 332 |
+
answer = result.get("answer", str(result))
|
| 333 |
+
elif isinstance(result, str):
|
| 334 |
+
answer = result
|
| 335 |
+
|
| 336 |
+
# Collect evidence
|
| 337 |
+
for step in trace.steps:
|
| 338 |
+
if step.evidence:
|
| 339 |
+
evidence.extend(step.evidence)
|
| 340 |
+
|
| 341 |
+
return answer, evidence
|
| 342 |
+
|
| 343 |
+
def _build_context(self, schema: Optional[ExtractionSchema] = None) -> str:
|
| 344 |
+
"""Build initial context from document."""
|
| 345 |
+
doc = self._current_document
|
| 346 |
+
context_parts = [
|
| 347 |
+
f"Document: {doc.metadata.filename}",
|
| 348 |
+
f"Type: {doc.metadata.file_type}",
|
| 349 |
+
f"Pages: {doc.metadata.num_pages}",
|
| 350 |
+
f"Chunks: {len(doc.chunks)}",
|
| 351 |
+
"",
|
| 352 |
+
"Document content summary:",
|
| 353 |
+
]
|
| 354 |
+
|
| 355 |
+
# Add first few chunks as context
|
| 356 |
+
for chunk in doc.chunks[:10]:
|
| 357 |
+
context_parts.append(
|
| 358 |
+
f"[Page {chunk.page + 1}, {chunk.chunk_type.value}]: {chunk.text[:200]}..."
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
if schema:
|
| 362 |
+
context_parts.append("")
|
| 363 |
+
context_parts.append("Extraction schema:")
|
| 364 |
+
for field in schema.fields:
|
| 365 |
+
req = "required" if field.required else "optional"
|
| 366 |
+
context_parts.append(f"- {field.name} ({field.type.value}, {req}): {field.description}")
|
| 367 |
+
|
| 368 |
+
return "\n".join(context_parts)
|
| 369 |
+
|
| 370 |
+
async def _generate_step(
|
| 371 |
+
self,
|
| 372 |
+
task: str,
|
| 373 |
+
context: str,
|
| 374 |
+
previous_steps: List[ThoughtAction],
|
| 375 |
+
) -> ThoughtAction:
|
| 376 |
+
"""Generate the next thought-action step."""
|
| 377 |
+
# Build prompt
|
| 378 |
+
tool_descriptions = "\n".join(
|
| 379 |
+
f"- {name}: {info['description']}"
|
| 380 |
+
for name, info in self.TOOLS.items()
|
| 381 |
+
)
|
| 382 |
+
|
| 383 |
+
system_prompt = self.SYSTEM_PROMPT.format(tool_descriptions=tool_descriptions)
|
| 384 |
+
|
| 385 |
+
messages = [{"role": "system", "content": system_prompt}]
|
| 386 |
+
|
| 387 |
+
# Add task and context
|
| 388 |
+
user_content = f"TASK: {task}\n\nCONTEXT:\n{context}"
|
| 389 |
+
|
| 390 |
+
# Add previous steps
|
| 391 |
+
if previous_steps:
|
| 392 |
+
user_content += "\n\nPREVIOUS STEPS:"
|
| 393 |
+
for i, step in enumerate(previous_steps, 1):
|
| 394 |
+
user_content += f"\n\nStep {i}:"
|
| 395 |
+
user_content += f"\nTHOUGHT: {step.thought}"
|
| 396 |
+
user_content += f"\nACTION: {step.action.value}"
|
| 397 |
+
if step.tool_name:
|
| 398 |
+
user_content += f"\nTOOL: {step.tool_name}"
|
| 399 |
+
if step.observation:
|
| 400 |
+
user_content += f"\nOBSERVATION: {step.observation[:500]}..."
|
| 401 |
+
|
| 402 |
+
user_content += "\n\nNow generate your next step:"
|
| 403 |
+
messages.append({"role": "user", "content": user_content})
|
| 404 |
+
|
| 405 |
+
# Generate response
|
| 406 |
+
llm = self.llm_client.get_llm(complexity="complex", temperature=self.temperature)
|
| 407 |
+
|
| 408 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 409 |
+
lc_messages = [
|
| 410 |
+
SystemMessage(content=system_prompt),
|
| 411 |
+
HumanMessage(content=user_content),
|
| 412 |
+
]
|
| 413 |
+
|
| 414 |
+
response = await llm.ainvoke(lc_messages)
|
| 415 |
+
response_text = response.content
|
| 416 |
+
|
| 417 |
+
# Parse response
|
| 418 |
+
return self._parse_step(response_text)
|
| 419 |
+
|
| 420 |
+
def _parse_step(self, response: str) -> ThoughtAction:
|
| 421 |
+
"""Parse LLM response into ThoughtAction."""
|
| 422 |
+
thought = ""
|
| 423 |
+
action = AgentAction.THINK
|
| 424 |
+
tool_name = None
|
| 425 |
+
tool_args = None
|
| 426 |
+
|
| 427 |
+
lines = response.strip().split("\n")
|
| 428 |
+
current_section = None
|
| 429 |
+
|
| 430 |
+
for line in lines:
|
| 431 |
+
line = line.strip()
|
| 432 |
+
|
| 433 |
+
if line.startswith("THOUGHT:"):
|
| 434 |
+
current_section = "thought"
|
| 435 |
+
thought = line[8:].strip()
|
| 436 |
+
elif line.startswith("ACTION:"):
|
| 437 |
+
current_section = "action"
|
| 438 |
+
action_str = line[7:].strip().lower()
|
| 439 |
+
if action_str == "answer":
|
| 440 |
+
action = AgentAction.ANSWER
|
| 441 |
+
elif action_str == "abstain":
|
| 442 |
+
action = AgentAction.ABSTAIN
|
| 443 |
+
elif action_str in self.TOOLS:
|
| 444 |
+
action = AgentAction.USE_TOOL
|
| 445 |
+
tool_name = action_str
|
| 446 |
+
else:
|
| 447 |
+
action = AgentAction.USE_TOOL
|
| 448 |
+
tool_name = action_str
|
| 449 |
+
elif line.startswith("ACTION_INPUT:"):
|
| 450 |
+
current_section = "input"
|
| 451 |
+
input_str = line[13:].strip()
|
| 452 |
+
try:
|
| 453 |
+
tool_args = json.loads(input_str)
|
| 454 |
+
except json.JSONDecodeError:
|
| 455 |
+
tool_args = {"raw": input_str}
|
| 456 |
+
elif current_section == "thought":
|
| 457 |
+
thought += " " + line
|
| 458 |
+
elif current_section == "input":
|
| 459 |
+
try:
|
| 460 |
+
tool_args = json.loads(line)
|
| 461 |
+
except:
|
| 462 |
+
pass
|
| 463 |
+
|
| 464 |
+
return ThoughtAction(
|
| 465 |
+
thought=thought,
|
| 466 |
+
action=action,
|
| 467 |
+
tool_name=tool_name,
|
| 468 |
+
tool_args=tool_args,
|
| 469 |
+
)
|
| 470 |
+
|
| 471 |
+
    async def _execute_tool(
        self,
        tool_name: str,
        tool_args: Optional[Dict[str, Any]],
    ) -> Tuple[str, List[EvidenceRef]]:
        """Execute a tool and return observation."""
        if not tool_args:
            tool_args = {}

        try:
            if tool_name == "extract_text":
                return self._tool_extract_text(tool_args)

            elif tool_name == "analyze_table":
                return await self._tool_analyze_table(tool_args)

            elif tool_name == "analyze_chart":
                return await self._tool_analyze_chart(tool_args)

            elif tool_name == "extract_fields":
                return await self._tool_extract_fields(tool_args)

            elif tool_name == "classify_document":
                return self._tool_classify_document(tool_args)

            elif tool_name == "search_text":
                return self._tool_search_text(tool_args)

            else:
                return f"Unknown tool: {tool_name}", []

        except Exception as e:
            logger.error(f"Tool {tool_name} failed: {e}")
            return f"Error executing {tool_name}: {e}", []

    def _tool_extract_text(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
        """Extract text from pages or regions."""
        doc = self._current_document
        page_numbers = args.get("page_numbers", list(range(doc.metadata.num_pages)))

        if isinstance(page_numbers, int):
            page_numbers = [page_numbers]

        texts = []
        evidence = []

        for page in page_numbers:
            page_chunks = doc.get_page_chunks(page)
            for chunk in page_chunks:
                texts.append(f"[Page {page + 1}]: {chunk.text}")
                evidence.append(EvidenceRef(
                    chunk_id=chunk.chunk_id,
                    page=chunk.page,
                    bbox=chunk.bbox,
                    source_type="text",
                    snippet=chunk.text[:100],
                    confidence=chunk.confidence,
                ))

        return "\n".join(texts[:20]), evidence[:10]

    async def _tool_analyze_table(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
        """Analyze a table region."""
        page = args.get("page", 0)
        doc = self._current_document

        # Find table chunks
        table_chunks = [c for c in doc.chunks if c.chunk_type.value == "table" and c.page == page]

        if not table_chunks:
            return "No table found on this page", []

        # Use LLM to analyze table
        table_text = table_chunks[0].text
        llm = self.llm_client.get_llm(complexity="standard")

        from langchain_core.messages import HumanMessage
        prompt = f"Analyze this table and extract structured data as JSON:\n\n{table_text}"
        response = await llm.ainvoke([HumanMessage(content=prompt)])

        evidence = [EvidenceRef(
            chunk_id=table_chunks[0].chunk_id,
            page=page,
            bbox=table_chunks[0].bbox,
            source_type="table",
            snippet=table_text[:200],
            confidence=table_chunks[0].confidence,
        )]

        return response.content, evidence

    async def _tool_analyze_chart(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
        """Analyze a chart region."""
        page = args.get("page", 0)
        doc = self._current_document

        # Find chart/figure chunks
        chart_chunks = [
            c for c in doc.chunks
            if c.chunk_type.value in ("chart", "figure") and c.page == page
        ]

        if not chart_chunks:
            return "No chart/figure found on this page", []

        # If we have the image, use vision model
        if page in self._page_images:
            # TODO: Use vision model for chart analysis
            pass

        return f"Chart found on page {page + 1}: {chart_chunks[0].caption or 'No caption'}", []

    async def _tool_extract_fields(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
        """Extract specific fields."""
        schema_dict = args.get("schema", {})
        doc = self._current_document

        # Build context from chunks
        context = "\n".join(c.text for c in doc.chunks[:20])

        # Use LLM to extract
        llm = self.llm_client.get_llm(complexity="complex")

        from langchain_core.messages import HumanMessage, SystemMessage
        system = "Extract the requested fields from the document. Output JSON with field names as keys."
        user = f"Fields to extract: {json.dumps(schema_dict)}\n\nDocument content:\n{context}"

        response = await llm.ainvoke([
            SystemMessage(content=system),
            HumanMessage(content=user),
        ])

        return response.content, []

    def _tool_classify_document(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
        """Classify document type based on first page."""
        doc = self._current_document
        first_page_chunks = doc.get_page_chunks(0)
        text = " ".join(c.text for c in first_page_chunks[:5])

        return f"First page content for classification:\n{text[:500]}", []

    def _tool_search_text(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
        """Search for text in document."""
        query = args.get("query", "").lower()
        doc = self._current_document

        matches = []
        evidence = []

        for chunk in doc.chunks:
            if query in chunk.text.lower():
                matches.append(f"[Page {chunk.page + 1}]: ...{chunk.text}...")
                evidence.append(EvidenceRef(
                    chunk_id=chunk.chunk_id,
                    page=chunk.page,
                    bbox=chunk.bbox,
                    source_type="text",
                    snippet=chunk.text[:100],
                    confidence=chunk.confidence,
                ))

        if not matches:
            return f"No matches found for '{query}'", []

        return f"Found {len(matches)} matches:\n" + "\n".join(matches[:10]), evidence[:10]

    def _parse_answer(self, answer_input: Optional[Dict[str, Any]]) -> Any:
        """Parse the final answer from tool args."""
        if not answer_input:
            return None

        if isinstance(answer_input, dict):
            return answer_input

        return {"answer": answer_input}

    def _calculate_confidence(self, steps: List[ThoughtAction]) -> float:
        """Calculate overall confidence from trace."""
        if not steps:
            return 0.0

        # Average evidence confidence
        all_evidence = [e for s in steps if s.evidence for e in s.evidence]
        if all_evidence:
            return sum(e.confidence for e in all_evidence) / len(all_evidence)

        return 0.5  # Default moderate confidence

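A minimal sketch of driving the agent above. The client construction is an assumption: LangChainOllamaClient comes from elsewhere in the framework and its constructor is not shown in this diff, so it is passed in here as an opaque object.

    import asyncio
    from src.agents.document_agent import DocumentAgent

    async def demo(client, processed_doc):
        # client: an already-constructed LangChainOllamaClient;
        # processed_doc: a ProcessedDocument from the document pipeline
        # (both are defined outside this file).
        agent = DocumentAgent(llm_client=client, max_iterations=5)
        agent.set_document(processed_doc)
        answer, evidence = await agent.answer_question("What is the total amount?")
        print(answer)
        for ref in evidence:
            print(f"  p.{ref.page + 1}: {ref.snippet}")

    # asyncio.run(demo(client, processed_doc))
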
src/cli/__init__.py
ADDED
@@ -0,0 +1,9 @@
"""
SPARKNET Command Line Interface

Provides CLI commands for document intelligence and RAG operations.
"""

from .main import app, main

__all__ = ["app", "main"]

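A minimal sketch of exercising the docint command group defined below without installing the `sparknet` console script, using click's built-in test runner (a standard click API; the document path is a placeholder):

    from click.testing import CliRunner
    from src.cli.docint import docint_cli

    runner = CliRunner()
    result = runner.invoke(docint_cli, ["parse", "invoice.pdf", "--format", "text"])
    print(result.output)
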
src/cli/docint.py
ADDED
@@ -0,0 +1,681 @@
"""
Document Intelligence CLI Commands

CLI interface for the document_intelligence subsystem.
"""

import json
import sys
from pathlib import Path
from typing import List, Optional

import click


@click.group(name="docint")
def docint_cli():
    """Document Intelligence commands."""
    pass


@docint_cli.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--output", "-o", type=click.Path(), help="Output JSON file")
@click.option("--max-pages", type=int, help="Maximum pages to process")
@click.option("--dpi", type=int, default=200, help="Render DPI (default: 200)")
@click.option("--format", "output_format", type=click.Choice(["json", "markdown", "text"]),
              default="json", help="Output format")
def parse(path: str, output: Optional[str], max_pages: Optional[int],
          dpi: int, output_format: str):
    """
    Parse a document into semantic chunks.

    Example:
        sparknet docint parse invoice.pdf -o result.json
        sparknet docint parse document.pdf --format markdown
    """
    from src.document_intelligence import (
        DocumentParser,
        ParserConfig,
    )

    config = ParserConfig(
        render_dpi=dpi,
        max_pages=max_pages,
    )

    parser = DocumentParser(config=config)

    click.echo(f"Parsing: {path}")

    try:
        result = parser.parse(path)

        if output_format == "json":
            output_data = {
                "doc_id": result.doc_id,
                "filename": result.filename,
                "num_pages": result.num_pages,
                "chunks": [
                    {
                        "chunk_id": c.chunk_id,
                        "type": c.chunk_type.value,
                        "text": c.text,
                        "page": c.page,
                        "bbox": c.bbox.xyxy,
                        "confidence": c.confidence,
                    }
                    for c in result.chunks
                ],
                "processing_time_ms": result.processing_time_ms,
            }

            if output:
                with open(output, "w") as f:
                    json.dump(output_data, f, indent=2)
                click.echo(f"Output written to: {output}")
            else:
                click.echo(json.dumps(output_data, indent=2))

        elif output_format == "markdown":
            if output:
                with open(output, "w") as f:
                    f.write(result.markdown_full)
                click.echo(f"Markdown written to: {output}")
            else:
                click.echo(result.markdown_full)

        else:  # text
            for chunk in result.chunks:
                click.echo(f"[Page {chunk.page}, {chunk.chunk_type.value}]")
                click.echo(chunk.text)
                click.echo()

        click.echo(f"\nParsed {len(result.chunks)} chunks in {result.processing_time_ms:.0f}ms")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--field", "-f", multiple=True, help="Field to extract (can specify multiple)")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema file")
@click.option("--preset", type=click.Choice(["invoice", "receipt", "contract"]),
              help="Use preset schema")
@click.option("--output", "-o", type=click.Path(), help="Output JSON file")
def extract(path: str, field: tuple, schema: Optional[str], preset: Optional[str],
            output: Optional[str]):
    """
    Extract fields from a document.

    Example:
        sparknet docint extract invoice.pdf --preset invoice
        sparknet docint extract doc.pdf -f vendor_name -f total_amount
        sparknet docint extract doc.pdf --schema my_schema.json
    """
    from src.document_intelligence import (
        DocumentParser,
        FieldExtractor,
        ExtractionSchema,
        FieldSpec,
        FieldType,
        create_invoice_schema,
        create_receipt_schema,
        create_contract_schema,
    )

    # Build schema
    if preset:
        if preset == "invoice":
            extraction_schema = create_invoice_schema()
        elif preset == "receipt":
            extraction_schema = create_receipt_schema()
        elif preset == "contract":
            extraction_schema = create_contract_schema()
    elif schema:
        with open(schema) as f:
            schema_dict = json.load(f)
        extraction_schema = ExtractionSchema.from_json_schema(schema_dict)
    elif field:
        extraction_schema = ExtractionSchema(name="custom")
        for f in field:
            extraction_schema.add_string_field(f, required=True)
    else:
        click.echo("Error: Specify --field, --schema, or --preset", err=True)
        sys.exit(1)

    click.echo(f"Extracting from: {path}")
    click.echo(f"Fields: {', '.join(f.name for f in extraction_schema.fields)}")

    try:
        # Parse document
        parser = DocumentParser()
        parse_result = parser.parse(path)

        # Extract fields
        extractor = FieldExtractor()
        result = extractor.extract(parse_result, extraction_schema)

        output_data = {
            "doc_id": parse_result.doc_id,
            "filename": parse_result.filename,
            "extracted_data": result.data,
            "confidence": result.overall_confidence,
            "abstained_fields": result.abstained_fields,
            "evidence": [
                {
                    "chunk_id": e.chunk_id,
                    "page": e.page,
                    "bbox": e.bbox.xyxy,
                    "snippet": e.snippet,
                }
                for e in result.evidence
            ],
        }

        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            click.echo(f"Output written to: {output}")
        else:
            click.echo("\nExtracted Data:")
            for key, value in result.data.items():
                status = "" if key not in result.abstained_fields else " [ABSTAINED]"
                click.echo(f"  {key}: {value}{status}")

            click.echo(f"\nConfidence: {result.overall_confidence:.2f}")

            if result.abstained_fields:
                click.echo(f"Abstained: {', '.join(result.abstained_fields)}")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command()
@click.argument("path", type=click.Path(exists=True))
@click.argument("question")
@click.option("--verbose", "-v", is_flag=True, help="Show evidence details")
@click.option("--use-rag", is_flag=True, help="Use RAG for retrieval (requires indexed document)")
@click.option("--document-id", "-d", help="Document ID for RAG retrieval")
@click.option("--top-k", "-k", type=int, default=5, help="Number of chunks to consider")
@click.option("--chunk-type", "-t", multiple=True, help="Filter by chunk type (can specify multiple)")
@click.option("--page-start", type=int, help="Filter by page range start")
@click.option("--page-end", type=int, help="Filter by page range end")
def ask(path: str, question: str, verbose: bool, use_rag: bool,
        document_id: Optional[str], top_k: int, chunk_type: tuple,
        page_start: Optional[int], page_end: Optional[int]):
    """
    Ask a question about a document.

    Example:
        sparknet docint ask invoice.pdf "What is the total amount?"
        sparknet docint ask doc.pdf "Find claims" --use-rag --top-k 10
        sparknet docint ask doc.pdf "What tables show?" -t table --use-rag
    """
    from src.document_intelligence import DocumentParser

    click.echo(f"Document: {path}")
    click.echo(f"Question: {question}")

    if use_rag:
        click.echo("Mode: RAG (semantic retrieval)")
    else:
        click.echo("Mode: Keyword search")

    click.echo()

    try:
        if use_rag:
            # Use RAG-based answering
            from src.document_intelligence.tools import get_rag_tool

            tool = get_rag_tool("rag_answer")

            # Build page range if specified
            page_range = None
            if page_start is not None and page_end is not None:
                page_range = (page_start, page_end)

            result = tool.execute(
                question=question,
                document_id=document_id,
                top_k=top_k,
                chunk_types=list(chunk_type) if chunk_type else None,
                page_range=page_range,
            )
        else:
            # Parse document and use keyword-based search
            from src.document_intelligence.tools import get_tool

            parser = DocumentParser()
            parse_result = parser.parse(path)

            tool = get_tool("answer_question")
            result = tool.execute(
                parse_result=parse_result,
                question=question,
                top_k=top_k,
            )

        if result.success:
            data = result.data
            click.echo(f"Answer: {data.get('answer', 'No answer found')}")
            click.echo(f"Confidence: {data.get('confidence', 0):.2f}")

            if data.get('abstained'):
                click.echo("Note: The system abstained due to low confidence.")

            if verbose and result.evidence:
                click.echo("\nEvidence:")
                for ev in result.evidence:
                    click.echo(f"  - Page {ev.get('page', '?')}: {ev.get('snippet', '')[:100]}...")

            if data.get('citations'):
                click.echo("\nCitations:")
                for cit in data['citations']:
                    click.echo(f"  [{cit['index']}] {cit.get('text', '')[:80]}...")
        else:
            click.echo(f"Error: {result.error}", err=True)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--output", "-o", type=click.Path(), help="Output JSON file")
def classify(path: str, output: Optional[str]):
    """
    Classify a document's type.

    Example:
        sparknet docint classify document.pdf
    """
    from src.document_intelligence import DocumentParser

    click.echo(f"Classifying: {path}")

    try:
        # Parse document
        parser = DocumentParser()
        parse_result = parser.parse(path)

        # Simple classification based on keywords
        first_page_chunks = [c for c in parse_result.chunks if c.page == 1][:5]
        content = " ".join(c.text[:200] for c in first_page_chunks).lower()

        doc_type = "other"
        confidence = 0.5

        type_keywords = {
            "invoice": ["invoice", "bill", "payment due", "amount due", "invoice number"],
            "contract": ["agreement", "contract", "party", "whereas", "terms and conditions"],
            "receipt": ["receipt", "paid", "transaction", "thank you for your purchase"],
            "form": ["form", "fill in", "checkbox", "signature line"],
            "letter": ["dear", "sincerely", "regards", "to whom it may concern"],
            "report": ["report", "findings", "conclusion", "summary", "analysis"],
            "patent": ["patent", "claims", "invention", "embodiment", "disclosed"],
        }

        for dtype, keywords in type_keywords.items():
            matches = sum(1 for k in keywords if k in content)
            if matches >= 2:
                doc_type = dtype
                confidence = min(0.95, 0.5 + matches * 0.15)
                break

        output_data = {
            "doc_id": parse_result.doc_id,
            "filename": parse_result.filename,
            "document_type": doc_type,
            "confidence": confidence,
        }

        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            click.echo(f"Output written to: {output}")
        else:
            click.echo(f"Type: {doc_type}")
            click.echo(f"Confidence: {confidence:.2f}")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--query", "-q", help="Search query")
@click.option("--type", "chunk_type", help="Filter by chunk type")
@click.option("--top", "-k", type=int, default=10, help="Number of results")
def search(path: str, query: Optional[str], chunk_type: Optional[str], top: int):
    """
    Search document content.

    Example:
        sparknet docint search document.pdf -q "payment terms"
        sparknet docint search document.pdf --type table
    """
    from src.document_intelligence import DocumentParser
    from src.document_intelligence.tools import get_tool

    click.echo(f"Searching: {path}")

    try:
        # Parse document
        parser = DocumentParser()
        parse_result = parser.parse(path)

        if query:
            # Search by query
            tool = get_tool("search_chunks")
            result = tool.execute(
                parse_result=parse_result,
                query=query,
                chunk_types=[chunk_type] if chunk_type else None,
                top_k=top,
            )

            if result.success:
                results = result.data.get("results", [])
                click.echo(f"Found {len(results)} results:\n")

                for i, r in enumerate(results, 1):
                    click.echo(f"{i}. [Page {r['page']}, {r['type']}] (score: {r['score']:.2f})")
                    click.echo(f"   {r['text'][:200]}...")
                    click.echo()
            else:
                click.echo(f"Error: {result.error}", err=True)

        elif chunk_type:
            # Filter by type
            matching = [c for c in parse_result.chunks if c.chunk_type.value == chunk_type]
            click.echo(f"Found {len(matching)} {chunk_type} chunks:\n")

            for i, chunk in enumerate(matching[:top], 1):
                click.echo(f"{i}. [Page {chunk.page}] {chunk.chunk_id}")
                click.echo(f"   {chunk.text[:200]}...")
                click.echo()

        else:
            # List all chunks
            click.echo(f"Total chunks: {len(parse_result.chunks)}\n")

            # Group by type
            by_type = {}
            for chunk in parse_result.chunks:
                t = chunk.chunk_type.value
                by_type[t] = by_type.get(t, 0) + 1

            click.echo("Chunk types:")
            for t, count in sorted(by_type.items()):
                click.echo(f"  {t}: {count}")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--page", "-p", type=int, default=1, help="Page number")
@click.option("--output-dir", "-d", type=click.Path(), default="./crops",
              help="Output directory for crops")
@click.option("--annotate", "-a", is_flag=True, help="Create annotated page image")
def visualize(path: str, page: int, output_dir: str, annotate: bool):
    """
    Visualize document regions.

    Example:
        sparknet docint visualize document.pdf --page 1 --annotate
    """
    from src.document_intelligence import (
        DocumentParser,
        load_document,
        RenderOptions,
    )
    from src.document_intelligence.grounding import create_annotated_image, CropManager
    from PIL import Image

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    click.echo(f"Processing: {path}, page {page}")

    try:
        # Parse document
        parser = DocumentParser()
        parse_result = parser.parse(path)

        # Load and render page
        loader, renderer = load_document(path)
        page_image = renderer.render_page(page, RenderOptions(dpi=200))
        loader.close()

        # Get page chunks
        page_chunks = [c for c in parse_result.chunks if c.page == page]

        if annotate:
            # Create annotated image
            bboxes = [c.bbox for c in page_chunks]
            labels = [f"{c.chunk_type.value[:10]}" for c in page_chunks]

            annotated = create_annotated_image(page_image, bboxes, labels)

            output_file = output_path / f"annotated_page_{page}.png"
            Image.fromarray(annotated).save(output_file)
            click.echo(f"Saved annotated image: {output_file}")

        else:
            # Save individual crops
            crop_manager = CropManager(output_path)

            for chunk in page_chunks:
                crop_path = crop_manager.save_crop(
                    page_image,
                    parse_result.doc_id,
                    page,
                    chunk.bbox,
                )
                click.echo(f"Saved crop: {crop_path}")

        click.echo(f"\nProcessed {len(page_chunks)} chunks from page {page}")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command()
@click.argument("paths", nargs=-1, type=click.Path(exists=True), required=True)
@click.option("--max-pages", type=int, help="Maximum pages to process per document")
@click.option("--batch-size", type=int, default=32, help="Embedding batch size")
@click.option("--min-length", type=int, default=10, help="Minimum chunk text length")
def index(paths: tuple, max_pages: Optional[int], batch_size: int, min_length: int):
    """
    Index documents into the vector store for RAG.

    Example:
        sparknet docint index document.pdf
        sparknet docint index *.pdf --max-pages 50
        sparknet docint index doc1.pdf doc2.pdf doc3.pdf
    """
    from src.document_intelligence.tools import get_rag_tool

    click.echo(f"Indexing {len(paths)} document(s)...")
    click.echo()

    try:
        tool = get_rag_tool("index_document")

        total_indexed = 0
        total_skipped = 0
        errors = []

        for path in paths:
            click.echo(f"Processing: {path}")

            result = tool.execute(
                path=path,
                max_pages=max_pages,
            )

            if result.success:
                data = result.data
                indexed = data.get("chunks_indexed", 0)
                skipped = data.get("chunks_skipped", 0)
                total_indexed += indexed
                total_skipped += skipped
                click.echo(f"  Indexed: {indexed} chunks, Skipped: {skipped}")
                click.echo(f"  Document ID: {data.get('document_id', 'unknown')}")
            else:
                errors.append((path, result.error))
                click.echo(f"  Error: {result.error}", err=True)

        click.echo()
        click.echo("=" * 40)
        click.echo(f"Total documents: {len(paths)}")
        click.echo(f"Total chunks indexed: {total_indexed}")
        click.echo(f"Total chunks skipped: {total_skipped}")

        if errors:
            click.echo(f"Errors: {len(errors)}")
            for path, err in errors:
                click.echo(f"  - {path}: {err}")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command(name="index-stats")
def index_stats():
    """
    Show statistics about the vector store index.

    Example:
        sparknet docint index-stats
    """
    from src.document_intelligence.tools import get_rag_tool

    try:
        tool = get_rag_tool("get_index_stats")
        result = tool.execute()

        if result.success:
            data = result.data
            click.echo("Vector Store Statistics:")
            click.echo(f"  Total chunks: {data.get('total_chunks', 0)}")
            click.echo(f"  Embedding model: {data.get('embedding_model', 'unknown')}")
            click.echo(f"  Embedding dimension: {data.get('embedding_dimension', 'unknown')}")
        else:
            click.echo(f"Error: {result.error}", err=True)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command(name="delete-index")
@click.argument("document_id")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompt")
def delete_index(document_id: str, yes: bool):
    """
    Delete a document from the vector store index.

    Example:
        sparknet docint delete-index doc_abc123
    """
    from src.document_intelligence.tools import get_rag_tool

    if not yes:
        click.confirm(f"Delete document '{document_id}' from index?", abort=True)

    try:
        tool = get_rag_tool("delete_document")
        result = tool.execute(document_id=document_id)

        if result.success:
            data = result.data
            click.echo(f"Deleted {data.get('chunks_deleted', 0)} chunks for document: {document_id}")
        else:
            click.echo(f"Error: {result.error}", err=True)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


@docint_cli.command(name="retrieve")
@click.argument("query")
@click.option("--top-k", "-k", type=int, default=5, help="Number of results")
@click.option("--document-id", "-d", help="Filter by document ID")
@click.option("--chunk-type", "-t", multiple=True, help="Filter by chunk type")
@click.option("--page-start", type=int, help="Filter by page range start")
@click.option("--page-end", type=int, help="Filter by page range end")
@click.option("--verbose", "-v", is_flag=True, help="Show full chunk text")
def retrieve(query: str, top_k: int, document_id: Optional[str],
             chunk_type: tuple, page_start: Optional[int],
             page_end: Optional[int], verbose: bool):
    """
    Retrieve relevant chunks from the vector store.

    Example:
        sparknet docint retrieve "payment terms"
        sparknet docint retrieve "claims" -d doc_abc123 -t paragraph -k 10
    """
    from src.document_intelligence.tools import get_rag_tool

    click.echo(f"Query: {query}")
    click.echo()

    try:
        tool = get_rag_tool("retrieve_chunks")

        page_range = None
        if page_start is not None and page_end is not None:
            page_range = (page_start, page_end)

        result = tool.execute(
            query=query,
            top_k=top_k,
            document_id=document_id,
            chunk_types=list(chunk_type) if chunk_type else None,
            page_range=page_range,
        )

        if result.success:
            data = result.data
            chunks = data.get("chunks", [])
            click.echo(f"Found {len(chunks)} results:\n")

            for i, chunk in enumerate(chunks, 1):
                click.echo(f"{i}. [sim={chunk['similarity']:.3f}] Page {chunk.get('page', '?')}, {chunk.get('chunk_type', 'text')}")
                click.echo(f"   Document: {chunk['document_id']}")

                text = chunk['text']
                if verbose:
                    click.echo(f"   Text: {text}")
                else:
                    click.echo(f"   Text: {text[:150]}...")
                click.echo()
        else:
            click.echo(f"Error: {result.error}", err=True)

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)


# Register with main CLI
def register_commands(cli):
    """Register docint commands with main CLI."""
    cli.add_command(docint_cli)

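A minimal sketch of a JSON schema file for `docint extract --schema`. The layout below is an assumption: ExtractionSchema.from_json_schema() is defined elsewhere in the repo and its accepted keys are not shown in this diff, so this mirrors a generic JSON-Schema-style shape rather than the confirmed format.

    import json

    # Hypothetical schema: two required string fields and one number field.
    schema = {
        "name": "purchase_order",
        "properties": {
            "po_number": {"type": "string", "description": "Purchase order number"},
            "vendor_name": {"type": "string", "description": "Vendor legal name"},
            "total_amount": {"type": "number", "description": "Grand total"},
        },
        "required": ["po_number", "vendor_name"],
    }

    with open("my_schema.json", "w") as f:
        json.dump(schema, f, indent=2)

    # Then: sparknet docint extract doc.pdf --schema my_schema.json
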
src/cli/document.py
ADDED
@@ -0,0 +1,322 @@
"""
Document Processing CLI Commands

Commands:
    sparknet document parse <file>    - Parse and extract text from document
    sparknet document extract <file>  - Extract structured fields
    sparknet document classify <file> - Classify document type
    sparknet document analyze <file>  - Full document analysis
"""

import typer
from typing import Optional, List
from pathlib import Path
import json
import sys

# Create document sub-app
document_app = typer.Typer(
    name="document",
    help="Document processing commands",
)


@document_app.command("parse")
def parse_document(
    file_path: Path = typer.Argument(..., help="Path to document file"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
    ocr_engine: str = typer.Option("paddleocr", "--ocr", help="OCR engine: paddleocr, tesseract"),
    dpi: int = typer.Option(300, "--dpi", help="Rendering DPI for PDFs"),
    max_pages: Optional[int] = typer.Option(None, "--max-pages", help="Maximum pages to process"),
    include_images: bool = typer.Option(False, "--images", help="Include cropped region images"),
):
    """
    Parse a document and extract text with layout information.

    Example:
        sparknet document parse invoice.pdf -o result.json
    """
    from loguru import logger

    if not file_path.exists():
        typer.echo(f"Error: File not found: {file_path}", err=True)
        raise typer.Exit(1)

    typer.echo(f"Parsing document: {file_path}")

    try:
        from ..document.pipeline import (
            PipelineConfig,
            get_document_processor,
        )
        from ..document.ocr import OCRConfig

        # Build config
        ocr_config = OCRConfig(engine=ocr_engine)
        config = PipelineConfig(
            ocr=ocr_config,
            render_dpi=dpi,
            max_pages=max_pages,
        )

        # Process document
        processor = get_document_processor(config)
        result = processor.process(str(file_path))

        # Format output
        output_data = {
            "document_id": result.metadata.document_id,
            "filename": result.metadata.filename,
            "num_pages": result.metadata.num_pages,
            "total_chunks": result.metadata.total_chunks,
            "total_characters": result.metadata.total_characters,
            "ocr_confidence": result.metadata.ocr_confidence_avg,
            "chunks": [
                {
                    "chunk_id": c.chunk_id,
                    "type": c.chunk_type.value,
                    "page": c.page,
                    "text": c.text[:500] + "..." if len(c.text) > 500 else c.text,
                    "confidence": c.confidence,
                    "bbox": {
                        "x_min": c.bbox.x_min,
                        "y_min": c.bbox.y_min,
                        "x_max": c.bbox.x_max,
                        "y_max": c.bbox.y_max,
                    },
                }
                for c in result.chunks
            ],
            "full_text": result.full_text[:2000] + "..." if len(result.full_text) > 2000 else result.full_text,
        }

        # Output
        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            typer.echo(f"Results written to: {output}")
        else:
            typer.echo(json.dumps(output_data, indent=2))

        typer.echo(f"\nProcessed {result.metadata.num_pages} pages, {len(result.chunks)} chunks")

    except ImportError as e:
        typer.echo(f"Error: Missing dependency - {e}", err=True)
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error processing document: {e}", err=True)
        raise typer.Exit(1)


@document_app.command("extract")
def extract_fields(
    file_path: Path = typer.Argument(..., help="Path to document file"),
    schema: Optional[Path] = typer.Option(None, "--schema", "-s", help="Extraction schema YAML file"),
    fields: Optional[List[str]] = typer.Option(None, "--field", "-f", help="Fields to extract (can use multiple)"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
    validate: bool = typer.Option(True, "--validate/--no-validate", help="Validate extraction"),
):
    """
    Extract structured fields from a document.

    Example:
        sparknet document extract invoice.pdf -f "invoice_number" -f "total_amount"
        sparknet document extract contract.pdf --schema contract_schema.yaml
    """
    from loguru import logger

    if not file_path.exists():
        typer.echo(f"Error: File not found: {file_path}", err=True)
        raise typer.Exit(1)

    if not schema and not fields:
        typer.echo("Error: Provide --schema or --field options", err=True)
        raise typer.Exit(1)

    typer.echo(f"Extracting fields from: {file_path}")

    try:
        from ..document.schemas.extraction import ExtractionSchema, FieldDefinition
        from ..agents.document_agent import DocumentAgent

        # Build extraction schema
        if schema:
            import yaml
            with open(schema) as f:
                schema_data = yaml.safe_load(f)
            extraction_schema = ExtractionSchema(**schema_data)
        else:
            # Build from field names
            field_defs = [
                FieldDefinition(
                    name=f,
                    field_type="string",
                    required=True,
                )
                for f in fields
            ]
            extraction_schema = ExtractionSchema(
                name="cli_extraction",
                fields=field_defs,
            )

        # Run extraction with agent
        import asyncio
        agent = DocumentAgent()
        asyncio.run(agent.load_document(str(file_path)))
        result = asyncio.run(agent.extract_fields(extraction_schema))

        # Format output
        output_data = {
            "document": str(file_path),
            "fields": result.fields,
            "confidence": result.confidence,
            "evidence": [
                {
                    "chunk_id": e.chunk_id,
                    "page": e.page,
                    "snippet": e.snippet,
                }
                for e in result.evidence
            ] if result.evidence else [],
        }

        # Validate if requested
        if validate and result.fields:
            from ..document.validation import get_extraction_critic
            critic = get_extraction_critic()

            evidence_chunks = [
                {"text": e.snippet, "page": e.page, "chunk_id": e.chunk_id}
                for e in result.evidence
            ] if result.evidence else []

            validation = critic.validate_extraction(result.fields, evidence_chunks)
            output_data["validation"] = {
                "status": validation.overall_status.value,
                "confidence": validation.overall_confidence,
                "should_accept": validation.should_accept,
                "abstain_reason": validation.abstain_reason,
            }

        # Output
        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            typer.echo(f"Results written to: {output}")
        else:
            typer.echo(json.dumps(output_data, indent=2))

    except ImportError as e:
        typer.echo(f"Error: Missing dependency - {e}", err=True)
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error extracting fields: {e}", err=True)
        raise typer.Exit(1)


@document_app.command("classify")
def classify_document(
    file_path: Path = typer.Argument(..., help="Path to document file"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
):
    """
    Classify document type.

    Example:
        sparknet document classify document.pdf
    """
    from loguru import logger

    if not file_path.exists():
        typer.echo(f"Error: File not found: {file_path}", err=True)
        raise typer.Exit(1)

    typer.echo(f"Classifying document: {file_path}")

    try:
        from ..agents.document_agent import DocumentAgent
        import asyncio

        agent = DocumentAgent()
        asyncio.run(agent.load_document(str(file_path)))
        classification = asyncio.run(agent.classify())

        output_data = {
            "document": str(file_path),
            "document_type": classification.document_type.value,
            "confidence": classification.confidence,
|
| 249 |
+
"reasoning": classification.reasoning,
|
| 250 |
+
"metadata": classification.metadata,
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
if output:
|
| 254 |
+
with open(output, "w") as f:
|
| 255 |
+
json.dump(output_data, f, indent=2)
|
| 256 |
+
typer.echo(f"Results written to: {output}")
|
| 257 |
+
else:
|
| 258 |
+
typer.echo(json.dumps(output_data, indent=2))
|
| 259 |
+
|
| 260 |
+
except Exception as e:
|
| 261 |
+
typer.echo(f"Error classifying document: {e}", err=True)
|
| 262 |
+
raise typer.Exit(1)
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
@document_app.command("ask")
|
| 266 |
+
def ask_document(
|
| 267 |
+
file_path: Path = typer.Argument(..., help="Path to document file"),
|
| 268 |
+
question: str = typer.Argument(..., help="Question to ask about the document"),
|
| 269 |
+
output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
|
| 270 |
+
):
|
| 271 |
+
"""
|
| 272 |
+
Ask a question about a document.
|
| 273 |
+
|
| 274 |
+
Example:
|
| 275 |
+
sparknet document ask invoice.pdf "What is the total amount?"
|
| 276 |
+
"""
|
| 277 |
+
from loguru import logger
|
| 278 |
+
|
| 279 |
+
if not file_path.exists():
|
| 280 |
+
typer.echo(f"Error: File not found: {file_path}", err=True)
|
| 281 |
+
raise typer.Exit(1)
|
| 282 |
+
|
| 283 |
+
typer.echo(f"Processing question for: {file_path}")
|
| 284 |
+
|
| 285 |
+
try:
|
| 286 |
+
from ..agents.document_agent import DocumentAgent
|
| 287 |
+
import asyncio
|
| 288 |
+
|
| 289 |
+
agent = DocumentAgent()
|
| 290 |
+
asyncio.run(agent.load_document(str(file_path)))
|
| 291 |
+
answer, evidence = asyncio.run(agent.answer_question(question))
|
| 292 |
+
|
| 293 |
+
output_data = {
|
| 294 |
+
"document": str(file_path),
|
| 295 |
+
"question": question,
|
| 296 |
+
"answer": answer,
|
| 297 |
+
"evidence": [
|
| 298 |
+
{
|
| 299 |
+
"chunk_id": e.chunk_id,
|
| 300 |
+
"page": e.page,
|
| 301 |
+
"snippet": e.snippet,
|
| 302 |
+
"confidence": e.confidence,
|
| 303 |
+
}
|
| 304 |
+
for e in evidence
|
| 305 |
+
] if evidence else [],
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
if output:
|
| 309 |
+
with open(output, "w") as f:
|
| 310 |
+
json.dump(output_data, f, indent=2)
|
| 311 |
+
typer.echo(f"Results written to: {output}")
|
| 312 |
+
else:
|
| 313 |
+
typer.echo(f"\nQuestion: {question}")
|
| 314 |
+
typer.echo(f"\nAnswer: {answer}")
|
| 315 |
+
if evidence:
|
| 316 |
+
typer.echo(f"\nEvidence ({len(evidence)} sources):")
|
| 317 |
+
for e in evidence[:3]:
|
| 318 |
+
typer.echo(f" - Page {e.page + 1}: {e.snippet[:100]}...")
|
| 319 |
+
|
| 320 |
+
except Exception as e:
|
| 321 |
+
typer.echo(f"Error processing question: {e}", err=True)
|
| 322 |
+
raise typer.Exit(1)
|
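
All four `document` sub-commands follow the same load-then-act pattern around `DocumentAgent`. A minimal programmatic sketch of the `ask` flow, assuming the repository's `src` package is importable from the project root (the absolute import path is an assumption; the CLI itself uses relative imports), and reusing a single event loop for both awaits:

# Hedged sketch, not part of the commit: drives DocumentAgent directly.
# Import path `src.agents.document_agent` is assumed from the repo layout.
import asyncio
from src.agents.document_agent import DocumentAgent

async def ask(path: str, question: str) -> str:
    agent = DocumentAgent()
    await agent.load_document(path)  # the CLI wraps each call in asyncio.run()
    answer, evidence = await agent.answer_question(question)
    for e in (evidence or [])[:3]:
        print(f"  p.{e.page + 1}: {e.snippet[:80]}")
    return answer

if __name__ == "__main__":
    print(asyncio.run(ask("invoice.pdf", "What is the total amount?")))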
src/cli/main.py
ADDED
@@ -0,0 +1,110 @@
"""
SPARKNET CLI Main Entry Point

Usage:
    sparknet document parse <file>
    sparknet document extract <file> --schema <schema.yaml>
    sparknet rag index <file>
    sparknet rag ask <question>
"""

import typer
from typing import Optional
from pathlib import Path
import json
import sys

from .document import document_app
from .rag import rag_app

# Create main app
app = typer.Typer(
    name="sparknet",
    help="SPARKNET Document Intelligence CLI",
    add_completion=False,
)

# Register sub-commands
app.add_typer(document_app, name="document", help="Document processing commands")
app.add_typer(rag_app, name="rag", help="RAG and retrieval commands")


@app.command()
def version():
    """Show SPARKNET version."""
    typer.echo("SPARKNET Document Intelligence v0.1.0")


@app.command()
def info():
    """Show system information and configuration."""
    from loguru import logger
    import platform

    typer.echo("SPARKNET Document Intelligence")
    typer.echo("=" * 40)
    typer.echo(f"Python: {platform.python_version()}")
    typer.echo(f"Platform: {platform.system()} {platform.release()}")
    typer.echo()

    # Check component availability
    typer.echo("Components:")

    # OCR
    try:
        from paddleocr import PaddleOCR
        typer.echo("  [✓] PaddleOCR")
    except ImportError:
        typer.echo("  [✗] PaddleOCR (install with: pip install paddleocr)")

    try:
        import pytesseract
        typer.echo("  [✓] Tesseract")
    except ImportError:
        typer.echo("  [✗] Tesseract (install with: pip install pytesseract)")

    # Vector Store
    try:
        import chromadb
        typer.echo("  [✓] ChromaDB")
    except ImportError:
        typer.echo("  [✗] ChromaDB (install with: pip install chromadb)")

    # Ollama
    try:
        import httpx
        with httpx.Client(timeout=2.0) as client:
            resp = client.get("http://localhost:11434/api/tags")
            if resp.status_code == 200:
                models = resp.json().get("models", [])
                typer.echo(f"  [✓] Ollama ({len(models)} models)")
            else:
                typer.echo("  [✗] Ollama (not responding)")
    except Exception:
        typer.echo("  [✗] Ollama (not running)")


@app.callback()
def main_callback(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress output"),
):
    """SPARKNET Document Intelligence CLI."""
    from loguru import logger
    import sys

    # Configure logging
    logger.remove()
    if verbose:
        logger.add(sys.stderr, level="DEBUG")
    elif not quiet:
        logger.add(sys.stderr, level="INFO")


def main():
    """Main entry point."""
    app()


if __name__ == "__main__":
    main()
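
Since `app` is a plain Typer application, the CLI can be exercised in-process with Typer's bundled test runner; a short sketch (the test body is illustrative and not part of the commit, and the import path is assumed from the repo layout):

# Hedged sketch: in-process CLI invocation via typer.testing.CliRunner.
from typer.testing import CliRunner

from src.cli.main import app  # import path assumed

runner = CliRunner()

def test_version():
    result = runner.invoke(app, ["version"])
    assert result.exit_code == 0
    assert "v0.1.0" in result.output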
src/cli/rag.py
ADDED
@@ -0,0 +1,314 @@
"""
RAG CLI Commands

Commands:
    sparknet rag index <file>    - Index document for retrieval
    sparknet rag search <query>  - Search indexed documents
    sparknet rag ask <question>  - Answer question using RAG
    sparknet rag status          - Show index status
"""

import typer
from typing import Optional, List
from pathlib import Path
import json
import sys

# Create RAG sub-app
rag_app = typer.Typer(
    name="rag",
    help="RAG and retrieval commands",
)


@rag_app.command("index")
def index_document(
    files: List[Path] = typer.Argument(..., help="Document file(s) to index"),
    collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
    embedding_model: str = typer.Option("nomic-embed-text", "--model", "-m", help="Embedding model"),
):
    """
    Index document(s) for RAG retrieval.

    Example:
        sparknet rag index document.pdf
        sparknet rag index *.pdf --collection contracts
    """
    from loguru import logger

    # Validate files
    valid_files = []
    for f in files:
        if f.exists():
            valid_files.append(f)
        else:
            typer.echo(f"Warning: File not found, skipping: {f}", err=True)

    if not valid_files:
        typer.echo("Error: No valid files to index", err=True)
        raise typer.Exit(1)

    typer.echo(f"Indexing {len(valid_files)} document(s)...")

    try:
        from ..rag import (
            VectorStoreConfig,
            EmbeddingConfig,
            get_document_indexer,
        )

        # Configure
        store_config = VectorStoreConfig(collection_name=collection)
        embed_config = EmbeddingConfig(ollama_model=embedding_model)

        # Get indexer
        indexer = get_document_indexer()

        # Index documents
        results = indexer.index_batch([str(f) for f in valid_files])

        # Summary
        successful = sum(1 for r in results if r.success)
        total_chunks = sum(r.num_chunks_indexed for r in results)

        typer.echo(f"\nIndexing complete:")
        typer.echo(f"  Documents: {successful}/{len(results)} successful")
        typer.echo(f"  Chunks indexed: {total_chunks}")

        for r in results:
            status = "✓" if r.success else "✗"
            typer.echo(f"  [{status}] {r.source_path}: {r.num_chunks_indexed} chunks")
            if r.error:
                typer.echo(f"      Error: {r.error}")

    except ImportError as e:
        typer.echo(f"Error: Missing dependency - {e}", err=True)
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error indexing documents: {e}", err=True)
        raise typer.Exit(1)


@rag_app.command("search")
def search_documents(
    query: str = typer.Argument(..., help="Search query"),
    top_k: int = typer.Option(5, "--top", "-k", help="Number of results"),
    collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
    document_id: Optional[str] = typer.Option(None, "--document", "-d", help="Filter by document ID"),
    chunk_type: Optional[str] = typer.Option(None, "--type", "-t", help="Filter by chunk type"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
):
    """
    Search indexed documents.

    Example:
        sparknet rag search "payment terms" --top 10
        sparknet rag search "table data" --type table
    """
    typer.echo(f"Searching: {query}")

    try:
        from ..rag import get_document_retriever, RetrieverConfig

        # Configure
        config = RetrieverConfig(default_top_k=top_k)
        retriever = get_document_retriever(config)

        # Build filters
        filters = {}
        if document_id:
            filters["document_id"] = document_id
        if chunk_type:
            filters["chunk_type"] = chunk_type

        # Search
        chunks = retriever.retrieve(query, top_k=top_k, filters=filters if filters else None)

        if not chunks:
            typer.echo("No results found.")
            return

        # Format output
        output_data = {
            "query": query,
            "num_results": len(chunks),
            "results": [
                {
                    "chunk_id": c.chunk_id,
                    "document_id": c.document_id,
                    "page": c.page,
                    "chunk_type": c.chunk_type,
                    "similarity": c.similarity,
                    "text": c.text[:500] + "..." if len(c.text) > 500 else c.text,
                }
                for c in chunks
            ],
        }

        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            typer.echo(f"Results written to: {output}")
        else:
            typer.echo(f"\nFound {len(chunks)} results:\n")
            for i, c in enumerate(chunks, 1):
                typer.echo(f"[{i}] Similarity: {c.similarity:.3f}")
                if c.page is not None:
                    typer.echo(f"    Page: {c.page + 1}, Type: {c.chunk_type or 'text'}")
                typer.echo(f"    {c.text[:200]}...")
                typer.echo()

    except Exception as e:
        typer.echo(f"Error searching: {e}", err=True)
        raise typer.Exit(1)


@rag_app.command("ask")
def ask_question(
    question: str = typer.Argument(..., help="Question to answer"),
    top_k: int = typer.Option(5, "--top", "-k", help="Number of context chunks"),
    collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
    document_id: Optional[str] = typer.Option(None, "--document", "-d", help="Filter by document ID"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
    show_evidence: bool = typer.Option(True, "--evidence/--no-evidence", help="Show evidence sources"),
):
    """
    Answer a question using RAG.

    Example:
        sparknet rag ask "What are the payment terms?"
        sparknet rag ask "What is the contract value?" --document contract123
    """
    typer.echo(f"Question: {question}")
    typer.echo("Processing...")

    try:
        from ..rag import get_grounded_generator, GeneratorConfig

        # Configure
        config = GeneratorConfig()
        generator = get_grounded_generator(config)

        # Build filters
        filters = {"document_id": document_id} if document_id else None

        # Generate answer
        result = generator.answer_question(question, top_k=top_k, filters=filters)

        # Format output
        output_data = {
            "question": question,
            "answer": result.answer,
            "confidence": result.confidence,
            "abstained": result.abstained,
            "abstain_reason": result.abstain_reason,
            "citations": [
                {
                    "index": c.index,
                    "page": c.page,
                    "snippet": c.text_snippet,
                    "confidence": c.confidence,
                }
                for c in result.citations
            ],
            "num_chunks_used": result.num_chunks_used,
        }

        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            typer.echo(f"Results written to: {output}")
        else:
            typer.echo(f"\nAnswer: {result.answer}")
            typer.echo(f"\nConfidence: {result.confidence:.2f}")

            if result.abstained:
                typer.echo(f"Note: {result.abstain_reason}")

            if show_evidence and result.citations:
                typer.echo(f"\nSources ({len(result.citations)}):")
                for c in result.citations:
                    page_info = f"Page {c.page + 1}" if c.page is not None else ""
                    typer.echo(f"  [{c.index}] {page_info}: {c.text_snippet[:80]}...")

    except Exception as e:
        typer.echo(f"Error generating answer: {e}", err=True)
        raise typer.Exit(1)


@rag_app.command("status")
def show_status(
    collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
):
    """
    Show RAG index status.

    Example:
        sparknet rag status
        sparknet rag status --collection contracts
    """
    typer.echo("RAG Index Status")
    typer.echo("=" * 40)

    try:
        from ..rag import get_vector_store, VectorStoreConfig

        config = VectorStoreConfig(collection_name=collection)
        store = get_vector_store(config)

        # Get stats
        total_chunks = store.count()

        typer.echo(f"Collection: {collection}")
        typer.echo(f"Total chunks: {total_chunks}")

        # List documents
        if hasattr(store, 'list_documents'):
            doc_ids = store.list_documents()
            typer.echo(f"Documents indexed: {len(doc_ids)}")

            if doc_ids:
                typer.echo("\nDocuments:")
                for doc_id in doc_ids[:10]:
                    chunk_count = store.count(doc_id)
                    typer.echo(f"  - {doc_id}: {chunk_count} chunks")

                if len(doc_ids) > 10:
                    typer.echo(f"  ... and {len(doc_ids) - 10} more")

    except Exception as e:
        typer.echo(f"Error getting status: {e}", err=True)
        raise typer.Exit(1)


@rag_app.command("delete")
def delete_document(
    document_id: str = typer.Argument(..., help="Document ID to delete"),
    collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
    force: bool = typer.Option(False, "--force", "-f", help="Skip confirmation"),
):
    """
    Delete a document from the index.

    Example:
        sparknet rag delete doc123
        sparknet rag delete doc123 --force
    """
    if not force:
        confirm = typer.confirm(f"Delete document '{document_id}' from index?")
        if not confirm:
            typer.echo("Cancelled.")
            return

    try:
        from ..rag import get_vector_store, VectorStoreConfig

        config = VectorStoreConfig(collection_name=collection)
        store = get_vector_store(config)

        deleted = store.delete_document(document_id)
        typer.echo(f"Deleted {deleted} chunks for document: {document_id}")

    except Exception as e:
        typer.echo(f"Error deleting document: {e}", err=True)
        raise typer.Exit(1)
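
`rag ask` composes retrieval and grounded generation behind `get_grounded_generator`; the same objects work from Python, e.g. for batching questions. A sketch under the same import-path assumption as above:

# Hedged sketch: batch Q&A with the factory the CLI uses; attribute names
# (answer, confidence, abstained, abstain_reason) mirror the CLI's output_data.
from src.rag import GeneratorConfig, get_grounded_generator  # path assumed

generator = get_grounded_generator(GeneratorConfig())

for q in ["What are the payment terms?", "Who are the contracting parties?"]:
    result = generator.answer_question(q, top_k=5, filters=None)
    if result.abstained:
        print(f"{q} -> abstained: {result.abstain_reason}")
    else:
        print(f"{q} -> {result.answer} (confidence {result.confidence:.2f})")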
src/document/__init__.py
ADDED
@@ -0,0 +1,75 @@
"""
SPARKNET Document Intelligence Subsystem

A comprehensive document processing pipeline for:
- OCR with PaddleOCR and Tesseract
- Layout detection and reading order reconstruction
- Semantic chunking with grounding evidence
- Document classification and field extraction
- Extraction validation with Critic/Verifier

Principles:
- Processing is not understanding: OCR alone is insufficient
- Every extraction includes evidence pointers (bbox, page, chunk_id)
- Modular, pluggable components with clean interfaces
- Abstain with evidence when confidence is low
"""

from .schemas.core import (
    BoundingBox,
    OCRRegion,
    LayoutRegion,
    LayoutType,
    DocumentChunk,
    ChunkType,
    EvidenceRef,
    ExtractionResult,
    DocumentMetadata,
    ProcessedDocument,
)

from .pipeline import (
    PipelineConfig,
    DocumentProcessor,
    get_document_processor,
    process_document,
)

from .validation import (
    CriticConfig,
    ValidationResult,
    ExtractionCritic,
    get_extraction_critic,
    VerifierConfig,
    VerificationResult,
    EvidenceVerifier,
    get_evidence_verifier,
)

__all__ = [
    # Core schemas
    "BoundingBox",
    "OCRRegion",
    "LayoutRegion",
    "LayoutType",
    "DocumentChunk",
    "ChunkType",
    "EvidenceRef",
    "ExtractionResult",
    "DocumentMetadata",
    "ProcessedDocument",
    # Pipeline
    "PipelineConfig",
    "DocumentProcessor",
    "get_document_processor",
    "process_document",
    # Validation
    "CriticConfig",
    "ValidationResult",
    "ExtractionCritic",
    "get_extraction_critic",
    "VerifierConfig",
    "VerificationResult",
    "EvidenceVerifier",
    "get_evidence_verifier",
]
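
The grounding principles in the docstring show up directly in the exports: chunks carry page and bounding-box provenance. A sketch of the one-call entry point, assuming `process_document` returns the `ProcessedDocument` exported next to it (import path assumed, as above):

# Hedged sketch of the exported convenience function.
from src.document import process_document

doc = process_document("contract.pdf")
for chunk in doc.chunks[:5]:
    # every chunk keeps its evidence pointers: page + bounding box
    print(chunk.page, chunk.chunk_type, chunk.bbox.x_min, chunk.text[:60])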
src/document/chunking/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""
Document Chunking Module

Creates semantic chunks from document content for retrieval and processing.
"""

from .chunker import (
    ChunkerConfig,
    DocumentChunker,
    SemanticChunker,
    get_document_chunker,
)

__all__ = [
    "ChunkerConfig",
    "DocumentChunker",
    "SemanticChunker",
    "get_document_chunker",
]
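
`get_document_chunker` is the factory used by the pipeline; since `ChunkerConfig` (defined in chunker.py below) is a Pydantic model, chunking behavior is tunable per call. A sketch, assuming the factory accepts a config argument:

# Hedged sketch: the factory signature (accepting a config) is an assumption.
from src.document.chunking import ChunkerConfig, get_document_chunker

config = ChunkerConfig(max_chunk_chars=600, overlap_chars=50,
                       preserve_table_structure=True)
chunker = get_document_chunker(config)
# chunks = chunker.create_chunks(ocr_regions, layout_regions, document_id="doc1")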
src/document/chunking/chunker.py
ADDED
@@ -0,0 +1,944 @@
"""
Document Chunker Implementation

Creates semantic chunks from document content with bounding box tracking.
Includes TableAwareChunker for preserving table structure in markdown format.
"""

import uuid
import time
import re
from typing import List, Optional, Dict, Any, Tuple
from dataclasses import dataclass
from pydantic import BaseModel, Field
from loguru import logger
from collections import defaultdict

from ..schemas.core import (
    BoundingBox,
    DocumentChunk,
    ChunkType,
    LayoutRegion,
    LayoutType,
    OCRRegion,
)


class ChunkerConfig(BaseModel):
    """Configuration for document chunking."""
    # Chunk size limits
    max_chunk_chars: int = Field(
        default=1000,
        ge=100,
        description="Maximum characters per chunk"
    )
    min_chunk_chars: int = Field(
        default=50,
        ge=10,
        description="Minimum characters per chunk"
    )
    overlap_chars: int = Field(
        default=100,
        ge=0,
        description="Character overlap between chunks"
    )

    # Chunking strategy
    strategy: str = Field(
        default="semantic",
        description="Chunking strategy: semantic, fixed, or layout"
    )
    respect_layout: bool = Field(
        default=True,
        description="Respect layout region boundaries"
    )
    merge_small_regions: bool = Field(
        default=True,
        description="Merge small adjacent regions"
    )

    # Special element handling
    chunk_tables: bool = Field(
        default=True,
        description="Create separate chunks for tables"
    )
    chunk_figures: bool = Field(
        default=True,
        description="Create separate chunks for figures"
    )
    include_captions: bool = Field(
        default=True,
        description="Include captions with figures/tables"
    )

    # Sentence handling
    split_on_sentences: bool = Field(
        default=True,
        description="Split on sentence boundaries when possible"
    )

    # Table-aware chunking (FG-002)
    preserve_table_structure: bool = Field(
        default=True,
        description="Preserve table structure as markdown with structured data"
    )
    table_row_threshold: float = Field(
        default=10.0,
        description="Y-coordinate threshold for grouping cells into rows"
    )
    table_col_threshold: float = Field(
        default=20.0,
        description="X-coordinate threshold for grouping cells into columns"
    )
    detect_table_headers: bool = Field(
        default=True,
        description="Attempt to detect and mark header rows"
    )


# Map layout types to chunk types
LAYOUT_TO_CHUNK_TYPE = {
    LayoutType.TEXT: ChunkType.TEXT,
    LayoutType.TITLE: ChunkType.TITLE,
    LayoutType.HEADING: ChunkType.HEADING,
    LayoutType.PARAGRAPH: ChunkType.PARAGRAPH,
    LayoutType.LIST: ChunkType.LIST_ITEM,
    LayoutType.TABLE: ChunkType.TABLE,
    LayoutType.FIGURE: ChunkType.FIGURE,
    LayoutType.CHART: ChunkType.CHART,
    LayoutType.FORMULA: ChunkType.FORMULA,
    LayoutType.CAPTION: ChunkType.CAPTION,
    LayoutType.FOOTNOTE: ChunkType.FOOTNOTE,
    LayoutType.HEADER: ChunkType.HEADER,
    LayoutType.FOOTER: ChunkType.FOOTER,
}


class DocumentChunker:
    """Base class for document chunkers."""

    def __init__(self, config: Optional[ChunkerConfig] = None):
        self.config = config or ChunkerConfig()

    def create_chunks(
        self,
        ocr_regions: List[OCRRegion],
        layout_regions: Optional[List[LayoutRegion]] = None,
        document_id: str = "",
        source_path: Optional[str] = None,
    ) -> List[DocumentChunk]:
        """
        Create chunks from OCR and layout regions.

        Args:
            ocr_regions: OCR text regions
            layout_regions: Optional layout regions
            document_id: Parent document ID
            source_path: Source file path

        Returns:
            List of DocumentChunk
        """
        raise NotImplementedError


class SemanticChunker(DocumentChunker):
    """
    Semantic chunker that respects document structure.

    Creates chunks based on:
    - Layout region boundaries
    - Semantic coherence (paragraphs, sections)
    - Size constraints with overlap
    """

    def create_chunks(
        self,
        ocr_regions: List[OCRRegion],
        layout_regions: Optional[List[LayoutRegion]] = None,
        document_id: str = "",
        source_path: Optional[str] = None,
    ) -> List[DocumentChunk]:
        """Create semantic chunks from document content."""
        if not ocr_regions:
            return []

        start_time = time.time()
        chunks = []
        chunk_index = 0

        if layout_regions and self.config.respect_layout:
            # Use layout regions to guide chunking
            chunks = self._chunk_by_layout(
                ocr_regions, layout_regions, document_id, source_path
            )
        else:
            # Fall back to text-based chunking
            chunks = self._chunk_by_text(
                ocr_regions, document_id, source_path
            )

        # Assign sequence indices
        for i, chunk in enumerate(chunks):
            chunk.sequence_index = i

        logger.debug(
            f"Created {len(chunks)} chunks in "
            f"{(time.time() - start_time) * 1000:.1f}ms"
        )

        return chunks

    def _chunk_by_layout(
        self,
        ocr_regions: List[OCRRegion],
        layout_regions: List[LayoutRegion],
        document_id: str,
        source_path: Optional[str],
    ) -> List[DocumentChunk]:
        """Create chunks based on layout regions."""
        chunks = []

        # Sort layout regions by reading order
        sorted_layouts = sorted(
            layout_regions,
            key=lambda r: (r.reading_order or 0, r.bbox.y_min, r.bbox.x_min)
        )

        for layout in sorted_layouts:
            # Get OCR regions within this layout region
            contained_ocr = self._get_contained_ocr(ocr_regions, layout)

            if not contained_ocr:
                continue

            # Determine chunk type
            chunk_type = LAYOUT_TO_CHUNK_TYPE.get(layout.type, ChunkType.TEXT)

            # Handle special types differently
            if layout.type == LayoutType.TABLE and self.config.chunk_tables:
                chunk = self._create_table_chunk(
                    contained_ocr, layout, document_id, source_path
                )
                chunks.append(chunk)

            elif layout.type in (LayoutType.FIGURE, LayoutType.CHART) and self.config.chunk_figures:
                chunk = self._create_figure_chunk(
                    contained_ocr, layout, document_id, source_path
                )
                chunks.append(chunk)

            else:
                # Regular text chunk - may need splitting
                text_chunks = self._create_text_chunks(
                    contained_ocr, layout, chunk_type, document_id, source_path
                )
                chunks.extend(text_chunks)

        return chunks

    def _chunk_by_text(
        self,
        ocr_regions: List[OCRRegion],
        document_id: str,
        source_path: Optional[str],
    ) -> List[DocumentChunk]:
        """Create chunks from text without layout guidance."""
        chunks = []

        # Sort by reading order (y then x)
        sorted_regions = sorted(
            ocr_regions,
            key=lambda r: (r.page, r.bbox.y_min, r.bbox.x_min)
        )

        # Group by page
        pages: Dict[int, List[OCRRegion]] = {}
        for r in sorted_regions:
            if r.page not in pages:
                pages[r.page] = []
            pages[r.page].append(r)

        # Process each page
        for page_num in sorted(pages.keys()):
            page_regions = pages[page_num]
            page_chunks = self._split_text_regions(
                page_regions, document_id, source_path, page_num
            )
            chunks.extend(page_chunks)

        return chunks

    def _get_contained_ocr(
        self,
        ocr_regions: List[OCRRegion],
        layout: LayoutRegion,
    ) -> List[OCRRegion]:
        """Get OCR regions contained within a layout region."""
        contained = []
        for ocr in ocr_regions:
            if ocr.page == layout.page:
                # Check if OCR region overlaps significantly with layout
                iou = layout.bbox.iou(ocr.bbox)
                if iou > 0.3 or layout.bbox.contains(ocr.bbox):
                    contained.append(ocr)
        return contained

    def _create_text_chunks(
        self,
        ocr_regions: List[OCRRegion],
        layout: LayoutRegion,
        chunk_type: ChunkType,
        document_id: str,
        source_path: Optional[str],
    ) -> List[DocumentChunk]:
        """Create text chunks from OCR regions, splitting if needed."""
        chunks = []

        # Combine text
        text = " ".join(r.text for r in ocr_regions)

        # Calculate average confidence
        avg_conf = sum(r.confidence for r in ocr_regions) / len(ocr_regions)

        # Check if splitting is needed
        if len(text) <= self.config.max_chunk_chars:
            # Single chunk
            chunk = DocumentChunk(
                chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
                chunk_type=chunk_type,
                text=text,
                bbox=layout.bbox,
                page=layout.page,
                document_id=document_id,
                source_path=source_path,
                sequence_index=0,
                confidence=avg_conf,
            )
            chunks.append(chunk)
        else:
            # Split into multiple chunks
            split_chunks = self._split_text(
                text, layout.bbox, layout.page, chunk_type,
                document_id, source_path, avg_conf
            )
            chunks.extend(split_chunks)

        return chunks

    def _split_text(
        self,
        text: str,
        bbox: BoundingBox,
        page: int,
        chunk_type: ChunkType,
        document_id: str,
        source_path: Optional[str],
        confidence: float,
    ) -> List[DocumentChunk]:
        """Split long text into multiple chunks with overlap."""
        chunks = []
        max_chars = self.config.max_chunk_chars
        overlap = self.config.overlap_chars

        # Split on sentences if enabled
        if self.config.split_on_sentences:
            sentences = self._split_sentences(text)
        else:
            sentences = [text]

        current_text = ""
        for sentence in sentences:
            if len(current_text) + len(sentence) > max_chars and current_text:
                # Create chunk
                chunk = DocumentChunk(
                    chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
                    chunk_type=chunk_type,
                    text=current_text.strip(),
                    bbox=bbox,
                    page=page,
                    document_id=document_id,
                    source_path=source_path,
                    sequence_index=len(chunks),
                    confidence=confidence,
                )
                chunks.append(chunk)

                # Start new chunk with overlap
                if overlap > 0:
                    overlap_text = current_text[-overlap:] if len(current_text) > overlap else current_text
                    current_text = overlap_text + " " + sentence
                else:
                    current_text = sentence
            else:
                current_text += " " + sentence if current_text else sentence

        # Don't forget the last chunk
        if current_text.strip():
            chunk = DocumentChunk(
                chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
                chunk_type=chunk_type,
                text=current_text.strip(),
                bbox=bbox,
                page=page,
                document_id=document_id,
                source_path=source_path,
                sequence_index=len(chunks),
                confidence=confidence,
            )
            chunks.append(chunk)

        return chunks

    def _split_sentences(self, text: str) -> List[str]:
        """Split text into sentences."""
        # Simple sentence splitting
        import re
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _create_table_chunk(
        self,
        ocr_regions: List[OCRRegion],
        layout: LayoutRegion,
        document_id: str,
        source_path: Optional[str],
    ) -> DocumentChunk:
        """
        Create a chunk for table content with structure preservation.

        Enhanced table handling (FG-002):
        - Reconstructs table structure from OCR regions
        - Generates markdown table representation
        - Stores structured data for SQL-like queries
        - Detects and marks header rows
        """
        if not ocr_regions:
            return DocumentChunk(
                chunk_id=f"{document_id}_table_{uuid.uuid4().hex[:8]}",
                chunk_type=ChunkType.TABLE,
                text="[Empty Table]",
                bbox=layout.bbox,
                page=layout.page,
                document_id=document_id,
                source_path=source_path,
                sequence_index=0,
                confidence=0.0,
                extra=layout.extra or {},
            )

        avg_conf = sum(r.confidence for r in ocr_regions) / len(ocr_regions)

        # Check if we should preserve table structure
        if not self.config.preserve_table_structure:
            # Fall back to simple pipe-separated format
            text = " | ".join(r.text for r in ocr_regions)
            return DocumentChunk(
                chunk_id=f"{document_id}_table_{uuid.uuid4().hex[:8]}",
                chunk_type=ChunkType.TABLE,
                text=text,
                bbox=layout.bbox,
                page=layout.page,
                document_id=document_id,
                source_path=source_path,
                sequence_index=0,
                confidence=avg_conf,
                extra=layout.extra or {},
            )

        # Reconstruct table structure from spatial positions
        table_data = self._reconstruct_table_structure(ocr_regions)

        # Generate markdown representation
        markdown_table = self._table_to_markdown(
            table_data["rows"],
            table_data["headers"],
            table_data["has_header"]
        )

        # Create rich metadata for structured queries
        table_extra = {
            **(layout.extra or {}),
            "table_structure": {
                "row_count": table_data["row_count"],
                "col_count": table_data["col_count"],
                "has_header": table_data["has_header"],
                "headers": table_data["headers"],
                "cells": table_data["cells"],  # 2D list of cell values
                "cell_positions": table_data["cell_positions"],  # For highlighting
            },
            "format": "markdown",
            "searchable_text": table_data["searchable_text"],
        }

        return DocumentChunk(
            chunk_id=f"{document_id}_table_{uuid.uuid4().hex[:8]}",
            chunk_type=ChunkType.TABLE,
            text=markdown_table,
            bbox=layout.bbox,
            page=layout.page,
            document_id=document_id,
            source_path=source_path,
            sequence_index=0,
            confidence=avg_conf,
            extra=table_extra,
        )

    def _reconstruct_table_structure(
        self,
        ocr_regions: List[OCRRegion],
    ) -> Dict[str, Any]:
        """
        Reconstruct table structure from OCR regions based on spatial positions.

        Groups OCR regions into rows and columns by analyzing their bounding boxes.
        Returns structured table data for markdown generation and queries.
        """
        if not ocr_regions:
            return {
                "rows": [],
                "headers": [],
                "has_header": False,
                "row_count": 0,
                "col_count": 0,
                "cells": [],
                "cell_positions": [],
                "searchable_text": "",
            }

        # Sort regions by vertical position (y_min) then horizontal (x_min)
        sorted_regions = sorted(
            ocr_regions,
            key=lambda r: (r.bbox.y_min, r.bbox.x_min)
        )

        # Group into rows based on y-coordinate proximity
        row_threshold = self.config.table_row_threshold
        rows: List[List[OCRRegion]] = []
        current_row: List[OCRRegion] = []
        current_y = None

        for region in sorted_regions:
            if current_y is None:
                current_y = region.bbox.y_min
                current_row.append(region)
            elif abs(region.bbox.y_min - current_y) <= row_threshold:
                current_row.append(region)
            else:
                if current_row:
                    # Sort row by x position
                    current_row.sort(key=lambda r: r.bbox.x_min)
                    rows.append(current_row)
                current_row = [region]
                current_y = region.bbox.y_min

        # Don't forget the last row
        if current_row:
            current_row.sort(key=lambda r: r.bbox.x_min)
            rows.append(current_row)

        # Determine column structure
        # Find consistent column boundaries across all rows
        col_positions = self._detect_column_positions(rows)
        num_cols = len(col_positions) if col_positions else max(len(row) for row in rows)

        # Build structured cell data
        cells: List[List[str]] = []
        cell_positions: List[List[Dict[str, Any]]] = []

        for row in rows:
            row_cells = self._assign_cells_to_columns(row, col_positions, num_cols)
            cells.append([cell["text"] for cell in row_cells])
            cell_positions.append([{
                "text": cell["text"],
                "bbox": cell["bbox"],
                "confidence": cell["confidence"]
            } for cell in row_cells])

        # Detect header row
        has_header = False
        headers: List[str] = []

        if self.config.detect_table_headers and len(cells) > 0:
            has_header, headers = self._detect_header_row(cells, rows)

        # Build searchable text (for vector embedding)
        searchable_parts = []
        for i, row in enumerate(cells):
            if has_header and i == 0:
                searchable_parts.append("Headers: " + ", ".join(row))
            else:
                if has_header and headers:
                    # Include header context for each value
                    for j, cell in enumerate(row):
                        if j < len(headers) and headers[j]:
                            searchable_parts.append(f"{headers[j]}: {cell}")
                        else:
                            searchable_parts.append(cell)
                else:
                    searchable_parts.extend(row)

        return {
            "rows": cells,
            "headers": headers,
            "has_header": has_header,
            "row_count": len(cells),
            "col_count": num_cols,
            "cells": cells,
            "cell_positions": cell_positions,
            "searchable_text": " | ".join(searchable_parts),
        }

    def _detect_column_positions(
        self,
        rows: List[List[OCRRegion]],
    ) -> List[Tuple[float, float]]:
        """
        Detect consistent column boundaries from table rows.

        Returns list of (x_start, x_end) tuples for each column.
        """
        if not rows:
            return []

        col_threshold = self.config.table_col_threshold

        # Collect all x positions
        all_x_starts = []
        for row in rows:
            for region in row:
                all_x_starts.append(region.bbox.x_min)

        if not all_x_starts:
            return []

        # Cluster x positions into columns
        all_x_starts.sort()
        columns = []
        current_col_start = all_x_starts[0]
        current_col_regions = [all_x_starts[0]]

        for x in all_x_starts[1:]:
            if x - current_col_regions[-1] <= col_threshold:
                current_col_regions.append(x)
            else:
                # Calculate column boundary
                col_center = sum(current_col_regions) / len(current_col_regions)
                columns.append(col_center)
                current_col_regions = [x]

        # Last column
        if current_col_regions:
            col_center = sum(current_col_regions) / len(current_col_regions)
            columns.append(col_center)

        # Convert to column ranges
        col_ranges = []
        for i, col_x in enumerate(columns):
            x_start = col_x - col_threshold
            if i < len(columns) - 1:
                x_end = (col_x + columns[i + 1]) / 2
            else:
                x_end = col_x + col_threshold * 3  # Extend last column
            col_ranges.append((x_start, x_end))

        return col_ranges

    def _assign_cells_to_columns(
        self,
        row_regions: List[OCRRegion],
        col_positions: List[Tuple[float, float]],
        num_cols: int,
    ) -> List[Dict[str, Any]]:
        """
        Assign OCR regions in a row to their respective columns.
        Handles merged cells and missing cells.
        """
        # Initialize empty cells for each column
        row_cells = [
            {"text": "", "bbox": None, "confidence": 0.0}
            for _ in range(num_cols)
        ]

        if not col_positions:
            # No column positions detected, just use order
            for i, region in enumerate(row_regions):
                if i < num_cols:
                    row_cells[i] = {
                        "text": region.text.strip(),
                        "bbox": region.bbox.to_xyxy(),
                        "confidence": region.confidence,
                    }
            return row_cells

        # Assign regions to columns based on x position
        for region in row_regions:
            region_x = region.bbox.x_min
            assigned = False

            for col_idx, (x_start, x_end) in enumerate(col_positions):
                if x_start <= region_x <= x_end:
                    # Append to existing cell (handle multi-line cells)
                    if row_cells[col_idx]["text"]:
                        row_cells[col_idx]["text"] += " " + region.text.strip()
                    else:
                        row_cells[col_idx]["text"] = region.text.strip()
                    row_cells[col_idx]["bbox"] = region.bbox.to_xyxy()
                    row_cells[col_idx]["confidence"] = max(
                        row_cells[col_idx]["confidence"],
                        region.confidence
                    )
                    assigned = True
                    break

            # If not assigned, put in nearest column
            if not assigned:
                min_dist = float("inf")
                nearest_col = 0
                for col_idx, (x_start, x_end) in enumerate(col_positions):
                    col_center = (x_start + x_end) / 2
                    dist = abs(region_x - col_center)
                    if dist < min_dist:
                        min_dist = dist
                        nearest_col = col_idx

                if row_cells[nearest_col]["text"]:
                    row_cells[nearest_col]["text"] += " " + region.text.strip()
                else:
                    row_cells[nearest_col]["text"] = region.text.strip()
                row_cells[nearest_col]["bbox"] = region.bbox.to_xyxy()
                row_cells[nearest_col]["confidence"] = region.confidence

        return row_cells

    def _detect_header_row(
        self,
        cells: List[List[str]],
        rows: List[List[OCRRegion]],
    ) -> Tuple[bool, List[str]]:
        """
        Detect if the first row is a header row.

        Heuristics used:
        - First row contains non-numeric text
        - First row text is shorter (labels vs data)
        - First row has distinct formatting (if available)
        """
        if not cells or len(cells) < 2:
            return False, []

        first_row = cells[0]
        other_rows = cells[1:]

        # Check if first row is mostly non-numeric
        first_row_numeric_count = sum(
            1 for cell in first_row
            if cell and self._is_numeric(cell)
        )
        first_row_text_ratio = (len(first_row) - first_row_numeric_count) / max(len(first_row), 1)

        # Check if other rows are more numeric
        other_numeric_ratios = []
        for row in other_rows:
            if row:
|
| 744 |
+
numeric_count = sum(1 for cell in row if cell and self._is_numeric(cell))
|
| 745 |
+
other_numeric_ratios.append(numeric_count / max(len(row), 1))
|
| 746 |
+
|
| 747 |
+
avg_other_numeric = sum(other_numeric_ratios) / max(len(other_numeric_ratios), 1)
|
| 748 |
+
|
| 749 |
+
# Header detection: first row is text-heavy, others are more numeric
|
| 750 |
+
is_header = (
|
| 751 |
+
first_row_text_ratio > 0.5 and
|
| 752 |
+
(avg_other_numeric > first_row_text_ratio * 0.5 or first_row_text_ratio > 0.8)
|
| 753 |
+
)
|
| 754 |
+
|
| 755 |
+
# Also consider: shorter cell lengths in first row (labels are usually shorter)
|
| 756 |
+
first_row_avg_len = sum(len(cell) for cell in first_row) / max(len(first_row), 1)
|
| 757 |
+
other_avg_lens = [
|
| 758 |
+
sum(len(cell) for cell in row) / max(len(row), 1)
|
| 759 |
+
for row in other_rows
|
| 760 |
+
]
|
| 761 |
+
avg_other_len = sum(other_avg_lens) / max(len(other_avg_lens), 1)
|
| 762 |
+
|
| 763 |
+
if first_row_avg_len < avg_other_len * 0.8:
|
| 764 |
+
is_header = True
|
| 765 |
+
|
| 766 |
+
return is_header, first_row if is_header else []
|
| 767 |
+
|
| 768 |
+
def _is_numeric(self, text: str) -> bool:
|
| 769 |
+
"""Check if text is primarily numeric (including currency, percentages)."""
|
| 770 |
+
cleaned = re.sub(r'[$€£¥%,.\s\-+()]', '', text)
|
| 771 |
+
return cleaned.isdigit() if cleaned else False
|
| 772 |
+
|
| 773 |
+
def _table_to_markdown(
|
| 774 |
+
self,
|
| 775 |
+
rows: List[List[str]],
|
| 776 |
+
headers: List[str],
|
| 777 |
+
has_header: bool,
|
| 778 |
+
) -> str:
|
| 779 |
+
"""
|
| 780 |
+
Convert table data to markdown format.
|
| 781 |
+
|
| 782 |
+
Creates a properly formatted markdown table with:
|
| 783 |
+
- Header row (if detected)
|
| 784 |
+
- Separator row
|
| 785 |
+
- Data rows
|
| 786 |
+
"""
|
| 787 |
+
if not rows:
|
| 788 |
+
return "[Empty Table]"
|
| 789 |
+
|
| 790 |
+
# Determine column count
|
| 791 |
+
num_cols = max(len(row) for row in rows) if rows else 0
|
| 792 |
+
if num_cols == 0:
|
| 793 |
+
return "[Empty Table]"
|
| 794 |
+
|
| 795 |
+
# Normalize all rows to same column count
|
| 796 |
+
normalized_rows = []
|
| 797 |
+
for row in rows:
|
| 798 |
+
normalized = row + [""] * (num_cols - len(row))
|
| 799 |
+
normalized_rows.append(normalized)
|
| 800 |
+
|
| 801 |
+
# Build markdown lines
|
| 802 |
+
md_lines = []
|
| 803 |
+
|
| 804 |
+
if has_header and headers:
|
| 805 |
+
# Use detected headers
|
| 806 |
+
header_line = "| " + " | ".join(headers + [""] * (num_cols - len(headers))) + " |"
|
| 807 |
+
separator = "| " + " | ".join(["---"] * num_cols) + " |"
|
| 808 |
+
md_lines.append(header_line)
|
| 809 |
+
md_lines.append(separator)
|
| 810 |
+
data_rows = normalized_rows[1:]
|
| 811 |
+
else:
|
| 812 |
+
# No header - create generic headers
|
| 813 |
+
generic_headers = [f"Col{i+1}" for i in range(num_cols)]
|
| 814 |
+
header_line = "| " + " | ".join(generic_headers) + " |"
|
| 815 |
+
separator = "| " + " | ".join(["---"] * num_cols) + " |"
|
| 816 |
+
md_lines.append(header_line)
|
| 817 |
+
md_lines.append(separator)
|
| 818 |
+
data_rows = normalized_rows
|
| 819 |
+
|
| 820 |
+
# Add data rows
|
| 821 |
+
for row in data_rows:
|
| 822 |
+
# Escape pipe characters in cell content
|
| 823 |
+
escaped_row = [cell.replace("|", "\\|") for cell in row]
|
| 824 |
+
row_line = "| " + " | ".join(escaped_row) + " |"
|
| 825 |
+
md_lines.append(row_line)
|
| 826 |
+
|
| 827 |
+
return "\n".join(md_lines)
|
| 828 |
+
|
| 829 |
+
def _create_figure_chunk(
|
| 830 |
+
self,
|
| 831 |
+
ocr_regions: List[OCRRegion],
|
| 832 |
+
layout: LayoutRegion,
|
| 833 |
+
document_id: str,
|
| 834 |
+
source_path: Optional[str],
|
| 835 |
+
) -> DocumentChunk:
|
| 836 |
+
"""Create a chunk for figure/chart content."""
|
| 837 |
+
# For figures, text is usually caption
|
| 838 |
+
text = " ".join(r.text for r in ocr_regions) if ocr_regions else "[Figure]"
|
| 839 |
+
avg_conf = sum(r.confidence for r in ocr_regions) / len(ocr_regions) if ocr_regions else 0.5
|
| 840 |
+
|
| 841 |
+
chunk_type = ChunkType.CHART if layout.type == LayoutType.CHART else ChunkType.FIGURE
|
| 842 |
+
|
| 843 |
+
return DocumentChunk(
|
| 844 |
+
chunk_id=f"{document_id}_{chunk_type.value}_{uuid.uuid4().hex[:8]}",
|
| 845 |
+
chunk_type=chunk_type,
|
| 846 |
+
text=text,
|
| 847 |
+
bbox=layout.bbox,
|
| 848 |
+
page=layout.page,
|
| 849 |
+
document_id=document_id,
|
| 850 |
+
source_path=source_path,
|
| 851 |
+
sequence_index=0,
|
| 852 |
+
confidence=avg_conf,
|
| 853 |
+
caption=text if ocr_regions else None,
|
| 854 |
+
)
|
| 855 |
+
|
| 856 |
+
def _split_text_regions(
|
| 857 |
+
self,
|
| 858 |
+
ocr_regions: List[OCRRegion],
|
| 859 |
+
document_id: str,
|
| 860 |
+
source_path: Optional[str],
|
| 861 |
+
page_num: int,
|
| 862 |
+
) -> List[DocumentChunk]:
|
| 863 |
+
"""Split OCR regions into chunks without layout guidance."""
|
| 864 |
+
if not ocr_regions:
|
| 865 |
+
return []
|
| 866 |
+
|
| 867 |
+
chunks = []
|
| 868 |
+
current_text = ""
|
| 869 |
+
current_regions = []
|
| 870 |
+
|
| 871 |
+
for region in ocr_regions:
|
| 872 |
+
if len(current_text) + len(region.text) > self.config.max_chunk_chars:
|
| 873 |
+
if current_regions:
|
| 874 |
+
# Create chunk from accumulated regions
|
| 875 |
+
chunk = self._create_chunk_from_regions(
|
| 876 |
+
current_regions, document_id, source_path, page_num, len(chunks)
|
| 877 |
+
)
|
| 878 |
+
chunks.append(chunk)
|
| 879 |
+
|
| 880 |
+
current_text = region.text
|
| 881 |
+
current_regions = [region]
|
| 882 |
+
else:
|
| 883 |
+
current_text += " " + region.text
|
| 884 |
+
current_regions.append(region)
|
| 885 |
+
|
| 886 |
+
# Final chunk
|
| 887 |
+
if current_regions:
|
| 888 |
+
chunk = self._create_chunk_from_regions(
|
| 889 |
+
current_regions, document_id, source_path, page_num, len(chunks)
|
| 890 |
+
)
|
| 891 |
+
chunks.append(chunk)
|
| 892 |
+
|
| 893 |
+
return chunks
|
| 894 |
+
|
| 895 |
+
def _create_chunk_from_regions(
|
| 896 |
+
self,
|
| 897 |
+
regions: List[OCRRegion],
|
| 898 |
+
document_id: str,
|
| 899 |
+
source_path: Optional[str],
|
| 900 |
+
page_num: int,
|
| 901 |
+
sequence_index: int,
|
| 902 |
+
) -> DocumentChunk:
|
| 903 |
+
"""Create a chunk from a list of OCR regions."""
|
| 904 |
+
text = " ".join(r.text for r in regions)
|
| 905 |
+
avg_conf = sum(r.confidence for r in regions) / len(regions)
|
| 906 |
+
|
| 907 |
+
# Compute bounding box
|
| 908 |
+
x_min = min(r.bbox.x_min for r in regions)
|
| 909 |
+
y_min = min(r.bbox.y_min for r in regions)
|
| 910 |
+
x_max = max(r.bbox.x_max for r in regions)
|
| 911 |
+
y_max = max(r.bbox.y_max for r in regions)
|
| 912 |
+
|
| 913 |
+
bbox = BoundingBox(
|
| 914 |
+
x_min=x_min, y_min=y_min,
|
| 915 |
+
x_max=x_max, y_max=y_max,
|
| 916 |
+
normalized=False,
|
| 917 |
+
)
|
| 918 |
+
|
| 919 |
+
return DocumentChunk(
|
| 920 |
+
chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
|
| 921 |
+
chunk_type=ChunkType.TEXT,
|
| 922 |
+
text=text,
|
| 923 |
+
bbox=bbox,
|
| 924 |
+
page=page_num,
|
| 925 |
+
document_id=document_id,
|
| 926 |
+
source_path=source_path,
|
| 927 |
+
sequence_index=sequence_index,
|
| 928 |
+
confidence=avg_conf,
|
| 929 |
+
)
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
# Factory
|
| 933 |
+
_document_chunker: Optional[DocumentChunker] = None
|
| 934 |
+
|
| 935 |
+
|
| 936 |
+
def get_document_chunker(
|
| 937 |
+
config: Optional[ChunkerConfig] = None,
|
| 938 |
+
) -> DocumentChunker:
|
| 939 |
+
"""Get or create singleton document chunker."""
|
| 940 |
+
global _document_chunker
|
| 941 |
+
if _document_chunker is None:
|
| 942 |
+
config = config or ChunkerConfig()
|
| 943 |
+
_document_chunker = SemanticChunker(config)
|
| 944 |
+
return _document_chunker
|
src/document/grounding/__init__.py ADDED
@@ -0,0 +1,21 @@
"""
Document Grounding Module

Provides evidence packaging and visual grounding for extracted information.
"""

from .evidence import (
    GroundingConfig,
    EvidenceBuilder,
    create_evidence_ref,
    crop_region_image,
    encode_image_base64,
)

__all__ = [
    "GroundingConfig",
    "EvidenceBuilder",
    "create_evidence_ref",
    "crop_region_image",
    "encode_image_base64",
]
src/document/grounding/evidence.py ADDED
@@ -0,0 +1,365 @@
"""
Evidence Builder for Document Grounding

Creates evidence references for extracted information.
Handles image cropping and base64 encoding.
"""

import base64
import io
from typing import List, Optional, Dict, Any, Tuple
from pydantic import BaseModel, Field
import numpy as np
from PIL import Image
from loguru import logger

from ..schemas.core import (
    BoundingBox,
    DocumentChunk,
    EvidenceRef,
    OCRRegion,
)


class GroundingConfig(BaseModel):
    """Configuration for grounding and evidence generation."""
    # Image cropping
    include_images: bool = Field(
        default=True,
        description="Include cropped images in evidence"
    )
    crop_padding: int = Field(
        default=10,
        ge=0,
        description="Padding around crop regions in pixels"
    )
    max_image_size: int = Field(
        default=512,
        ge=64,
        description="Maximum dimension for cropped images"
    )
    image_format: str = Field(
        default="PNG",
        description="Image format for encoding (PNG/JPEG)"
    )
    image_quality: int = Field(
        default=85,
        ge=1,
        le=100,
        description="JPEG quality if using JPEG format"
    )

    # Snippet settings
    max_snippet_length: int = Field(
        default=200,
        ge=50,
        description="Maximum length of text snippets"
    )
    include_context: bool = Field(
        default=True,
        description="Include surrounding context in snippets"
    )


def crop_region_image(
    image: np.ndarray,
    bbox: BoundingBox,
    padding: int = 10,
    max_size: Optional[int] = None,
) -> np.ndarray:
    """
    Crop a region from an image.

    Args:
        image: Source image (RGB, HWC format)
        bbox: Bounding box to crop
        padding: Padding around the crop
        max_size: Maximum dimension (will resize if larger)

    Returns:
        Cropped image as numpy array
    """
    height, width = image.shape[:2]

    # Get coordinates with padding
    x1 = max(0, int(bbox.x_min) - padding)
    y1 = max(0, int(bbox.y_min) - padding)
    x2 = min(width, int(bbox.x_max) + padding)
    y2 = min(height, int(bbox.y_max) + padding)

    # Crop
    cropped = image[y1:y2, x1:x2]

    # Resize if needed
    if max_size and max(cropped.shape[:2]) > max_size:
        pil_img = Image.fromarray(cropped)
        pil_img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
        cropped = np.array(pil_img)

    return cropped


def encode_image_base64(
    image: np.ndarray,
    format: str = "PNG",
    quality: int = 85,
) -> str:
    """
    Encode image to base64 string.

    Args:
        image: Image as numpy array
        format: Image format (PNG/JPEG)
        quality: JPEG quality if applicable

    Returns:
        Base64-encoded string
    """
    pil_img = Image.fromarray(image)

    # Convert to RGB if needed
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    # Encode
    buffer = io.BytesIO()
    if format.upper() == "JPEG":
        pil_img.save(buffer, format="JPEG", quality=quality)
    else:
        pil_img.save(buffer, format="PNG")

    buffer.seek(0)
    encoded = base64.b64encode(buffer.read()).decode("utf-8")

    return encoded


def create_evidence_ref(
    chunk: DocumentChunk,
    source_type: str = "text",
    snippet: Optional[str] = None,
    confidence: float = 1.0,
    image: Optional[np.ndarray] = None,
    config: Optional[GroundingConfig] = None,
) -> EvidenceRef:
    """
    Create an evidence reference from a document chunk.

    Args:
        chunk: Source chunk
        source_type: Type of source (text/table/figure)
        snippet: Optional specific snippet (defaults to chunk text)
        confidence: Confidence score
        image: Optional page image for cropping
        config: Grounding configuration

    Returns:
        EvidenceRef instance
    """
    config = config or GroundingConfig()

    # Create snippet
    if snippet is None:
        snippet = chunk.text[:config.max_snippet_length]
        if len(chunk.text) > config.max_snippet_length:
            snippet += "..."

    # Create base evidence
    evidence = EvidenceRef(
        chunk_id=chunk.chunk_id,
        page=chunk.page,
        bbox=chunk.bbox,
        source_type=source_type,
        snippet=snippet,
        confidence=confidence,
    )

    # Add image if available and configured
    if image is not None and config.include_images:
        try:
            cropped = crop_region_image(
                image,
                chunk.bbox,
                padding=config.crop_padding,
                max_size=config.max_image_size,
            )
            evidence.image_base64 = encode_image_base64(
                cropped,
                format=config.image_format,
                quality=config.image_quality,
            )
        except Exception as e:
            logger.warning(f"Failed to crop evidence image: {e}")

    return evidence


class EvidenceBuilder:
    """
    Builder for creating evidence references.

    Handles:
    - Evidence from chunks
    - Evidence from OCR regions
    - Evidence aggregation
    - Image cropping and encoding
    """

    def __init__(self, config: Optional[GroundingConfig] = None):
        """Initialize evidence builder."""
        self.config = config or GroundingConfig()

    def from_chunk(
        self,
        chunk: DocumentChunk,
        image: Optional[np.ndarray] = None,
        additional_context: Optional[str] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from a chunk.

        Args:
            chunk: Source chunk
            image: Optional page image for visual evidence
            additional_context: Optional additional context

        Returns:
            EvidenceRef
        """
        # Determine source type
        source_type = chunk.chunk_type.value

        # Build snippet with optional context
        snippet = chunk.text[:self.config.max_snippet_length]
        if additional_context:
            snippet = f"{additional_context}\n{snippet}"
        if len(chunk.text) > self.config.max_snippet_length:
            snippet += "..."

        return create_evidence_ref(
            chunk=chunk,
            source_type=source_type,
            snippet=snippet,
            confidence=chunk.confidence,
            image=image,
            config=self.config,
        )

    def from_ocr_region(
        self,
        region: OCRRegion,
        chunk_id: str,
        document_id: str,
        image: Optional[np.ndarray] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from an OCR region.

        Args:
            region: OCR region
            chunk_id: ID to assign
            document_id: Parent document ID
            image: Optional page image

        Returns:
            EvidenceRef
        """
        # Create a temporary chunk for the evidence
        from ..schemas.core import DocumentChunk, ChunkType

        chunk = DocumentChunk(
            chunk_id=chunk_id,
            chunk_type=ChunkType.TEXT,
            text=region.text,
            bbox=region.bbox,
            page=region.page,
            document_id=document_id,
            source_path=None,
            sequence_index=0,
            confidence=region.confidence,
        )

        return self.from_chunk(chunk, image)

    def aggregate_evidence(
        self,
        evidence_list: List[EvidenceRef],
        combine_snippets: bool = True,
    ) -> List[EvidenceRef]:
        """
        Aggregate and deduplicate evidence references.

        Args:
            evidence_list: List of evidence references
            combine_snippets: Whether to combine snippets from the same chunk

        Returns:
            Deduplicated evidence list
        """
        if not evidence_list:
            return []

        # Group by chunk_id
        by_chunk: Dict[str, List[EvidenceRef]] = {}
        for ev in evidence_list:
            if ev.chunk_id not in by_chunk:
                by_chunk[ev.chunk_id] = []
            by_chunk[ev.chunk_id].append(ev)

        # Combine or select best
        result = []
        for chunk_id, evidences in by_chunk.items():
            if len(evidences) == 1:
                result.append(evidences[0])
            else:
                # Take highest confidence, combine snippets
                best = max(evidences, key=lambda e: e.confidence)
                if combine_snippets:
                    all_snippets = list(set(e.snippet for e in evidences))
                    combined = " ... ".join(all_snippets[:3])
                    best = EvidenceRef(
                        chunk_id=best.chunk_id,
                        page=best.page,
                        bbox=best.bbox,
                        source_type=best.source_type,
                        snippet=combined[:self.config.max_snippet_length],
                        confidence=best.confidence,
                        image_base64=best.image_base64,
                    )
                result.append(best)

        # Sort by page and position
        result.sort(key=lambda e: (e.page, e.bbox.y_min, e.bbox.x_min))

        return result

    def create_grounding_context(
        self,
        evidence_list: List[EvidenceRef],
        include_images: bool = False,
    ) -> str:
        """
        Create a text context from evidence for LLM prompting.

        Args:
            evidence_list: Evidence references
            include_images: Whether to include image markers

        Returns:
            Formatted context string
        """
        if not evidence_list:
            return ""

        lines = ["Evidence from document:"]
        for i, ev in enumerate(evidence_list, 1):
            lines.append(
                f"\n[{i}] Page {ev.page + 1}, {ev.source_type} "
                f"(confidence: {ev.confidence:.2f}):"
            )
            lines.append(f' "{ev.snippet}"')

            if include_images and ev.image_base64:
                lines.append(" [Image available]")

        return "\n".join(lines)
src/document/io/__init__.py ADDED
@@ -0,0 +1,28 @@
"""
Document I/O Module

Handles loading, rendering, and caching of PDF and image documents.
"""

from .loader import (
    DocumentLoader,
    load_document,
    load_pdf,
    load_image,
    render_page,
)

from .cache import (
    DocumentCache,
    get_document_cache,
)

__all__ = [
    "DocumentLoader",
    "load_document",
    "load_pdf",
    "load_image",
    "render_page",
    "DocumentCache",
    "get_document_cache",
]
src/document/io/cache.py ADDED
@@ -0,0 +1,268 @@
"""
Document Cache

Caches rendered page images and document metadata for performance.
"""

import hashlib
import os
from pathlib import Path
from typing import Dict, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime, timedelta
from loguru import logger

import numpy as np
from PIL import Image

from cachetools import TTLCache, LRUCache


@dataclass
class CacheEntry:
    """A cached page image entry."""
    document_id: str
    page_number: int
    dpi: int
    image: np.ndarray
    created_at: datetime
    size_bytes: int


class DocumentCache:
    """
    In-memory cache for rendered document pages.
    Uses LRU eviction with optional disk persistence.
    """

    def __init__(
        self,
        max_pages: int = 100,
        max_memory_mb: int = 1024,
        ttl_seconds: int = 3600,
        disk_cache_dir: Optional[str] = None,
    ):
        """
        Initialize document cache.

        Args:
            max_pages: Maximum number of pages to cache in memory
            max_memory_mb: Maximum memory usage in MB
            ttl_seconds: Time-to-live for cache entries
            disk_cache_dir: Optional directory for disk caching
        """
        self.max_pages = max_pages
        self.max_memory_mb = max_memory_mb
        self.ttl_seconds = ttl_seconds
        self.disk_cache_dir = disk_cache_dir

        # In-memory cache
        self._cache: TTLCache = TTLCache(maxsize=max_pages, ttl=ttl_seconds)

        # Memory tracking
        self._memory_used_bytes = 0

        # Statistics
        self._hits = 0
        self._misses = 0

        # Initialize disk cache if enabled
        if disk_cache_dir:
            self._disk_cache_path = Path(disk_cache_dir)
            self._disk_cache_path.mkdir(parents=True, exist_ok=True)
        else:
            self._disk_cache_path = None

        logger.debug(f"Initialized DocumentCache (max_pages={max_pages}, max_memory={max_memory_mb}MB)")

    def _make_key(self, document_id: str, page_number: int, dpi: int) -> str:
        """Generate cache key."""
        return f"{document_id}:p{page_number}:d{dpi}"

    def get(
        self,
        document_id: str,
        page_number: int,
        dpi: int = 300,
    ) -> Optional[np.ndarray]:
        """
        Get a cached page image.

        Args:
            document_id: Document identifier
            page_number: Page number
            dpi: Rendering DPI

        Returns:
            Cached image array or None
        """
        key = self._make_key(document_id, page_number, dpi)

        # Check in-memory cache
        entry = self._cache.get(key)
        if entry is not None:
            self._hits += 1
            return entry.image

        # Check disk cache
        if self._disk_cache_path:
            disk_path = self._disk_cache_path / f"{key}.npy"
            if disk_path.exists():
                try:
                    image = np.load(disk_path)
                    # Promote to memory cache
                    self._put_memory(key, document_id, page_number, dpi, image)
                    self._hits += 1
                    return image
                except Exception as e:
                    logger.warning(f"Failed to load from disk cache: {e}")

        self._misses += 1
        return None

    def put(
        self,
        document_id: str,
        page_number: int,
        dpi: int,
        image: np.ndarray,
        persist_to_disk: bool = False,
    ):
        """
        Cache a page image.

        Args:
            document_id: Document identifier
            page_number: Page number
            dpi: Rendering DPI
            image: Page image as numpy array
            persist_to_disk: Whether to persist to disk
        """
        key = self._make_key(document_id, page_number, dpi)

        # Put in memory cache
        self._put_memory(key, document_id, page_number, dpi, image)

        # Optionally persist to disk
        if persist_to_disk and self._disk_cache_path:
            self._put_disk(key, image)

    def _put_memory(
        self,
        key: str,
        document_id: str,
        page_number: int,
        dpi: int,
        image: np.ndarray,
    ):
        """Put entry in memory cache."""
        size_bytes = image.nbytes

        # Check memory limit
        max_bytes = self.max_memory_mb * 1024 * 1024
        if self._memory_used_bytes + size_bytes > max_bytes:
            # Evict oldest entries until we have space
            self._evict_to_fit(size_bytes)

        entry = CacheEntry(
            document_id=document_id,
            page_number=page_number,
            dpi=dpi,
            image=image,
            created_at=datetime.utcnow(),
            size_bytes=size_bytes,
        )

        self._cache[key] = entry
        self._memory_used_bytes += size_bytes

    def _put_disk(self, key: str, image: np.ndarray):
        """Persist entry to disk cache."""
        if not self._disk_cache_path:
            return

        try:
            disk_path = self._disk_cache_path / f"{key}.npy"
            np.save(disk_path, image)
        except Exception as e:
            logger.warning(f"Failed to write to disk cache: {e}")

    def _evict_to_fit(self, needed_bytes: int):
        """Evict entries to fit new entry."""
        max_bytes = self.max_memory_mb * 1024 * 1024
        target = max_bytes - needed_bytes

        # Get entries sorted by creation time (oldest first)
        entries = list(self._cache.items())

        for key, entry in entries:
            if self._memory_used_bytes <= target:
                break
            self._memory_used_bytes -= entry.size_bytes
            del self._cache[key]

    def invalidate(self, document_id: str, page_number: Optional[int] = None):
        """
        Invalidate cache entries for a document.

        Args:
            document_id: Document to invalidate
            page_number: Optional specific page (None = all pages)
        """
        keys_to_remove = []

        for key in self._cache.keys():
            if key.startswith(f"{document_id}:"):
                if page_number is None or f":p{page_number}:" in key:
                    keys_to_remove.append(key)

        for key in keys_to_remove:
            entry = self._cache.pop(key, None)
            if entry:
                self._memory_used_bytes -= entry.size_bytes

        # Also remove from disk cache
        if self._disk_cache_path:
            for key in keys_to_remove:
                disk_path = self._disk_cache_path / f"{key}.npy"
                if disk_path.exists():
                    disk_path.unlink()

    def clear(self):
        """Clear all cache entries."""
        self._cache.clear()
        self._memory_used_bytes = 0

        # Clear disk cache
        if self._disk_cache_path:
            for f in self._disk_cache_path.glob("*.npy"):
                f.unlink()

        logger.info("Document cache cleared")

    @property
    def stats(self) -> Dict:
        """Get cache statistics."""
        total = self._hits + self._misses
        hit_rate = (self._hits / total * 100) if total > 0 else 0

        return {
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": f"{hit_rate:.1f}%",
            "entries": len(self._cache),
            "memory_used_mb": self._memory_used_bytes / (1024 * 1024),
            "max_memory_mb": self.max_memory_mb,
        }


# Global cache instance
_document_cache: Optional[DocumentCache] = None


def get_document_cache() -> DocumentCache:
    """Get or create the global document cache."""
    global _document_cache
    if _document_cache is None:
        _document_cache = DocumentCache()
    return _document_cache
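
A minimal usage sketch of the cache, assuming numpy and cachetools are installed and the repository root is on the import path; the document id and page array below are synthetic:

import numpy as np

from src.document.io.cache import DocumentCache

cache = DocumentCache(max_pages=10, max_memory_mb=64, ttl_seconds=60)

# Stand-in for a rendered page (a real 300-dpi A4 render would be roughly 25 MB)
page = np.zeros((330, 255, 3), dtype=np.uint8)
cache.put("doc-001", page_number=0, dpi=300, image=page)

assert cache.get("doc-001", page_number=0, dpi=300) is not None  # memory hit
assert cache.get("doc-001", page_number=1, dpi=300) is None      # miss
print(cache.stats)  # {'hits': 1, 'misses': 1, 'hit_rate': '50.0%', ...}

Note that `TTLCache` already evicts on `maxsize` and TTL expiry on its own; `_evict_to_fit` adds the byte-budget check on top, so the memory figure reported by `stats` is approximate whenever cachetools evicts entries internally.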