MHamdan committed on
Commit d520909 · 1 Parent(s): a9dc537

Initial commit: SPARKNET framework

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. .dockerignore +90 -0
  2. .streamlit/config.toml +14 -0
  3. CHANGELOG.md +232 -0
  4. Dockerfile +109 -0
  5. IMPLEMENTATION_REPORT.md +474 -0
  6. api/auth.py +320 -0
  7. api/routes/documents.py +553 -0
  8. api/routes/rag.py +415 -0
  9. api/schemas.py +302 -0
  10. config/document.yaml +147 -0
  11. config/rag.yaml +141 -0
  12. configs/rag.yaml +201 -0
  13. demo/README.md +185 -0
  14. demo/app.py +944 -0
  15. demo/llm_providers.py +339 -0
  16. demo/pages/1_🔬_Live_Processing.py +714 -0
  17. demo/pages/2_💬_Interactive_RAG.py +844 -0
  18. demo/pages/3_📊_Document_Comparison.py +528 -0
  19. demo/pages/4_🎯_Evidence_Viewer.py +529 -0
  20. demo/pages/5_📄_Document_Viewer.py +565 -0
  21. demo/rag_config.py +396 -0
  22. demo/requirements.txt +19 -0
  23. demo/state_manager.py +833 -0
  24. docker-compose.dev.yml +66 -0
  25. docker-compose.yml +163 -0
  26. docs/CLOUD_ARCHITECTURE.md +392 -0
  27. docs/DOCUMENT_INTELLIGENCE.md +470 -0
  28. docs/SPARKNET_Progress_Report.py +1432 -0
  29. examples/document_agent.py +240 -0
  30. examples/document_intelligence_demo.py +314 -0
  31. examples/document_processing.py +133 -0
  32. examples/document_rag_end_to_end.py +359 -0
  33. examples/rag_pipeline.py +192 -0
  34. nginx/nginx.conf +254 -0
  35. run_demo.py +110 -0
  36. run_demo.sh +52 -0
  37. scripts to get ideas from/ides.txt +151 -0
  38. src/agents/document_agent.py +661 -0
  39. src/cli/__init__.py +9 -0
  40. src/cli/docint.py +681 -0
  41. src/cli/document.py +322 -0
  42. src/cli/main.py +110 -0
  43. src/cli/rag.py +314 -0
  44. src/document/__init__.py +75 -0
  45. src/document/chunking/__init__.py +19 -0
  46. src/document/chunking/chunker.py +944 -0
  47. src/document/grounding/__init__.py +21 -0
  48. src/document/grounding/evidence.py +365 -0
  49. src/document/io/__init__.py +28 -0
  50. src/document/io/cache.py +268 -0
.dockerignore ADDED
@@ -0,0 +1,90 @@
+ # Git
+ .git
+ .gitignore
+ .gitattributes
+
+ # Python
+ __pycache__
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .mypy_cache/
+ .pytest_cache/
+ .coverage
+ htmlcov/
+
+ # Virtual environments
+ venv/
+ ENV/
+ env/
+ .venv/
+ sparknet/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Logs
+ *.log
+ logs/
+
+ # Local data (will be mounted as volumes)
+ data/vectorstore/
+ data/embedding_cache/
+ uploads/
+ outputs/
+
+ # Tests
+ tests/
+ .pytest_cache/
+
+ # Documentation
+ docs/
+ *.md
+ !README.md
+
+ # Notebooks
+ *.ipynb
+ .ipynb_checkpoints/
+
+ # Backup files
+ .backup/
+ *.bak
+
+ # Screenshots
+ screenshots/
+
+ # Development files
+ *.env.local
+ *.env.development
+ *.env.test
+
+ # Large files
+ *.pdf
+ *.pptx
+ *.docx
+ Dataset/
+ presentation/
.streamlit/config.toml ADDED
@@ -0,0 +1,14 @@
+ [server]
+ headless = true
+ port = 8501
+ enableCORS = false
+ maxUploadSize = 50
+
+ [theme]
+ primaryColor = "#4ECDC4"
+ backgroundColor = "#0e1117"
+ secondaryBackgroundColor = "#1a1a2e"
+ textColor = "#ffffff"
+
+ [browser]
+ gatherUsageStats = false
CHANGELOG.md ADDED
@@ -0,0 +1,232 @@
+ # SPARKNET Changelog
+
+ All notable changes to the SPARKNET project are documented in this file.
+
+ ## [1.2.0] - 2026-01-20
+
+ ### Added (Phase 1B Continuation)
+
+ #### Table Extraction Preservation (FG-002) - HIGH PRIORITY
+ - **Enhanced SemanticChunker** (`src/document/chunking/chunker.py`)
+   - Table structure reconstruction from OCR regions
+   - Markdown table generation with proper formatting
+   - Header row detection using heuristics
+   - Structured data storage in `extra.table_structure`
+   - Cell positions preserved for evidence highlighting
+   - Searchable text includes header context for better embedding
+   - Configurable row/column thresholds
+
+ - **ChunkerConfig enhancements** (illustrated by the sketch after this list)
+   - `preserve_table_structure` - Enable markdown conversion
+   - `table_row_threshold` - Y-coordinate grouping threshold
+   - `table_col_threshold` - X-coordinate clustering threshold
+   - `detect_table_headers` - Automatic header detection
+
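The chunker itself is more involved than this view can show, so as a rough illustration of the FG-002 approach described above, here is a minimal sketch of coordinate-threshold row grouping and markdown rendering. Everything in it (`Region`, `rows_from_regions`, `to_markdown`, the threshold default) is hypothetical, not SPARKNET's actual API:

```python
# Hypothetical sketch of FG-002-style table reconstruction; the real logic
# lives in src/document/chunking/chunker.py and is more involved.
from dataclasses import dataclass

@dataclass
class Region:
    text: str
    x: float  # left edge of the OCR region
    y: float  # top edge of the OCR region

def rows_from_regions(regions, row_threshold=10.0):
    """Group OCR regions into table rows by y-coordinate proximity."""
    rows = []
    for r in sorted(regions, key=lambda r: (r.y, r.x)):
        if rows and abs(r.y - rows[-1][-1].y) <= row_threshold:
            rows[-1].append(r)  # y within threshold: same row
        else:
            rows.append([r])    # otherwise start a new row
    return rows

def to_markdown(rows, has_header=True):
    """Render grouped rows as a markdown table."""
    lines = [" | ".join(cell.text for cell in row) for row in rows]
    if has_header and len(lines) > 1:
        lines.insert(1, " | ".join(["---"] * len(rows[0])))
    return "\n".join(f"| {line} |" for line in lines)

regions = [Region("Item", 0, 0), Region("Qty", 50, 1),
           Region("Widget", 0, 20), Region("3", 50, 21)]
print(to_markdown(rows_from_regions(regions)))
# | Item | Qty |
# | --- | --- |
# | Widget | 3 |
```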
+ #### Nginx Configuration (TG-005)
+ - **Nginx Reverse Proxy** (`nginx/nginx.conf`)
+   - Production-ready reverse proxy configuration
+   - Rate limiting (30 req/s API, 5 req/s uploads)
+   - WebSocket support for Streamlit
+   - SSE support for RAG streaming
+   - Gzip compression
+   - Security headers (XSS, CSRF protection)
+   - SSL/TLS configuration (commented out, ready for production)
+   - Connection limits and timeout tuning
+
+ #### Integration Tests (TG-006)
+ - **API Integration Tests** (`tests/integration/test_api_v2.py`)
+   - TestClient-based testing without a running server
+   - Health/status endpoint tests
+   - Authentication flow tests
+   - Document upload/process/index workflow
+   - RAG query and search tests
+   - Error handling verification
+   - Concurrency tests
+   - Performance benchmarks (marked slow)
+
+ - **Table Chunker Unit Tests** (`tests/unit/test_table_chunker.py`)
+   - Table structure reconstruction tests
+   - Markdown generation tests
+   - Header detection tests
+   - Column detection tests
+   - Edge case handling
+
+ #### Cross-Module State Synchronization (Phase 1B)
+ - **Enhanced State Manager** (`demo/state_manager.py`)
+   - Event system with pub/sub pattern (sketched below)
+   - `EventType` enum for type-safe events
+   - Evidence highlighting synchronization
+   - Page/chunk selection sync across modules
+   - RAG query/response sharing
+   - Module-specific state storage
+   - Sync version tracking for change detection
+   - Helper components: `render_evidence_panel()`, `render_document_selector()`
+
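As a shape reference for the event system described above, a minimal pub/sub sketch follows; the `EventType` members and handler signature are illustrative guesses, not the actual `demo/state_manager.py` implementation:

```python
# Minimal pub/sub sketch; member names are assumptions, not SPARKNET's enum.
from collections import defaultdict
from enum import Enum, auto

class EventType(Enum):
    EVIDENCE_HIGHLIGHTED = auto()
    PAGE_SELECTED = auto()
    RAG_RESPONSE = auto()

class EventBus:
    """Dispatches typed events to registered handlers."""
    def __init__(self):
        self._subscribers = defaultdict(list)

    def subscribe(self, event_type: EventType, handler) -> None:
        self._subscribers[event_type].append(handler)

    def publish(self, event_type: EventType, payload: dict) -> None:
        for handler in self._subscribers[event_type]:
            handler(payload)

bus = EventBus()
bus.subscribe(EventType.PAGE_SELECTED, lambda p: print(f"sync page {p['page']}"))
bus.publish(EventType.PAGE_SELECTED, {"page": 3})  # prints: sync page 3
```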
+ ---
+
+ ## [1.1.0] - 2026-01-20
+
+ ### Added
+
+ #### REST API (Phase 1B - TG-003)
+ - **Document API** (`api/routes/documents.py`)
+   - `POST /api/documents/upload` - Upload and process documents
+   - `GET /api/documents` - List all documents with filtering
+   - `GET /api/documents/{doc_id}` - Get document by ID
+   - `GET /api/documents/{doc_id}/detail` - Get detailed document info
+   - `GET /api/documents/{doc_id}/chunks` - Get document chunks
+   - `POST /api/documents/{doc_id}/process` - Trigger processing
+   - `POST /api/documents/{doc_id}/index` - Index to RAG
+   - `POST /api/documents/batch-index` - Batch index multiple documents
+   - `DELETE /api/documents/{doc_id}` - Delete a document
+
+ - **RAG API** (`api/routes/rag.py`)
+   - `POST /api/rag/query` - Execute RAG query with 5-agent pipeline
+   - `POST /api/rag/query/stream` - Stream RAG response (SSE)
+   - `POST /api/rag/search` - Semantic search without synthesis
+   - `GET /api/rag/store/status` - Get vector store status
+   - `DELETE /api/rag/store/collection/{name}` - Clear collection
+   - `GET /api/rag/cache/stats` - Get cache statistics
+   - `DELETE /api/rag/cache` - Clear query cache
+
+ - **API Schemas** (`api/schemas.py`)
+   - Request/response models for all endpoints
+   - Document, Query, Search, Citation schemas
+   - Pydantic validation with comprehensive field definitions
+
+ #### Authentication (Phase 1C - TG-002)
+ - **JWT Authentication** (`api/auth.py`) - client flow shown below
+   - OAuth2 password bearer scheme
+   - `POST /api/auth/token` - Get access token
+   - `POST /api/auth/register` - Register new user
+   - `GET /api/auth/me` - Get current user info
+   - `GET /api/auth/users` - List users (admin only)
+   - `DELETE /api/auth/users/{username}` - Delete user (admin only)
+   - Password hashing with bcrypt
+   - Default admin user creation on startup
+
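A quick client-side walkthrough of this flow using `requests` (the default `admin` / `admin123` credentials come from the quick reference below and should be changed in production):

```python
# Obtain a JWT from the OAuth2 token endpoint, then call a protected route.
import requests

BASE = "http://localhost:8000"

# /api/auth/token takes OAuth2 password-grant form fields.
resp = requests.post(f"{BASE}/api/auth/token",
                     data={"username": "admin", "password": "admin123"})
resp.raise_for_status()
token = resp.json()["access_token"]

# The bearer token goes in the Authorization header.
me = requests.get(f"{BASE}/api/auth/me",
                  headers={"Authorization": f"Bearer {token}"})
print(me.json())  # current user info
```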
+ #### Extended Document Support (Phase 1B - FG-001)
+ - Added support for new document formats in document processing:
+   - **Word (.docx)** - Full text and table extraction
+   - **Excel (.xlsx, .xls)** - Multi-sheet extraction
+   - **PowerPoint (.pptx)** - Slide-by-slide text extraction
+   - **Text (.txt)** - Plain text processing
+   - **Markdown (.md)** - Markdown file support
+
+ #### Caching (Phase 1B - TG-004)
+ - **Cache Manager** (`src/utils/cache_manager.py`)
+   - Redis-based caching with in-memory fallback (pattern sketched below)
+   - `QueryCache` - Caches RAG query results (1-hour TTL)
+   - `EmbeddingCache` - Caches embeddings (24-hour TTL)
+   - `@cached` decorator for function-level caching
+   - Automatic cache cleanup and size limits
+
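The cache manager file itself falls outside this 50-file view, so the decorator below only sketches the pattern named above, an in-memory stand-in for the Redis-backed version; names and TTL handling are assumptions:

```python
# Illustrative function-level cache; not the src/utils/cache_manager.py code.
import functools
import hashlib
import json
import time

_store = {}  # in-memory fallback: key -> (expires_at, value)

def cached(ttl_seconds=3600):
    """Cache a function's return value under a hash of its arguments."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            raw = json.dumps([fn.__name__, args, kwargs], sort_keys=True, default=str)
            key = hashlib.sha256(raw.encode()).hexdigest()
            hit = _store.get(key)
            if hit and hit[0] > time.time():
                return hit[1]  # still fresh: serve cached value
            value = fn(*args, **kwargs)
            _store[key] = (time.time() + ttl_seconds, value)
            return value
        return wrapper
    return decorator

@cached(ttl_seconds=3600)  # mirrors the 1-hour QueryCache TTL noted above
def expensive_query(q: str) -> str:
    return q.upper()  # stand-in for a real RAG query

print(expensive_query("hello"))  # computed
print(expensive_query("hello"))  # served from cache
```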
+ #### Docker Containerization (Phase 1C - TG-007)
+ - **Dockerfile** - Multi-stage build
+   - Production stage with optimized image
+   - Development stage with hot reload
+   - Health checks and proper dependencies
+
+ - **docker-compose.yml** - Full-stack deployment
+   - SPARKNET API service
+   - Streamlit Demo service
+   - Ollama LLM service with GPU support
+   - ChromaDB vector store
+   - Redis cache
+   - Optional Nginx reverse proxy
+
+ - **docker-compose.dev.yml** - Development configuration
+   - Volume mounts for code changes
+   - Hot reload enabled
+   - Connects to host Ollama
+
+ - **.dockerignore** - Optimized build context
+
+ ### Changed
+
+ #### API Main (`api/main.py`)
+ - Enhanced lifespan initialization with graceful degradation
+ - Added RAG component initialization
+ - Improved health check with component status
+ - New `/api/status` endpoint for comprehensive system status (shape sketched below)
+ - Better error handling, allowing partial functionality
+
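The updated `api/main.py` is not among the 50 files shown, so the following is only a guess at the general shape of a component-status endpoint with graceful degradation; the component names and probe results are assumptions:

```python
# Hypothetical /api/status shape; the committed api/main.py version may differ.
from fastapi import FastAPI

app = FastAPI()

# Example probe results; the real app would check Ollama, ChromaDB, Redis, etc.
_components = {"ollama": True, "chromadb": True, "redis": False}

@app.get("/api/status")
async def system_status():
    # Report per-component health instead of failing the whole endpoint.
    healthy = all(_components.values())
    return {"status": "ok" if healthy else "degraded", "components": _components}
```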
+ ### Technical Details
+
+ #### New Files Created
+ ```
+ api/
+ ├── auth.py                  # Authentication module
+ ├── schemas.py               # Pydantic models
+ └── routes/
+     ├── documents.py         # Document endpoints
+     └── rag.py               # RAG endpoints
+
+ src/utils/
+ └── cache_manager.py         # Redis/memory caching
+
+ docker/
+ ├── Dockerfile               # Multi-stage build
+ ├── docker-compose.yml       # Production stack
+ ├── docker-compose.dev.yml   # Development stack
+ └── .dockerignore            # Build optimization
+ ```
+
+ #### Dependencies Added
+ - `python-jose[cryptography]` - JWT tokens
+ - `passlib[bcrypt]` - Password hashing
+ - `python-multipart` - Form data handling
+ - `redis` - Redis client (optional)
+ - `python-docx` - Word document support
+ - `openpyxl` - Excel support
+ - `python-pptx` - PowerPoint support
+
+ #### Configuration
+ - `SPARKNET_SECRET_KEY` - JWT secret (environment variable)
+ - `REDIS_URL` - Redis connection string
+ - `OLLAMA_HOST` - Ollama server URL
+ - `CHROMA_HOST` / `CHROMA_PORT` - ChromaDB connection
+
+ ### API Quick Reference
+
+ ```bash
+ # Health check
+ curl http://localhost:8000/api/health
+
+ # Upload document
+ curl -X POST -F "file=@document.pdf" http://localhost:8000/api/documents/upload
+
+ # Query RAG
+ curl -X POST http://localhost:8000/api/rag/query \
+   -H "Content-Type: application/json" \
+   -d '{"query": "What are the main findings?"}'
+
+ # Get token
+ curl -X POST http://localhost:8000/api/auth/token \
+   -d "username=admin&password=admin123"
+ ```
+
+ ### Docker Quick Start
+
+ ```bash
+ # Production deployment
+ docker-compose up -d
+
+ # Development with hot reload
+ docker-compose -f docker-compose.dev.yml up
+
+ # Pull Ollama models
+ docker exec sparknet-ollama ollama pull llama3.2:latest
+ docker exec sparknet-ollama ollama pull mxbai-embed-large:latest
+ ```
+
+ ---
+
+ ## [1.0.0] - 2026-01-19
+
+ ### Initial Release
+ - Multi-Agent RAG Pipeline (5 agents)
+ - Document Processing Pipeline (OCR, Layout, Chunking)
+ - Streamlit Demo Application (5 modules)
+ - ChromaDB Vector Store
+ - Ollama LLM Integration
Dockerfile ADDED
@@ -0,0 +1,109 @@
+ # SPARKNET Dockerfile
+ # Multi-stage build for optimized production image
+
+ # ============== Build Stage ==============
+ FROM python:3.11-slim as builder
+
+ WORKDIR /app
+
+ # Install build dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     gcc \
+     g++ \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for caching
+ COPY requirements.txt .
+ COPY api/requirements.txt ./api_requirements.txt
+
+ # Create virtual environment and install dependencies
+ RUN python -m venv /opt/venv
+ ENV PATH="/opt/venv/bin:$PATH"
+
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt && \
+     pip install --no-cache-dir -r api_requirements.txt
+
+ # ============== Production Stage ==============
+ FROM python:3.11-slim as production
+
+ LABEL maintainer="SPARKNET Team"
+ LABEL description="SPARKNET: Multi-Agentic Document Intelligence Platform"
+ LABEL version="1.0.0"
+
+ WORKDIR /app
+
+ # Install runtime dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     # PDF processing
+     poppler-utils \
+     libpoppler-cpp-dev \
+     # Image processing
+     libgl1-mesa-glx \
+     libglib2.0-0 \
+     libsm6 \
+     libxext6 \
+     libxrender-dev \
+     # OCR support
+     tesseract-ocr \
+     tesseract-ocr-eng \
+     # Utilities
+     curl \
+     wget \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy virtual environment from builder
+ COPY --from=builder /opt/venv /opt/venv
+ ENV PATH="/opt/venv/bin:$PATH"
+
+ # Set Python environment
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     PYTHONPATH=/app
+
+ # Copy application code
+ COPY src/ ./src/
+ COPY api/ ./api/
+ COPY config/ ./config/
+ COPY demo/ ./demo/
+
+ # Create necessary directories
+ RUN mkdir -p /app/data/vectorstore \
+     /app/data/embedding_cache \
+     /app/uploads/documents \
+     /app/uploads/patents \
+     /app/outputs \
+     /app/logs
+
+ # Set permissions
+ RUN chmod -R 755 /app
+
+ # Expose ports
+ # 8000 - FastAPI
+ # 4000 - Streamlit
+ EXPOSE 8000 4000
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD curl -f http://localhost:8000/api/health || exit 1
+
+ # Default command - run FastAPI
+ CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"]
+
+ # ============== Development Stage ==============
+ FROM production as development
+
+ # Install development dependencies
+ RUN pip install --no-cache-dir \
+     pytest \
+     pytest-asyncio \
+     pytest-cov \
+     black \
+     flake8 \
+     mypy \
+     ipython \
+     jupyter
+
+ # Development command with hot reload
+ CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
IMPLEMENTATION_REPORT.md ADDED
@@ -0,0 +1,474 @@
+ # SPARKNET Implementation Report
+ ## Agentic Document Intelligence Platform
+
+ **Report Date:** January 2025
+ **Version:** 0.1.0
+
+ ---
+
+ ## Executive Summary
+
+ SPARKNET is an enterprise-grade **Agentic Document Intelligence Platform** that follows FAANG best practices for:
+ - **Modular Architecture**: Clean separation of concerns with well-defined interfaces
+ - **Local-First Privacy**: All processing happens locally via Ollama
+ - **Evidence Grounding**: Every extraction includes verifiable source references
+ - **Production-Ready**: Type-safe, tested, configurable, and scalable
+
+ ---
+
+ ## 1. What Has Been Implemented
+
+ ### 1.1 Core Subsystems
+
+ | Subsystem | Location | Status | Description |
+ |-----------|----------|--------|-------------|
+ | **Document Intelligence** | `src/document_intelligence/` | Complete | Vision-first document understanding |
+ | **Legacy Document Pipeline** | `src/document/` | Complete | OCR, layout, chunking pipeline |
+ | **RAG Subsystem** | `src/rag/` | Complete | Vector search with grounded retrieval |
+ | **Multi-Agent System** | `src/agents/` | Complete | ReAct-style agents with tools |
+ | **LLM Integration** | `src/llm/` | Complete | Ollama client with routing |
+ | **CLI** | `src/cli/` | Complete | Full command-line interface |
+ | **API** | `api/` | Complete | FastAPI REST endpoints |
+ | **Demo UI** | `demo/` | Complete | Streamlit dashboard |
+
+ ### 1.2 Document Intelligence Module (`src/document_intelligence/`)
+
+ **Architecture (FAANG-inspired: Google DocAI pattern):**
+
+ ```
+ src/document_intelligence/
+ ├── chunks/                # Core data models (BoundingBox, DocumentChunk, TableChunk)
+ │   ├── models.py          # Pydantic models with full type safety
+ │   └── __init__.py
+ ├── io/                    # Document loading with caching
+ │   ├── base.py            # Abstract interfaces
+ │   ├── pdf.py             # PyMuPDF-based PDF loading
+ │   ├── image.py           # PIL image loading
+ │   └── cache.py           # LRU page caching
+ ├── models/                # ML model interfaces
+ │   ├── base.py            # BaseModel, BatchableModel
+ │   ├── ocr.py             # OCRModel interface
+ │   ├── layout.py          # LayoutModel interface
+ │   ├── table.py           # TableModel interface
+ │   └── vlm.py             # VisionLanguageModel interface
+ ├── parsing/               # Document parsing pipeline
+ │   ├── parser.py          # DocumentParser orchestrator
+ │   └── chunking.py        # SemanticChunker
+ ├── grounding/             # Visual evidence
+ │   ├── evidence.py        # EvidenceBuilder, EvidenceTracker
+ │   └── crops.py           # Image cropping utilities
+ ├── extraction/            # Field extraction
+ │   ├── schema.py          # ExtractionSchema, FieldSpec
+ │   ├── extractor.py       # FieldExtractor
+ │   └── validator.py       # ExtractionValidator
+ ├── tools/                 # Agent tools
+ │   ├── document_tools.py  # ParseDocumentTool, ExtractFieldsTool, etc.
+ │   └── rag_tools.py       # IndexDocumentTool, RetrieveChunksTool, RAGAnswerTool
+ └── agent_adapter.py       # EnhancedDocumentAgent integration
+ ```
+
+ **Key Features:**
+ - **Zero-Shot Capability**: Works across document formats without training
+ - **Schema-Driven Extraction**: Define fields using JSON Schema or Pydantic
+ - **Abstention Policy**: Never guesses; abstains when confidence is low
+ - **Visual Grounding**: Every extraction includes page, bbox, snippet, and confidence (see the evidence sketch below)
+
+ ### 1.3 RAG Subsystem (`src/rag/`)
77
+
78
+ **Architecture (FAANG-inspired: Meta FAISS + Google Vertex AI pattern):**
79
+
80
+ ```
81
+ src/rag/
82
+ ├── store.py # VectorStore interface + ChromaVectorStore
83
+ ├── embeddings.py # OllamaEmbedding + OpenAIEmbedding (feature-flagged)
84
+ ├── indexer.py # DocumentIndexer for chunked documents
85
+ ├── retriever.py # DocumentRetriever with evidence support
86
+ ├── generator.py # GroundedGenerator with citations
87
+ ├── docint_bridge.py # Bridge to document_intelligence subsystem
88
+ └── __init__.py # Clean exports
89
+ ```
90
+
91
+ **Key Features:**
92
+ - **Local-First Embeddings**: Ollama `nomic-embed-text` by default
93
+ - **Cloud Opt-In**: OpenAI embeddings disabled by default, feature-flagged
94
+ - **Metadata Filtering**: Filter by document_id, chunk_type, page_range
95
+ - **Citation Generation**: Answers include `[1]`, `[2]` references
96
+ - **Confidence-Based Abstention**: Returns "I don't know" when uncertain
97
+
98
+ ### 1.4 Multi-Agent System (`src/agents/`)
99
+
100
+ **Agents Implemented:**
101
+ | Agent | Purpose | Model |
102
+ |-------|---------|-------|
103
+ | `ExecutorAgent` | Task execution with tools | llama3.1:8b |
104
+ | `DocumentAgent` | ReAct-style document analysis | llama3.1:8b |
105
+ | `PlannerAgent` | Task decomposition | mistral |
106
+ | `CriticAgent` | Output validation | phi3 |
107
+ | `MemoryAgent` | Context management | llama3.2 |
108
+ | `VisionOCRAgent` | Vision-based OCR | llava (optional) |
109
+
110
+ ### 1.5 CLI Commands
111
+
112
+ ```bash
113
+ # Document Intelligence
114
+ sparknet docint parse document.pdf -o result.json
115
+ sparknet docint extract invoice.pdf --preset invoice
116
+ sparknet docint ask document.pdf "What is the total?"
117
+ sparknet docint classify document.pdf
118
+
119
+ # RAG Operations
120
+ sparknet docint index document.pdf # Index into vector store
121
+ sparknet docint index-stats # Show index statistics
122
+ sparknet docint retrieve "payment terms" -k 10 # Semantic search
123
+ sparknet docint ask doc.pdf "question" --use-rag # RAG-powered Q&A
124
+
125
+ # Legacy Document Commands
126
+ sparknet document parse invoice.pdf
127
+ sparknet document extract contract.pdf -f "party_name"
128
+ sparknet rag index *.pdf --collection my_docs
129
+ sparknet rag search "query" --top 10
130
+ ```
131
+
132
+ ---
133
+
134
+ ## 2. How to Execute SPARKNET
135
+
136
+ ### 2.1 Prerequisites
137
+
138
+ ```bash
139
+ # 1. System Requirements
140
+ # - Python 3.10+
141
+ # - NVIDIA GPU with CUDA 12.0+ (optional but recommended)
142
+ # - 16GB+ RAM
143
+ # - 50GB+ disk space
144
+
145
+ # 2. Install Ollama (if not installed)
146
+ curl -fsSL https://ollama.com/install.sh | sh
147
+
148
+ # 3. Start Ollama server
149
+ ollama serve
150
+ ```
151
+
152
+ ### 2.2 Installation
153
+
154
+ ```bash
155
+ cd /home/mhamdan/SPARKNET
156
+
157
+ # Option A: Use existing virtual environment
158
+ source sparknet/bin/activate
159
+
160
+ # Option B: Create new environment
161
+ python3 -m venv sparknet
162
+ source sparknet/bin/activate
163
+
164
+ # Install dependencies
165
+ pip install -r requirements.txt
166
+ pip install -r demo/requirements.txt
167
+
168
+ # Install SPARKNET in development mode
169
+ pip install -e .
170
+ ```
171
+
172
+ ### 2.3 Download Required Models
173
+
174
+ ```bash
175
+ # Embedding model (required for RAG)
176
+ ollama pull nomic-embed-text:latest
177
+
178
+ # LLM models (at least one required)
179
+ ollama pull llama3.2:latest # Fast, 2GB
180
+ ollama pull llama3.1:8b # General purpose, 5GB
181
+ ollama pull mistral:latest # Good reasoning, 4GB
182
+
183
+ # Optional: Larger models for complex tasks
184
+ ollama pull qwen2.5:14b # Complex reasoning, 9GB
185
+ ```
186
+
187
+ ### 2.4 Running the Demo UI
188
+
189
+ **Method 1: Using the launcher script**
190
+ ```bash
191
+ cd /home/mhamdan/SPARKNET
192
+ ./run_demo.sh 8501
193
+ ```
194
+
195
+ **Method 2: Direct Streamlit command**
196
+ ```bash
197
+ cd /home/mhamdan/SPARKNET
198
+ source sparknet/bin/activate
199
+ streamlit run demo/app.py --server.port 8501
200
+ ```
201
+
202
+ **Method 3: Bind to specific IP (for remote access)**
203
+ ```bash
204
+ streamlit run demo/app.py \
205
+ --server.address 172.24.50.21 \
206
+ --server.port 8501 \
207
+ --server.headless true
208
+ ```
209
+
210
+ **Access at:** http://172.24.50.21:8501 or http://localhost:8501
211
+
212
+ ### 2.5 Running the API Server
213
+
214
+ ```bash
215
+ cd /home/mhamdan/SPARKNET
216
+ source sparknet/bin/activate
217
+ uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload
218
+ ```
219
+
220
+ **API Endpoints:**
221
+ - `GET /health` - Health check
222
+ - `POST /api/documents/parse` - Parse document
223
+ - `POST /api/documents/extract` - Extract fields
224
+ - `POST /api/rag/index` - Index document
225
+ - `POST /api/rag/query` - Query RAG
226
+
227
+ ### 2.6 Running Examples
228
+
229
+ ```bash
230
+ cd /home/mhamdan/SPARKNET
231
+ source sparknet/bin/activate
232
+
233
+ # Document Intelligence Demo
234
+ python examples/document_intelligence_demo.py
235
+
236
+ # RAG End-to-End Pipeline
237
+ python examples/document_rag_end_to_end.py
238
+
239
+ # Simple Agent Task
240
+ python examples/simple_task.py
241
+
242
+ # Document Agent
243
+ python examples/document_agent.py
244
+ ```
245
+
246
+ ### 2.7 Running Tests
247
+
248
+ ```bash
249
+ cd /home/mhamdan/SPARKNET
250
+ source sparknet/bin/activate
251
+
252
+ # Run all tests
253
+ pytest tests/ -v
254
+
255
+ # Run specific test suites
256
+ pytest tests/unit/test_document_intelligence.py -v
257
+ pytest tests/unit/test_rag_integration.py -v
258
+
259
+ # Run with coverage
260
+ pytest tests/ --cov=src --cov-report=html
261
+ ```
262
+
263
+ ---
264
+
265
+ ## 3. Configuration
266
+
267
+ ### 3.1 RAG Configuration (`configs/rag.yaml`)
268
+
269
+ ```yaml
270
+ vector_store:
271
+ type: chroma
272
+ chroma:
273
+ persist_directory: "./.sparknet/chroma_db"
274
+ collection_name: "sparknet_documents"
275
+ distance_metric: cosine
276
+
277
+ embeddings:
278
+ provider: ollama # Local-first
279
+ ollama:
280
+ model: nomic-embed-text
281
+ base_url: "http://localhost:11434"
282
+ openai:
283
+ enabled: false # Disabled by default
284
+
285
+ generator:
286
+ provider: ollama
287
+ ollama:
288
+ model: llama3.2
289
+ abstain_on_low_confidence: true
290
+ abstain_threshold: 0.3
291
+ ```
292
+
293
+ ### 3.2 Document Configuration (`config/document.yaml`)
294
+
295
+ ```yaml
296
+ ocr:
297
+ engine: paddleocr # or tesseract
298
+ languages: ["en"]
299
+ confidence_threshold: 0.5
300
+
301
+ layout:
302
+ enabled: true
303
+ reading_order: true
304
+
305
+ chunking:
306
+ min_chunk_chars: 10
307
+ max_chunk_chars: 4000
308
+ target_chunk_chars: 500
309
+ ```
310
+
311
+ ---
312
+
313
+ ## 4. FAANG Best Practices Applied
314
+
315
+ ### 4.1 Google-Inspired Patterns
316
+ - **DocAI Architecture**: Modular vision-first document understanding
317
+ - **Structured Output**: Schema-driven extraction with validation
318
+ - **Abstention Policy**: Never hallucinate, return "I don't know"
319
+
320
+ ### 4.2 Meta-Inspired Patterns
321
+ - **FAISS Integration**: Fast similarity search (optional alongside ChromaDB)
322
+ - **RAG Pipeline**: Retrieve-then-generate with citations
323
+
324
+ ### 4.3 Amazon-Inspired Patterns
325
+ - **Textract-like API**: Structured field extraction with confidence scores
326
+ - **Evidence Grounding**: Every output traceable to source
327
+
328
+ ### 4.4 Microsoft-Inspired Patterns
329
+ - **Form Recognizer Pattern**: Pre-built schemas for invoices, contracts
330
+ - **Confidence Thresholds**: Configurable abstention levels
331
+
332
+ ### 4.5 Apple-Inspired Patterns
333
+ - **Privacy-First**: All processing local by default
334
+ - **Opt-In Cloud**: OpenAI and cloud services disabled by default
335
+
336
+ ---
337
+
338
+ ## 5. Quick Start Commands
339
+
340
+ ```bash
341
+ # === SETUP ===
342
+ cd /home/mhamdan/SPARKNET
343
+ source sparknet/bin/activate
344
+ ollama serve & # Start in background
345
+
346
+ # === DEMO UI ===
347
+ streamlit run demo/app.py --server.port 8501
348
+
349
+ # === CLI USAGE ===
350
+ # Parse a document
351
+ python -m src.cli.main docint parse Dataset/IBM*.pdf -o result.json
352
+
353
+ # Index for RAG
354
+ python -m src.cli.main docint index Dataset/*.pdf
355
+
356
+ # Ask questions with RAG
357
+ python -m src.cli.main docint ask Dataset/IBM*.pdf "What is this document about?" --use-rag
358
+
359
+ # === PYTHON API ===
360
+ python -c "
361
+ from src.document_intelligence import DocumentParser
362
+ parser = DocumentParser()
363
+ result = parser.parse('Dataset/IBM N_A.pdf')
364
+ print(f'Parsed {len(result.chunks)} chunks')
365
+ "
366
+
367
+ # === RUN TESTS ===
368
+ pytest tests/unit/ -v
369
+ ```
370
+
371
+ ---
372
+
373
+ ## 6. Troubleshooting
374
+
375
+ ### Issue: Ollama not running
376
+ ```bash
377
+ # Check status
378
+ curl http://localhost:11434/api/tags
379
+
380
+ # Start Ollama
381
+ ollama serve
382
+
383
+ # If port in use
384
+ pkill ollama && ollama serve
385
+ ```
386
+
387
+ ### Issue: Missing models
388
+ ```bash
389
+ ollama list # See installed models
390
+ ollama pull nomic-embed-text # Install embedding model
391
+ ollama pull llama3.2 # Install LLM
392
+ ```
393
+
394
+ ### Issue: ChromaDB errors
395
+ ```bash
396
+ # Reset vector store
397
+ rm -rf .sparknet/chroma_db
398
+ ```
399
+
400
+ ### Issue: Import errors
401
+ ```bash
402
+ # Ensure in correct directory
403
+ cd /home/mhamdan/SPARKNET
404
+
405
+ # Ensure venv activated
406
+ source sparknet/bin/activate
407
+
408
+ # Reinstall
409
+ pip install -e .
410
+ ```
411
+
412
+ ---
413
+
414
+ ## 7. Architecture Diagram
415
+
416
+ ```
417
+ ┌─────────────────────────────────────────────────────────────────┐
418
+ │ SPARKNET Platform │
419
+ ├─────────────────────────────────────────────────────────────────┤
420
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
421
+ │ │ Streamlit │ │ FastAPI │ │ CLI │ Interfaces │
422
+ │ │ Demo │ │ API │ │ Commands │ │
423
+ │ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
424
+ ├─────────┴────────────────┴────────────────┴─────────────────────┤
425
+ │ │
426
+ │ ┌──────────────────────────────────────────────────────────┐ │
427
+ │ │ Agent Layer │ │
428
+ │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │
429
+ │ │ │ Document │ │ Executor │ │ Planner │ │ Critic │ │ │
430
+ │ │ │ Agent │ │ Agent │ │ Agent │ │ Agent │ │ │
431
+ │ │ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ │
432
+ │ └───────┴────────────┴────────────┴────────────┴───────────┘ │
433
+ │ │
434
+ │ ┌────────────────────┐ ┌─────────────────────────────────┐ │
435
+ │ │ Document Intel │ │ RAG Subsystem │ │
436
+ │ │ ┌───────┐ ┌──────┐ │ │ ┌─────────┐ ┌─────────────────┐ │ │
437
+ │ │ │Parser │ │Extract│ │ │ │Indexer │ │ Retriever │ │ │
438
+ │ │ └───────┘ └──────┘ │ │ └─────────┘ └─────────────────┘ │ │
439
+ │ │ ┌───────┐ ┌──────┐ │ │ ┌─────────┐ ┌─────────────────┐ │ │
440
+ │ │ │Ground │ │Valid │ │ │ │Embedder │ │ Generator │ │ │
441
+ │ │ └───────┘ └──────┘ │ │ └─────────┘ └──────��──────────┘ │ │
442
+ │ └────────────────────┘ └─────────────────────────────────┘ │
443
+ │ │
444
+ │ ┌─────────────────────────────────────────────────────────┐ │
445
+ │ │ Infrastructure │ │
446
+ │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │
447
+ │ │ │ Ollama │ │ ChromaDB │ │ GPU │ │ Cache │ │ │
448
+ │ │ │ Client │ │ Store │ │ Manager │ │ Layer │ │ │
449
+ │ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │
450
+ │ └─────────────────────────────────────────────────────────┘ │
451
+ └─────────────────────────────────────────────────────────────────┘
452
+ ```
453
+
454
+ ---
455
+
456
+ ## 8. Files Modified/Created in Recent Session
457
+
458
+ | File | Action | Description |
459
+ |------|--------|-------------|
460
+ | `src/rag/docint_bridge.py` | Created | Bridge between document_intelligence and RAG |
461
+ | `src/document_intelligence/tools/rag_tools.py` | Created | RAG tools for agents |
462
+ | `src/document_intelligence/tools/__init__.py` | Modified | Added RAG tool exports |
463
+ | `src/document_intelligence/tools/document_tools.py` | Modified | Enhanced AnswerQuestionTool with RAG |
464
+ | `src/cli/docint.py` | Modified | Added index, retrieve, delete-index commands |
465
+ | `src/rag/__init__.py` | Modified | Added bridge exports |
466
+ | `configs/rag.yaml` | Created | RAG configuration file |
467
+ | `tests/unit/test_rag_integration.py` | Created | RAG integration tests |
468
+ | `examples/document_rag_end_to_end.py` | Created | End-to-end RAG example |
469
+
470
+ ---
471
+
472
+ **Report Complete**
473
+
474
+ For questions or issues, refer to the troubleshooting section above or check the test files for usage examples.
api/auth.py ADDED
@@ -0,0 +1,320 @@
+ """
+ SPARKNET Authentication Module
+ JWT-based authentication with OAuth2 support.
+ """
+
+ from fastapi import Depends, HTTPException, status
+ from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
+ from jose import JWTError, jwt
+ from passlib.context import CryptContext
+ from pydantic import BaseModel
+ from datetime import datetime, timedelta
+ from typing import Optional, List
+ from pathlib import Path
+ import os
+ import json
+ import uuid
+
+ # Configuration (use environment variables in production)
+ SECRET_KEY = os.getenv("SPARKNET_SECRET_KEY", "sparknet-super-secret-key-change-in-production")
+ ALGORITHM = "HS256"
+ ACCESS_TOKEN_EXPIRE_MINUTES = 30
+
+ # Password hashing
+ pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
+
+ # OAuth2 scheme
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="api/auth/token", auto_error=False)
+
+ # Simple file-based user store (replace with database in production)
+ USERS_FILE = Path(__file__).parent.parent / "data" / "users.json"
+ USERS_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+
+ class User(BaseModel):
+     """User model."""
+     user_id: str
+     username: str
+     email: str
+     hashed_password: str
+     is_active: bool = True
+     is_admin: bool = False
+     scopes: List[str] = []
+     created_at: datetime = None
+
+     class Config:
+         json_encoders = {
+             datetime: lambda v: v.isoformat() if v else None
+         }
+
+
+ class UserInDB(User):
+     """User model with password hash."""
+     pass
+
+
+ class TokenData(BaseModel):
+     """JWT token payload."""
+     username: Optional[str] = None
+     user_id: Optional[str] = None
+     scopes: List[str] = []
+
+
+ def _load_users() -> dict:
+     """Load users from file."""
+     if USERS_FILE.exists():
+         try:
+             with open(USERS_FILE) as f:
+                 data = json.load(f)
+             return {u["username"]: User(**u) for u in data}
+         except Exception:
+             pass
+     return {}
+
+
+ def _save_users(users: dict):
+     """Save users to file."""
+     with open(USERS_FILE, "w") as f:
+         json.dump([u.dict() for u in users.values()], f, default=str, indent=2)
+
+
+ def verify_password(plain_password: str, hashed_password: str) -> bool:
+     """Verify a password against its hash."""
+     return pwd_context.verify(plain_password, hashed_password)
+
+
+ def get_password_hash(password: str) -> str:
+     """Hash a password."""
+     return pwd_context.hash(password)
+
+
+ def get_user(username: str) -> Optional[UserInDB]:
+     """Get a user by username."""
+     users = _load_users()
+     if username in users:
+         return UserInDB(**users[username].dict())
+     return None
+
+
+ def authenticate_user(username: str, password: str) -> Optional[UserInDB]:
+     """Authenticate a user."""
+     user = get_user(username)
+     if not user:
+         return None
+     if not verify_password(password, user.hashed_password):
+         return None
+     return user
+
+
+ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
+     """Create a JWT access token."""
+     to_encode = data.copy()
+     if expires_delta:
+         expire = datetime.utcnow() + expires_delta
+     else:
+         expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
+     to_encode.update({"exp": expire})
+     encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
+     return encoded_jwt
+
+
+ async def get_current_user(token: str = Depends(oauth2_scheme)) -> Optional[UserInDB]:
+     """Get the current user from JWT token."""
+     if not token:
+         return None
+
+     try:
+         payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+         username: str = payload.get("sub")
+         if username is None:
+             return None
+         token_data = TokenData(
+             username=username,
+             user_id=payload.get("user_id"),
+             scopes=payload.get("scopes", [])
+         )
+     except JWTError:
+         return None
+
+     user = get_user(token_data.username)
+     return user
+
+
+ async def get_current_active_user(
+     current_user: Optional[UserInDB] = Depends(get_current_user)
+ ) -> Optional[UserInDB]:
+     """Get current active user (authentication optional)."""
+     if current_user and not current_user.is_active:
+         return None
+     return current_user
+
+
+ async def require_auth(
+     current_user: Optional[UserInDB] = Depends(get_current_user)
+ ) -> UserInDB:
+     """Require authentication (raises exception if not authenticated)."""
+     credentials_exception = HTTPException(
+         status_code=status.HTTP_401_UNAUTHORIZED,
+         detail="Could not validate credentials",
+         headers={"WWW-Authenticate": "Bearer"},
+     )
+     if not current_user:
+         raise credentials_exception
+     if not current_user.is_active:
+         raise HTTPException(status_code=400, detail="Inactive user")
+     return current_user
+
+
+ async def require_admin(
+     current_user: UserInDB = Depends(require_auth)
+ ) -> UserInDB:
+     """Require admin privileges."""
+     if not current_user.is_admin:
+         raise HTTPException(
+             status_code=status.HTTP_403_FORBIDDEN,
+             detail="Admin privileges required"
+         )
+     return current_user
+
+
+ def create_user(username: str, email: str, password: str, is_admin: bool = False) -> User:
+     """Create a new user."""
+     users = _load_users()
+
+     if username in users:
+         raise ValueError(f"User {username} already exists")
+
+     user = User(
+         user_id=str(uuid.uuid4()),
+         username=username,
+         email=email,
+         hashed_password=get_password_hash(password),
+         is_active=True,
+         is_admin=is_admin,
+         scopes=["read", "write"] if not is_admin else ["read", "write", "admin"],
+         created_at=datetime.now()
+     )
+
+     users[username] = user
+     _save_users(users)
+     return user
+
+
+ def delete_user(username: str) -> bool:
+     """Delete a user."""
+     users = _load_users()
+     if username in users:
+         del users[username]
+         _save_users(users)
+         return True
+     return False
+
+
+ # Initialize default admin user if none exists
+ def init_default_admin():
+     """Create default admin user if no users exist."""
+     users = _load_users()
+     if not users:
+         try:
+             create_user(
+                 username="admin",
+                 email="admin@sparknet.local",
+                 password="admin123",  # Change in production!
+                 is_admin=True
+             )
+             print("Default admin user created: admin / admin123")
+         except Exception as e:
+             print(f"Could not create default admin: {e}")
+
+
+ # Auth routes
+ from fastapi import APIRouter
+
+ auth_router = APIRouter()
+
+
+ @auth_router.post("/token")
+ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
+     """OAuth2 compatible token login."""
+     user = authenticate_user(form_data.username, form_data.password)
+     if not user:
+         raise HTTPException(
+             status_code=status.HTTP_401_UNAUTHORIZED,
+             detail="Incorrect username or password",
+             headers={"WWW-Authenticate": "Bearer"},
+         )
+     access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
+     access_token = create_access_token(
+         data={
+             "sub": user.username,
+             "user_id": user.user_id,
+             "scopes": user.scopes
+         },
+         expires_delta=access_token_expires
+     )
+     return {
+         "access_token": access_token,
+         "token_type": "bearer",
+         "expires_in": ACCESS_TOKEN_EXPIRE_MINUTES * 60
+     }
+
+
+ @auth_router.post("/register")
+ async def register_user(
+     username: str,
+     email: str,
+     password: str,
+ ):
+     """Register a new user."""
+     try:
+         user = create_user(username, email, password)
+         return {
+             "user_id": user.user_id,
+             "username": user.username,
+             "email": user.email,
+             "message": "User created successfully"
+         }
+     except ValueError as e:
+         raise HTTPException(status_code=400, detail=str(e))
+
+
+ @auth_router.get("/me")
+ async def read_users_me(current_user: UserInDB = Depends(require_auth)):
+     """Get current user information."""
+     return {
+         "user_id": current_user.user_id,
+         "username": current_user.username,
+         "email": current_user.email,
+         "is_active": current_user.is_active,
+         "is_admin": current_user.is_admin,
+         "scopes": current_user.scopes
+     }
+
+
+ @auth_router.get("/users")
+ async def list_users(current_user: UserInDB = Depends(require_admin)):
+     """List all users (admin only)."""
+     users = _load_users()
+     return [
+         {
+             "user_id": u.user_id,
+             "username": u.username,
+             "email": u.email,
+             "is_active": u.is_active,
+             "is_admin": u.is_admin
+         }
+         for u in users.values()
+     ]
+
+
+ @auth_router.delete("/users/{username}")
+ async def delete_user_endpoint(
+     username: str,
+     current_user: UserInDB = Depends(require_admin)
+ ):
+     """Delete a user (admin only)."""
+     if username == current_user.username:
+         raise HTTPException(status_code=400, detail="Cannot delete yourself")
+     if delete_user(username):
+         return {"status": "deleted", "username": username}
+     raise HTTPException(status_code=404, detail=f"User not found: {username}")
api/routes/documents.py ADDED
@@ -0,0 +1,553 @@
+ """
+ SPARKNET Document API Routes
+ Endpoints for document upload, processing, and management.
+ """
+
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Depends, BackgroundTasks
+ from fastapi.responses import StreamingResponse
+ from typing import List, Optional
+ from pathlib import Path
+ from datetime import datetime
+ import hashlib
+ import shutil
+ import uuid
+ import io
+ import sys
+
+ # Add project root to path
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
+ sys.path.insert(0, str(PROJECT_ROOT))
+
+ from api.schemas import (
+     DocumentUploadResponse, DocumentResponse, DocumentMetadata,
+     DocumentDetailResponse, ChunksResponse, ChunkInfo,
+     OCRRegionInfo, LayoutRegionInfo, DocumentStatus,
+     IndexRequest, IndexResponse, BatchIndexRequest, BatchIndexResponse
+ )
+ from loguru import logger
+
+ router = APIRouter()
+
+ # In-memory document store (replace with database in production)
+ _documents = {}
+ _processing_tasks = {}
+
+ # Supported file types
+ SUPPORTED_EXTENSIONS = {
+     '.pdf': 'application/pdf',
+     '.png': 'image/png',
+     '.jpg': 'image/jpeg',
+     '.jpeg': 'image/jpeg',
+     '.tiff': 'image/tiff',
+     '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+     '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+     '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+     '.txt': 'text/plain',
+     '.md': 'text/markdown',
+ }
+
+ UPLOAD_DIR = PROJECT_ROOT / "uploads" / "documents"
+ UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+
+
+ def generate_doc_id(filename: str, content: bytes) -> str:
+     """Generate unique document ID from filename and content hash."""
+     content_hash = hashlib.md5(content[:4096]).hexdigest()[:8]
+     timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+     return f"doc_{timestamp}_{content_hash}"
+
+
+ async def process_document_task(doc_id: str, file_path: Path, file_type: str):
+     """Background task to process a document."""
+     try:
+         logger.info(f"Processing document: {doc_id}")
+         _documents[doc_id]["status"] = DocumentStatus.PROCESSING
+
+         # Try to use actual document processor
+         try:
+             from src.document.pipeline.processor import DocumentProcessor, PipelineConfig
+
+             config = PipelineConfig(
+                 ocr_enabled=True,
+                 layout_enabled=True,
+                 chunking_enabled=True,
+             )
+             processor = DocumentProcessor(config)
+             result = processor.process(str(file_path))
+
+             # Extract data from result
+             chunks = []
+             for i, chunk in enumerate(getattr(result, 'chunks', [])):
+                 chunks.append({
+                     "chunk_id": f"{doc_id}_chunk_{i}",
+                     "doc_id": doc_id,
+                     "text": getattr(chunk, 'text', str(chunk)),
+                     "chunk_type": getattr(chunk, 'chunk_type', 'text'),
+                     "page_num": getattr(chunk, 'page', 0),
+                     "confidence": getattr(chunk, 'confidence', 1.0),
+                     "bbox": getattr(chunk, 'bbox', None),
+                 })
+
+             _documents[doc_id].update({
+                 "status": DocumentStatus.COMPLETED,
+                 "raw_text": getattr(result, 'raw_text', ''),
+                 "chunks": chunks,
+                 "page_count": getattr(result, 'page_count', 1),
+                 "ocr_regions": getattr(result, 'ocr_regions', []),
+                 "layout_regions": getattr(result, 'layout_regions', []),
+                 "processing_time": getattr(result, 'processing_time', 0.0),
+                 "updated_at": datetime.now(),
+             })
+
+             logger.success(f"Document {doc_id} processed successfully: {len(chunks)} chunks")
+
+         except Exception as proc_error:
+             logger.warning(f"Full processor unavailable: {proc_error}, using fallback")
+             # Fallback: simple text extraction
+             raw_text = ""
+
+             if file_type in ['.pdf']:
+                 try:
+                     import fitz
+                     doc = fitz.open(str(file_path))
+                     for page in doc:
+                         raw_text += page.get_text() + "\n"
+                     page_count = len(doc)
+                     doc.close()
+                 except Exception as e:
+                     logger.error(f"PDF extraction failed: {e}")
+                     page_count = 1
+
+             elif file_type in ['.txt', '.md']:
+                 raw_text = file_path.read_text(errors='ignore')
+                 page_count = 1
+
+             elif file_type == '.docx':
+                 try:
+                     from docx import Document
+                     doc = Document(str(file_path))
+                     raw_text = "\n".join([p.text for p in doc.paragraphs])
+                     page_count = max(1, len(raw_text) // 3000)
+                 except Exception as e:
+                     logger.error(f"DOCX extraction failed: {e}")
+                     page_count = 1
+
+             elif file_type == '.xlsx':
+                 try:
+                     import pandas as pd
+                     df_dict = pd.read_excel(str(file_path), sheet_name=None)
+                     for sheet_name, df in df_dict.items():
+                         raw_text += f"\n=== Sheet: {sheet_name} ===\n"
+                         raw_text += df.to_string() + "\n"
+                     page_count = len(df_dict)
+                 except Exception as e:
+                     logger.error(f"XLSX extraction failed: {e}")
+                     page_count = 1
+
+             elif file_type == '.pptx':
+                 try:
+                     from pptx import Presentation
+                     prs = Presentation(str(file_path))
+                     for i, slide in enumerate(prs.slides):
+                         raw_text += f"\n=== Slide {i+1} ===\n"
+                         for shape in slide.shapes:
+                             if hasattr(shape, "text"):
+                                 raw_text += shape.text + "\n"
+                     page_count = len(prs.slides)
+                 except Exception as e:
+                     logger.error(f"PPTX extraction failed: {e}")
+                     page_count = 1
+
+             # Create simple chunks
+             chunks = []
+             chunk_size = 1000
+             text_chunks = [raw_text[i:i+chunk_size] for i in range(0, len(raw_text), chunk_size - 100)]
+             for i, text in enumerate(text_chunks):
+                 if text.strip():
+                     chunks.append({
+                         "chunk_id": f"{doc_id}_chunk_{i}",
+                         "doc_id": doc_id,
+                         "text": text.strip(),
+                         "chunk_type": "text",
+                         "page_num": min(i * chunk_size // 3000 + 1, page_count),
+                         "confidence": 1.0,
+                         "bbox": None,
+                     })
+
+             _documents[doc_id].update({
+                 "status": DocumentStatus.COMPLETED,
+                 "raw_text": raw_text,
+                 "chunks": chunks,
+                 "page_count": page_count,
+                 "ocr_regions": [],
+                 "layout_regions": [],
+                 "processing_time": 0.0,
+                 "updated_at": datetime.now(),
+             })
+
+             logger.info(f"Document {doc_id} processed with fallback: {len(chunks)} chunks")
+
+     except Exception as e:
+         logger.error(f"Document processing failed for {doc_id}: {e}")
+         _documents[doc_id]["status"] = DocumentStatus.ERROR
+         _documents[doc_id]["error"] = str(e)
+
+
+ @router.post("/upload", response_model=DocumentUploadResponse)
+ async def upload_document(
+     background_tasks: BackgroundTasks,
+     file: UploadFile = File(...),
+     auto_process: bool = Query(True, description="Automatically process after upload"),
+     auto_index: bool = Query(False, description="Automatically index to RAG after processing"),
+ ):
+     """
+     Upload a document for processing.
+
+     Supported formats: PDF, PNG, JPG, DOCX, XLSX, PPTX, TXT, MD
+     """
+     # Validate file extension
+     file_ext = Path(file.filename).suffix.lower()
+     if file_ext not in SUPPORTED_EXTENSIONS:
+         raise HTTPException(
+             status_code=400,
+             detail=f"Unsupported file type: {file_ext}. Supported: {list(SUPPORTED_EXTENSIONS.keys())}"
+         )
+
+     # Read file content
+     content = await file.read()
+     if len(content) == 0:
+         raise HTTPException(status_code=400, detail="Empty file uploaded")
+
+     # Generate document ID
+     doc_id = generate_doc_id(file.filename, content)
+
+     # Save file
+     file_path = UPLOAD_DIR / f"{doc_id}{file_ext}"
+     with open(file_path, "wb") as f:
+         f.write(content)
+
+     # Create document record
+     _documents[doc_id] = {
+         "doc_id": doc_id,
+         "filename": file.filename,
+         "file_type": file_ext,
+         "file_path": str(file_path),
+         "status": DocumentStatus.PENDING,
+         "raw_text": "",
+         "chunks": [],
+         "page_count": 0,
+         "ocr_regions": [],
+         "layout_regions": [],
+         "indexed": False,
+         "indexed_chunks": 0,
+         "processing_time": None,
+         "created_at": datetime.now(),
+         "updated_at": None,
+         "auto_index": auto_index,
+     }
+
+     # Start processing in background
+     if auto_process:
+         background_tasks.add_task(process_document_task, doc_id, file_path, file_ext)
+         status = DocumentStatus.PROCESSING
+         message = "Document uploaded and processing started"
+     else:
+         status = DocumentStatus.PENDING
+         message = "Document uploaded successfully. Call /process to begin processing."
+
+     _documents[doc_id]["status"] = status
+
+     return DocumentUploadResponse(
+         doc_id=doc_id,
+         filename=file.filename,
+         status=status,
+         message=message,
+         created_at=_documents[doc_id]["created_at"]
+     )
+
+
+ @router.get("", response_model=List[DocumentMetadata])
+ async def list_documents(
+     status: Optional[DocumentStatus] = Query(None, description="Filter by status"),
+     indexed: Optional[bool] = Query(None, description="Filter by indexed status"),
+     limit: int = Query(50, ge=1, le=200),
+     offset: int = Query(0, ge=0),
+ ):
+     """List all documents with optional filtering."""
+     docs = list(_documents.values())
+
+     # Apply filters
+     if status:
+         docs = [d for d in docs if d["status"] == status]
+     if indexed is not None:
+         docs = [d for d in docs if d.get("indexed", False) == indexed]
+
+     # Apply pagination
+     docs = docs[offset:offset + limit]
+
+     return [
+         DocumentMetadata(
+             doc_id=d["doc_id"],
+             filename=d["filename"],
+             file_type=d["file_type"],
+             page_count=d.get("page_count", 0),
+             chunk_count=len(d.get("chunks", [])),
+             text_length=len(d.get("raw_text", "")),
+             status=d["status"],
+             indexed=d.get("indexed", False),
+             indexed_chunks=d.get("indexed_chunks", 0),
+             processing_time=d.get("processing_time"),
+             created_at=d["created_at"],
+             updated_at=d.get("updated_at"),
+         )
+         for d in docs
+     ]
+
+
+ @router.get("/{doc_id}", response_model=DocumentResponse)
+ async def get_document(
+     doc_id: str,
+     include_text: bool = Query(False, description="Include full raw text"),
+ ):
+     """Get document by ID."""
+     if doc_id not in _documents:
+         raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")
+
+     d = _documents[doc_id]
+
+     return DocumentResponse(
+         doc_id=d["doc_id"],
+         filename=d["filename"],
+         file_type=d["file_type"],
+         status=d["status"],
+         metadata=DocumentMetadata(
+             doc_id=d["doc_id"],
+             filename=d["filename"],
+             file_type=d["file_type"],
+             page_count=d.get("page_count", 0),
+             chunk_count=len(d.get("chunks", [])),
+             text_length=len(d.get("raw_text", "")),
+             status=d["status"],
+             indexed=d.get("indexed", False),
+             indexed_chunks=d.get("indexed_chunks", 0),
+             processing_time=d.get("processing_time"),
+             created_at=d["created_at"],
+             updated_at=d.get("updated_at"),
+         ),
+         raw_text=d.get("raw_text") if include_text else None,
+         preview=d.get("raw_text", "")[:500] if d.get("raw_text") else None,
+     )
+
+
+ @router.get("/{doc_id}/detail", response_model=DocumentDetailResponse)
+ async def get_document_detail(doc_id: str):
+     """Get detailed document information including chunks and regions."""
+     if doc_id not in _documents:
+         raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")
+
+     d = _documents[doc_id]
+
+     return DocumentDetailResponse(
+         doc_id=d["doc_id"],
+         filename=d["filename"],
+         status=d["status"],
+         metadata=DocumentMetadata(
+             doc_id=d["doc_id"],
+             filename=d["filename"],
+             file_type=d["file_type"],
+             page_count=d.get("page_count", 0),
+             chunk_count=len(d.get("chunks", [])),
+             text_length=len(d.get("raw_text", "")),
+             status=d["status"],
+             indexed=d.get("indexed", False),
+             indexed_chunks=d.get("indexed_chunks", 0),
+             processing_time=d.get("processing_time"),
+             created_at=d["created_at"],
+             updated_at=d.get("updated_at"),
+         ),
+         chunks=[ChunkInfo(**c) for c in d.get("chunks", [])],
+         ocr_regions=[OCRRegionInfo(**r) for r in d.get("ocr_regions", []) if isinstance(r, dict)],
+         layout_regions=[LayoutRegionInfo(**r) for r in d.get("layout_regions", []) if isinstance(r, dict)],
+     )
+
+
+ @router.get("/{doc_id}/chunks", response_model=ChunksResponse)
+ async def get_document_chunks(
+     doc_id: str,
+     page: Optional[int] = Query(None, description="Filter by page number"),
+     chunk_type: Optional[str] = Query(None, description="Filter by chunk type"),
+ ):
+     """Get all chunks for a document."""
+     if doc_id not in _documents:
+         raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")
+
+     d = _documents[doc_id]
+     chunks = d.get("chunks", [])
+
+     # Apply filters
+     if page is not None:
+         chunks = [c for c in chunks if c.get("page_num") == page]
+     if chunk_type:
+         chunks = [c for c in chunks if c.get("chunk_type") == chunk_type]
+
+     return ChunksResponse(
+         doc_id=doc_id,
+         total_chunks=len(chunks),
+         chunks=[ChunkInfo(**c) for c in chunks],
+     )
+
+
+ @router.post("/{doc_id}/process")
+ async def process_document(
+     doc_id: str,
+     background_tasks: BackgroundTasks,
+     force: bool = Query(False, description="Force reprocessing"),
+ ):
+     """Trigger document processing."""
+     if doc_id not in _documents:
+         raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")
+
+     d = _documents[doc_id]
+
+     if d["status"] == DocumentStatus.PROCESSING:
+         raise HTTPException(status_code=400, detail="Document is already being processed")
+
+     if d["status"] == DocumentStatus.COMPLETED and not force:
+         raise HTTPException(
+             status_code=400,
+             detail="Document already processed. Use force=true to reprocess."
+         )
+
+     file_path = Path(d["file_path"])
+     if not file_path.exists():
+         raise HTTPException(status_code=404, detail="Document file not found")
+
+     background_tasks.add_task(process_document_task, doc_id, file_path, d["file_type"])
+     _documents[doc_id]["status"] = DocumentStatus.PROCESSING
+
+     return {"doc_id": doc_id, "status": "processing", "message": "Processing started"}
+
+
+ @router.delete("/{doc_id}")
+ async def delete_document(doc_id: str):
+     """Delete a document."""
+     if doc_id not in _documents:
+         raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")
+
+     d = _documents[doc_id]
+
+     # Delete file
+     file_path = Path(d["file_path"])
+     if file_path.exists():
+         file_path.unlink()
+
+     # Remove from store
+     del _documents[doc_id]
+
+     return {"doc_id": doc_id, "status": "deleted", "message": "Document deleted successfully"}
+
450
+ @router.post("/{doc_id}/index", response_model=IndexResponse)
451
+ async def index_document(doc_id: str, force_reindex: bool = Query(False)):
452
+ """Index a document to the RAG vector store."""
453
+ if doc_id not in _documents:
454
+ raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")
455
+
456
+ d = _documents[doc_id]
457
+
458
+ if d["status"] != DocumentStatus.COMPLETED:
459
+ raise HTTPException(
460
+ status_code=400,
461
+ detail=f"Document not ready for indexing. Current status: {d['status']}"
462
+ )
463
+
464
+ if d.get("indexed") and not force_reindex:
465
+ return IndexResponse(
466
+ doc_id=doc_id,
467
+ status="already_indexed",
468
+ chunks_indexed=d.get("indexed_chunks", 0),
469
+ message="Document already indexed. Use force_reindex=true to reindex."
470
+ )
471
+
472
+ try:
473
+ # Try to use actual indexer
474
+ from src.rag.indexer import DocumentIndexer
475
+ from src.rag.embeddings import get_embedding_model
476
+ from src.rag.store import get_vector_store
477
+
478
+ embeddings = get_embedding_model()
479
+ store = get_vector_store()
480
+ indexer = DocumentIndexer(embeddings, store)
481
+
482
+ # Index chunks
483
+ chunks_to_index = d.get("chunks", [])
484
+ indexed_count = 0
485
+
486
+ for chunk in chunks_to_index:
487
+ try:
488
+ indexer.index_chunk(
489
+ text=chunk["text"],
490
+ document_id=doc_id,
491
+ chunk_id=chunk["chunk_id"],
492
+ metadata={
493
+ "filename": d["filename"],
494
+ "page_num": chunk.get("page_num"),
495
+ "chunk_type": chunk.get("chunk_type", "text"),
496
+ }
497
+ )
498
+ indexed_count += 1
499
+ except Exception as e:
500
+ logger.warning(f"Failed to index chunk {chunk['chunk_id']}: {e}")
501
+
502
+ _documents[doc_id]["indexed"] = True
503
+ _documents[doc_id]["indexed_chunks"] = indexed_count
504
+ _documents[doc_id]["status"] = DocumentStatus.INDEXED
505
+
506
+ return IndexResponse(
507
+ doc_id=doc_id,
508
+ status="indexed",
509
+ chunks_indexed=indexed_count,
510
+ message=f"Successfully indexed {indexed_count} chunks"
511
+ )
512
+
513
+ except Exception as e:
514
+ logger.error(f"Indexing failed for {doc_id}: {e}")
515
+ raise HTTPException(status_code=500, detail=f"Indexing failed: {str(e)}")
516
+
517
+
518
+ @router.post("/batch-index", response_model=BatchIndexResponse)
519
+ async def batch_index_documents(request: BatchIndexRequest):
520
+ """Batch index multiple documents."""
521
+ results = []
522
+ successful = 0
523
+ failed = 0
524
+
525
+ for doc_id in request.doc_ids:
526
+ try:
527
+ result = await index_document(doc_id, request.force_reindex)
528
+ results.append(result)
529
+ if result.status in ["indexed", "already_indexed"]:
530
+ successful += 1
531
+ else:
532
+ failed += 1
533
+ except HTTPException as e:
534
+ results.append(IndexResponse(
535
+ doc_id=doc_id,
536
+ status="error",
537
+ chunks_indexed=0,
538
+ message=e.detail
539
+ ))
540
+ failed += 1
541
+
542
+ return BatchIndexResponse(
543
+ total_requested=len(request.doc_ids),
544
+ successful=successful,
545
+ failed=failed,
546
+ results=results
547
+ )
548
+
549
+
550
+ # Export document store for other modules
551
+ def get_document_store():
552
+ """Get the in-memory document store."""
553
+ return _documents
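A minimal client sketch for exercising the endpoints above. The base URL and the `/documents` mount point are assumptions (the router prefix is set wherever the app includes this router, which is outside this file), and `sample.pdf` is a placeholder path; the flags and response fields mirror the handlers above.

```python
# Hypothetical usage sketch -- base URL and router prefix are assumptions.
import time
import httpx

BASE = "http://localhost:8000/documents"  # assumed mount point for this router

with httpx.Client(timeout=30.0) as client:
    # Upload and let the background task process the file.
    with open("sample.pdf", "rb") as f:
        doc = client.post(
            BASE,
            files={"file": ("sample.pdf", f, "application/pdf")},
        ).json()

    # Poll the status endpoint until background processing completes.
    while client.get(f"{BASE}/{doc['doc_id']}").json()["status"] == "processing":
        time.sleep(1.0)

    # Push the resulting chunks into the RAG vector store.
    print(client.post(f"{BASE}/{doc['doc_id']}/index").json()["message"])
```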
api/routes/rag.py ADDED
@@ -0,0 +1,415 @@
+ """
+ SPARKNET RAG API Routes
+ Endpoints for RAG queries, search, and indexing management.
+ """
+
+ from fastapi import APIRouter, HTTPException, Query, Depends
+ from fastapi.responses import StreamingResponse
+ from typing import List, Optional
+ from pathlib import Path
+ from datetime import datetime
+ import time
+ import json
+ import sys
+ import asyncio
+
+ # Add project root to path
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
+ sys.path.insert(0, str(PROJECT_ROOT))
+
+ from api.schemas import (
+    QueryRequest, RAGResponse, Citation, QueryPlan, QueryIntentType,
+    SearchRequest, SearchResponse, SearchResult,
+    StoreStatus, CollectionInfo
+ )
+ from loguru import logger
+
+ router = APIRouter()
+
+ # Simple in-memory cache for query results
+ _query_cache = {}
+ CACHE_TTL_SECONDS = 3600  # 1 hour
+
+
+ def get_cache_key(query: str, doc_ids: Optional[List[str]]) -> str:
+    """Generate cache key for query."""
+    import hashlib
+    doc_str = ",".join(sorted(doc_ids)) if doc_ids else "all"
+    content = f"{query}:{doc_str}"
+    return hashlib.md5(content.encode()).hexdigest()
+
+
+ def get_cached_response(cache_key: str) -> Optional[RAGResponse]:
+    """Get cached response if valid."""
+    if cache_key in _query_cache:
+        cached = _query_cache[cache_key]
+        if time.time() - cached["timestamp"] < CACHE_TTL_SECONDS:
+            response = cached["response"]
+            response.from_cache = True
+            return response
+        else:
+            del _query_cache[cache_key]
+    return None
+
+
+ def cache_response(cache_key: str, response: RAGResponse):
+    """Cache a query response."""
+    _query_cache[cache_key] = {
+        "response": response,
+        "timestamp": time.time()
+    }
+    # Limit cache size
+    if len(_query_cache) > 1000:
+        oldest_key = min(_query_cache, key=lambda k: _query_cache[k]["timestamp"])
+        del _query_cache[oldest_key]
+
+
+ def _get_rag_system():
+    """Get or initialize the RAG system."""
+    try:
+        from src.rag.agentic.orchestrator import AgenticRAG, RAGConfig
+
+        config = RAGConfig(
+            model_name="llama3.2:latest",
+            max_revision_attempts=2,
+            retrieval_top_k=10,
+            final_top_k=5,
+            min_confidence=0.5,
+        )
+        return AgenticRAG(config)
+    except Exception as e:
+        logger.error(f"Failed to initialize RAG system: {e}")
+        return None
+
+
+ @router.post("/query", response_model=RAGResponse)
+ async def query_documents(request: QueryRequest):
+    """
+    Execute a RAG query across indexed documents.
+
+    The query goes through the 5-agent pipeline:
+    1. QueryPlanner - Intent classification and query decomposition
+    2. Retriever - Hybrid dense+sparse search
+    3. Reranker - Cross-encoder reranking with MMR
+    4. Synthesizer - Answer generation with citations
+    5. Critic - Hallucination detection and validation
+    """
+    start_time = time.time()
+
+    # Check cache if enabled
+    if request.use_cache:
+        cache_key = get_cache_key(request.query, request.doc_ids)
+        cached = get_cached_response(cache_key)
+        if cached:
+            cached.latency_ms = (time.time() - start_time) * 1000
+            return cached
+
+    try:
+        # Initialize RAG system
+        rag = _get_rag_system()
+        if not rag:
+            raise HTTPException(status_code=503, detail="RAG system not available")
+
+        # Build filters
+        filters = {}
+        if request.doc_ids:
+            filters["document_id"] = {"$in": request.doc_ids}
+
+        # Execute query
+        logger.info(f"Executing RAG query: {request.query[:50]}...")
+
+        result = rag.query(
+            query=request.query,
+            filters=filters if filters else None,
+            top_k=request.top_k,
+        )
+
+        # Build response
+        citations = []
+        for i, source in enumerate(result.get("sources", [])):
+            citations.append(Citation(
+                citation_id=i + 1,
+                doc_id=source.get("document_id", "unknown"),
+                document_name=source.get("filename", source.get("document_id", "unknown")),
+                chunk_id=source.get("chunk_id", f"chunk_{i}"),
+                chunk_text=source.get("text", "")[:300],
+                page_num=source.get("page_num"),
+                relevance_score=source.get("relevance_score", source.get("score", 0.0)),
+                bbox=source.get("bbox"),
+            ))
+
+        # Query plan info
+        query_plan = None
+        if "plan" in result:
+            plan = result["plan"]
+            query_plan = QueryPlan(
+                intent=QueryIntentType(plan.get("intent", "factoid").lower()),
+                sub_queries=plan.get("sub_queries", []),
+                keywords=plan.get("keywords", []),
+                strategy=plan.get("strategy", "hybrid"),
+            )
+
+        response = RAGResponse(
+            query=request.query,
+            answer=result.get("answer", "I could not find an answer to your question."),
+            confidence=result.get("confidence", 0.0),
+            citations=citations,
+            source_count=len(citations),
+            query_plan=query_plan,
+            from_cache=False,
+            validation=result.get("validation"),
+            latency_ms=(time.time() - start_time) * 1000,
+            revision_count=result.get("revision_count", 0),
+        )
+
+        # Cache successful responses
+        if request.use_cache and response.confidence >= request.min_confidence:
+            cache_key = get_cache_key(request.query, request.doc_ids)
+            cache_response(cache_key, response)
+
+        return response
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"RAG query failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
+
+
+ @router.post("/query/stream")
+ async def query_documents_stream(request: QueryRequest):
+    """
+    Stream RAG response for real-time updates.
+
+    Returns Server-Sent Events (SSE) with partial responses.
+    """
+    async def generate():
+        try:
+            # Initialize RAG system
+            rag = _get_rag_system()
+            if not rag:
+                yield f"data: {json.dumps({'error': 'RAG system not available'})}\n\n"
+                return
+
+            # Send planning stage
+            yield f"data: {json.dumps({'stage': 'planning', 'message': 'Analyzing query...'})}\n\n"
+            await asyncio.sleep(0.1)
+
+            # Build filters
+            filters = {}
+            if request.doc_ids:
+                filters["document_id"] = {"$in": request.doc_ids}
+
+            # Send retrieval stage
+            yield f"data: {json.dumps({'stage': 'retrieving', 'message': 'Searching documents...'})}\n\n"
+
+            # Execute query (in chunks if streaming supported)
+            result = rag.query(
+                query=request.query,
+                filters=filters if filters else None,
+                top_k=request.top_k,
+            )
+
+            # Send sources
+            yield f"data: {json.dumps({'stage': 'sources', 'count': len(result.get('sources', []))})}\n\n"
+
+            # Send synthesis stage
+            yield f"data: {json.dumps({'stage': 'synthesizing', 'message': 'Generating answer...'})}\n\n"
+
+            # Stream answer in chunks
+            answer = result.get("answer", "")
+            chunk_size = 50
+            for i in range(0, len(answer), chunk_size):
+                chunk = answer[i:i+chunk_size]
+                yield f"data: {json.dumps({'stage': 'answer', 'chunk': chunk})}\n\n"
+                await asyncio.sleep(0.02)
+
+            # Send final result
+            citations = []
+            for i, source in enumerate(result.get("sources", [])):
+                citations.append({
+                    "citation_id": i + 1,
+                    "doc_id": source.get("document_id", "unknown"),
+                    "chunk_text": source.get("text", "")[:200],
+                    "relevance_score": source.get("score", 0.0),
+                })
+
+            final = {
+                "stage": "complete",
+                "confidence": result.get("confidence", 0.0),
+                "citations": citations,
+                "validation": result.get("validation"),
+            }
+            yield f"data: {json.dumps(final)}\n\n"
+
+        except Exception as e:
+            logger.error(f"Streaming query failed: {e}")
+            yield f"data: {json.dumps({'error': str(e)})}\n\n"
+
+    return StreamingResponse(
+        generate(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+        }
+    )
+
+
+ @router.post("/search", response_model=SearchResponse)
+ async def search_documents(request: SearchRequest):
+    """
+    Semantic search across indexed documents.
+
+    Returns matching chunks without answer synthesis.
+    """
+    start_time = time.time()
+
+    try:
+        from src.rag.store import get_vector_store
+        from src.rag.embeddings import get_embedding_model
+
+        store = get_vector_store()
+        embeddings = get_embedding_model()
+
+        # Generate query embedding
+        query_embedding = embeddings.embed_query(request.query)
+
+        # Build filter
+        where_filter = None
+        if request.doc_ids:
+            where_filter = {"document_id": {"$in": request.doc_ids}}
+
+        # Search
+        results = store.similarity_search_with_score(
+            query_embedding=query_embedding,
+            k=request.top_k,
+            where=where_filter,
+        )
+
+        # Filter by minimum score
+        search_results = []
+        for doc, score in results:
+            if score >= request.min_score:
+                search_results.append(SearchResult(
+                    chunk_id=doc.metadata.get("chunk_id", "unknown"),
+                    doc_id=doc.metadata.get("document_id", "unknown"),
+                    document_name=doc.metadata.get("filename", "unknown"),
+                    text=doc.page_content,
+                    score=score,
+                    page_num=doc.metadata.get("page_num"),
+                    chunk_type=doc.metadata.get("chunk_type", "text"),
+                ))
+
+        return SearchResponse(
+            query=request.query,
+            total_results=len(search_results),
+            results=search_results,
+            latency_ms=(time.time() - start_time) * 1000,
+        )
+
+    except Exception as e:
+        logger.error(f"Search failed: {e}")
+        # Fallback: return empty results
+        return SearchResponse(
+            query=request.query,
+            total_results=0,
+            results=[],
+            latency_ms=(time.time() - start_time) * 1000,
+        )
+
+
+ @router.get("/store/status", response_model=StoreStatus)
+ async def get_store_status():
+    """Get vector store status and statistics."""
+    try:
+        from src.rag.store import get_vector_store
+
+        store = get_vector_store()
+
+        # Get collection info
+        collection = store._collection
+        count = collection.count()
+
+        # Get unique documents
+        all_metadata = collection.get(include=["metadatas"])
+        doc_ids = set()
+        for meta in all_metadata.get("metadatas", []):
+            if meta and "document_id" in meta:
+                doc_ids.add(meta["document_id"])
+
+        collections = [CollectionInfo(
+            name=store.collection_name,
+            document_count=len(doc_ids),
+            chunk_count=count,
+            embedding_dimension=store.embedding_dimension if hasattr(store, 'embedding_dimension') else 1024,
+        )]
+
+        return StoreStatus(
+            status="healthy",
+            collections=collections,
+            total_documents=len(doc_ids),
+            total_chunks=count,
+        )
+
+    except Exception as e:
+        logger.error(f"Store status check failed: {e}")
+        return StoreStatus(
+            status="error",
+            collections=[],
+            total_documents=0,
+            total_chunks=0,
+        )
+
+
+ @router.delete("/store/collection/{collection_name}")
+ async def clear_collection(collection_name: str, confirm: bool = Query(False)):
+    """Clear a vector store collection (dangerous operation)."""
+    if not confirm:
+        raise HTTPException(
+            status_code=400,
+            detail="This operation will delete all data. Set confirm=true to proceed."
+        )
+
+    try:
+        from src.rag.store import get_vector_store
+
+        store = get_vector_store()
+        if store.collection_name != collection_name:
+            raise HTTPException(status_code=404, detail=f"Collection not found: {collection_name}")
+
+        # Clear collection
+        store._collection.delete(where={})
+
+        return {"status": "cleared", "collection": collection_name, "message": "Collection cleared successfully"}
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Collection clear failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Clear failed: {str(e)}")
+
+
+ @router.get("/cache/stats")
+ async def get_cache_stats():
+    """Get query cache statistics."""
+    current_time = time.time()
+    valid_entries = sum(
+        1 for v in _query_cache.values()
+        if current_time - v["timestamp"] < CACHE_TTL_SECONDS
+    )
+
+    return {
+        "total_entries": len(_query_cache),
+        "valid_entries": valid_entries,
+        "expired_entries": len(_query_cache) - valid_entries,
+        "ttl_seconds": CACHE_TTL_SECONDS,
+    }
+
+
+ @router.delete("/cache")
+ async def clear_cache():
+    """Clear the query cache."""
+    count = len(_query_cache)
+    _query_cache.clear()
+    return {"status": "cleared", "entries_removed": count}
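Since `/query/stream` emits Server-Sent Events rather than a single JSON body, a client has to read the response line by line. A sketch of one way to consume it; the `/rag` prefix is an assumption about how this router is mounted, while the `stage`/`chunk` keys match what `generate()` yields above.

```python
# SSE consumer sketch -- the /rag prefix is an assumed mount point.
import json
import httpx

payload = {"query": "What patents are covered by the pledge?", "top_k": 5}

with httpx.Client(timeout=None) as client:
    with client.stream("POST", "http://localhost:8000/rag/query/stream", json=payload) as resp:
        for line in resp.iter_lines():
            if not line.startswith("data: "):
                continue  # skip the blank separator lines between events
            event = json.loads(line[len("data: "):])
            if event.get("stage") == "answer":
                print(event["chunk"], end="", flush=True)
            elif event.get("stage") == "complete":
                print(f"\n\nconfidence={event['confidence']:.2f}, "
                      f"citations={len(event['citations'])}")
```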
api/schemas.py ADDED
@@ -0,0 +1,302 @@
+ """
+ SPARKNET API Schemas
+ Pydantic models for request/response validation.
+ """
+
+ from pydantic import BaseModel, Field, ConfigDict
+ from typing import List, Dict, Any, Optional
+ from datetime import datetime
+ from enum import Enum
+
+
+ # ==================== Enums ====================
+
+ class DocumentStatus(str, Enum):
+    PENDING = "pending"
+    PROCESSING = "processing"
+    COMPLETED = "completed"
+    INDEXED = "indexed"
+    ERROR = "error"
+
+
+ class QueryIntentType(str, Enum):
+    FACTOID = "factoid"
+    COMPARISON = "comparison"
+    AGGREGATION = "aggregation"
+    CAUSAL = "causal"
+    PROCEDURAL = "procedural"
+    DEFINITION = "definition"
+    LIST = "list"
+    MULTI_HOP = "multi_hop"
+
+
+ class AnswerFormat(str, Enum):
+    PROSE = "prose"
+    BULLET_POINTS = "bullet_points"
+    TABLE = "table"
+    STEP_BY_STEP = "step_by_step"
+
+
+ # ==================== Document Schemas ====================
+
+ class DocumentUploadResponse(BaseModel):
+    """Response after uploading a document."""
+    model_config = ConfigDict(from_attributes=True)
+
+    doc_id: str = Field(..., description="Unique document identifier")
+    filename: str = Field(..., description="Original filename")
+    status: DocumentStatus = Field(..., description="Document status")
+    message: str = Field(..., description="Status message")
+    created_at: datetime = Field(default_factory=datetime.now)
+
+
+ class DocumentMetadata(BaseModel):
+    """Document metadata information."""
+    model_config = ConfigDict(from_attributes=True)
+
+    doc_id: str
+    filename: str
+    file_type: str
+    page_count: int = 0
+    chunk_count: int = 0
+    text_length: int = 0
+    status: DocumentStatus
+    indexed: bool = False
+    indexed_chunks: int = 0
+    processing_time: Optional[float] = None
+    created_at: datetime
+    updated_at: Optional[datetime] = None
+
+
+ class DocumentResponse(BaseModel):
+    """Full document response with metadata."""
+    model_config = ConfigDict(from_attributes=True)
+
+    doc_id: str
+    filename: str
+    file_type: str
+    status: DocumentStatus
+    metadata: DocumentMetadata
+    raw_text: Optional[str] = Field(None, description="Full extracted text (if requested)")
+    preview: Optional[str] = Field(None, description="Text preview (first 500 chars)")
+
+
+ class ChunkInfo(BaseModel):
+    """Information about a document chunk."""
+    model_config = ConfigDict(from_attributes=True)
+
+    chunk_id: str
+    doc_id: str
+    text: str
+    chunk_type: str = "text"
+    page_num: Optional[int] = None
+    confidence: float = 1.0
+    bbox: Optional[Dict[str, float]] = None
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+
+ class ChunksResponse(BaseModel):
+    """Response containing document chunks."""
+    doc_id: str
+    total_chunks: int
+    chunks: List[ChunkInfo]
+
+
+ class OCRRegionInfo(BaseModel):
+    """OCR region information."""
+    region_id: str
+    text: str
+    confidence: float
+    page_num: int
+    bbox: Dict[str, float]
+
+
+ class LayoutRegionInfo(BaseModel):
+    """Layout region information."""
+    region_id: str
+    region_type: str
+    confidence: float
+    page_num: int
+    bbox: Dict[str, float]
+
+
+ class DocumentDetailResponse(BaseModel):
+    """Detailed document response with all extracted data."""
+    doc_id: str
+    filename: str
+    status: DocumentStatus
+    metadata: DocumentMetadata
+    chunks: List[ChunkInfo]
+    ocr_regions: List[OCRRegionInfo] = Field(default_factory=list)
+    layout_regions: List[LayoutRegionInfo] = Field(default_factory=list)
+
+
+ # ==================== RAG Query Schemas ====================
+
+ class QueryRequest(BaseModel):
+    """RAG query request."""
+    query: str = Field(..., min_length=1, max_length=2000, description="Query text")
+    doc_ids: Optional[List[str]] = Field(None, description="Filter by document IDs")
+    top_k: int = Field(5, ge=1, le=20, description="Number of chunks to retrieve")
+    answer_format: AnswerFormat = Field(AnswerFormat.PROSE, description="Desired answer format")
+    include_sources: bool = Field(True, description="Include source citations")
+    min_confidence: float = Field(0.5, ge=0.0, le=1.0, description="Minimum confidence threshold")
+    use_cache: bool = Field(True, description="Use cached results if available")
+
+
+ class Citation(BaseModel):
+    """Citation/source reference."""
+    citation_id: int = Field(..., description="Citation number [1], [2], etc.")
+    doc_id: str
+    document_name: str
+    chunk_id: str
+    chunk_text: str
+    page_num: Optional[int] = None
+    relevance_score: float
+    bbox: Optional[Dict[str, float]] = None
+
+
+ class QueryPlan(BaseModel):
+    """Query planning information."""
+    intent: QueryIntentType
+    sub_queries: List[str] = Field(default_factory=list)
+    keywords: List[str] = Field(default_factory=list)
+    strategy: str = "hybrid"
+
+
+ class RAGResponse(BaseModel):
+    """Complete RAG response."""
+    query: str
+    answer: str
+    confidence: float = Field(..., ge=0.0, le=1.0)
+    citations: List[Citation] = Field(default_factory=list)
+    source_count: int = 0
+    query_plan: Optional[QueryPlan] = None
+    from_cache: bool = False
+    validation: Optional[Dict[str, Any]] = None
+    latency_ms: Optional[float] = None
+    revision_count: int = 0
+
+
+ class SearchRequest(BaseModel):
+    """Semantic search request."""
+    query: str = Field(..., min_length=1, max_length=1000)
+    doc_ids: Optional[List[str]] = None
+    top_k: int = Field(10, ge=1, le=50)
+    min_score: float = Field(0.0, ge=0.0, le=1.0)
+
+
+ class SearchResult(BaseModel):
+    """Single search result."""
+    chunk_id: str
+    doc_id: str
+    document_name: str
+    text: str
+    score: float
+    page_num: Optional[int] = None
+    chunk_type: str = "text"
+
+
+ class SearchResponse(BaseModel):
+    """Search response with results."""
+    query: str
+    total_results: int
+    results: List[SearchResult]
+    latency_ms: float
+
+
+ # ==================== Indexing Schemas ====================
+
+ class IndexRequest(BaseModel):
+    """Request to index a document."""
+    doc_id: str = Field(..., description="Document ID to index")
+    force_reindex: bool = Field(False, description="Force reindexing if already indexed")
+
+
+ class IndexResponse(BaseModel):
+    """Indexing response."""
+    doc_id: str
+    status: str
+    chunks_indexed: int
+    message: str
+
+
+ class BatchIndexRequest(BaseModel):
+    """Batch indexing request."""
+    doc_ids: List[str]
+    force_reindex: bool = False
+
+
+ class BatchIndexResponse(BaseModel):
+    """Batch indexing response."""
+    total_requested: int
+    successful: int
+    failed: int
+    results: List[IndexResponse]
+
+
+ # ==================== System Schemas ====================
+
+ class HealthResponse(BaseModel):
+    """Health check response."""
+    status: str = Field(..., description="healthy, degraded, or unhealthy")
+    version: str
+    components: Dict[str, bool]
+
+
+ class SystemStatus(BaseModel):
+    """Detailed system status."""
+    status: str
+    version: str
+    uptime_seconds: float
+    components: Dict[str, bool]
+    statistics: Dict[str, Any]
+    models: Dict[str, str]
+
+
+ class CollectionInfo(BaseModel):
+    """Vector store collection information."""
+    name: str
+    document_count: int
+    chunk_count: int
+    embedding_dimension: int
+
+
+ class StoreStatus(BaseModel):
+    """Vector store status."""
+    status: str
+    collections: List[CollectionInfo]
+    total_documents: int
+    total_chunks: int
+
+
+ # ==================== Authentication Schemas ====================
+
+ class UserCreate(BaseModel):
+    """User creation request."""
+    username: str = Field(..., min_length=3, max_length=50)
+    email: str
+    password: str = Field(..., min_length=8)
+
+
+ class UserResponse(BaseModel):
+    """User response (no password)."""
+    user_id: str
+    username: str
+    email: str
+    is_active: bool = True
+    created_at: datetime
+
+
+ class Token(BaseModel):
+    """JWT token response."""
+    access_token: str
+    token_type: str = "bearer"
+    expires_in: int
+
+
+ class TokenData(BaseModel):
+    """Token payload data."""
+    username: Optional[str] = None
+    user_id: Optional[str] = None
+    scopes: List[str] = Field(default_factory=list)
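One practical payoff of these models is that malformed requests are rejected before they ever reach a route handler. A small illustration with `QueryRequest`, using values chosen to trip the `min_length` and `le` constraints declared above:

```python
# Validation sketch for the schemas above.
from pydantic import ValidationError

req = QueryRequest(query="Summarize the patent pledge.")
print(req.top_k, req.answer_format)  # 5 AnswerFormat.PROSE (the declared defaults)

try:
    QueryRequest(query="", top_k=50)  # empty query; top_k above the le=20 bound
except ValidationError as e:
    print(len(e.errors()))  # 2
```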
config/document.yaml ADDED
@@ -0,0 +1,147 @@
+ # SPARKNET Document Processing Configuration
+ # ===========================================
+
+ # OCR Configuration
+ ocr:
+   # Engine selection: "paddleocr" (default) or "tesseract"
+   engine: paddleocr
+
+   # PaddleOCR settings
+   paddleocr:
+     lang: en
+     use_gpu: false
+     det_db_thresh: 0.3
+     det_db_box_thresh: 0.5
+     rec_algorithm: CRNN
+     show_log: false
+
+   # Tesseract settings
+   tesseract:
+     lang: eng
+     config: "--psm 3"  # Page segmentation mode
+     oem: 3  # OCR Engine mode (LSTM)
+
+   # Preprocessing
+   preprocessing:
+     deskew: true
+     denoise: false
+     contrast_enhance: false
+
+ # Layout Detection Configuration
+ layout:
+   # Detection method: "rule_based" (default) or "model_based"
+   method: rule_based
+
+   # Rule-based settings
+   rule_based:
+     merge_threshold: 20  # Pixels to merge nearby regions
+     column_detection: true
+     min_region_area: 100
+
+   # Confidence thresholds
+   thresholds:
+     text: 0.5
+     title: 0.7
+     table: 0.6
+     figure: 0.6
+     list: 0.5
+
+ # Reading Order Configuration
+ reading_order:
+   # Reconstruction method: "rule_based" (default)
+   method: rule_based
+
+   # Column detection
+   column_gap_threshold: 50  # Minimum gap between columns
+   reading_direction: ltr  # Left-to-right
+
+   # Line grouping
+   line_height_tolerance: 0.5
+
+ # Chunking Configuration
+ chunking:
+   # Chunk size limits
+   target_size: 512  # Target tokens per chunk
+   max_size: 1024  # Maximum tokens per chunk
+   min_size: 50  # Minimum tokens per chunk
+
+   # Overlap for context
+   overlap_size: 50  # Tokens to overlap between chunks
+
+   # Semantic chunking
+   semantic_boundaries: true
+   respect_paragraphs: true
+   respect_sections: true
+
+ # Grounding/Evidence Configuration
+ grounding:
+   # Image cropping for evidence
+   include_images: true
+   crop_padding: 10  # Pixels around regions
+   max_image_size: 512
+   image_format: PNG  # PNG or JPEG
+   image_quality: 85  # JPEG quality
+
+   # Snippet settings
+   max_snippet_length: 200
+   include_context: true
+
+ # Pipeline Configuration
+ pipeline:
+   # PDF rendering
+   render_dpi: 300
+
+   # Caching
+   enable_caching: true
+   cache_directory: ./data/cache
+
+   # Processing options
+   parallel_pages: false
+   max_pages: null  # Limit pages (null for all)
+
+   # Output options
+   include_ocr_regions: true
+   include_layout_regions: true
+   generate_full_text: true
+
+ # Validation Configuration
+ validation:
+   # Critic settings
+   critic:
+     confidence_threshold: 0.7
+     evidence_required: true
+     strict_mode: false
+     max_fields_per_request: 10
+
+   # Verifier settings
+   verifier:
+     fuzzy_match: true
+     case_sensitive: false
+     min_match_ratio: 0.6
+     strong_threshold: 0.9
+     moderate_threshold: 0.7
+     weak_threshold: 0.5
+
+ # LLM Configuration for DocumentAgent
+ agent:
+   # Ollama settings
+   ollama_base_url: http://localhost:11434
+   default_model: llama3.2:3b
+
+   # Model routing by complexity
+   model_routing:
+     simple: llama3.2:1b
+     standard: llama3.2:3b
+     complex: llama3.1:8b
+     analysis: llama3.1:70b  # For heavy analysis (optional)
+
+   # Agent behavior
+   max_iterations: 10
+   temperature: 0.1
+   timeout: 120  # Seconds
+
+ # Logging Configuration
+ logging:
+   level: INFO  # DEBUG, INFO, WARNING, ERROR
+   format: "{time} | {level} | {message}"
+   file: null  # Log file path (null for stderr only)
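A minimal sketch of reading this file with PyYAML and picking out the active OCR engine block; the path and loader here are illustrative, since the framework's own config loader is not part of this view.

```python
# Config-loading sketch (illustrative; not SPARKNET's actual loader).
import yaml

with open("config/document.yaml") as f:
    cfg = yaml.safe_load(f)

engine = cfg["ocr"]["engine"]             # e.g. "paddleocr"
engine_opts = cfg["ocr"].get(engine, {})  # the matching per-engine block
print(engine, engine_opts.get("lang"))    # paddleocr en
```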
config/rag.yaml ADDED
@@ -0,0 +1,141 @@
+ # SPARKNET RAG Configuration
+ # ===========================
+
+ # Vector Store Configuration
+ vector_store:
+   # Store type: "chromadb" (default)
+   type: chromadb
+
+   # ChromaDB settings
+   chromadb:
+     persist_directory: ./data/vectorstore
+     collection_name: sparknet_documents
+     anonymized_telemetry: false
+
+   # Search settings
+   default_top_k: 5
+   similarity_threshold: 0.7
+
+ # Embedding Configuration
+ embeddings:
+   # Adapter type: "ollama" (default) or "openai"
+   adapter_type: ollama
+
+   # Ollama settings (local, default)
+   ollama:
+     base_url: http://localhost:11434
+     model: nomic-embed-text  # Options: nomic-embed-text, mxbai-embed-large, all-minilm
+
+   # OpenAI settings (optional, feature-flagged)
+   openai:
+     enabled: false
+     model: text-embedding-3-small  # Options: text-embedding-3-small, text-embedding-3-large
+     # api_key: ${OPENAI_API_KEY}  # Use env var
+
+   # Common settings
+   batch_size: 32
+   timeout: 60
+
+   # Caching
+   enable_cache: true
+   cache_directory: ./data/embedding_cache
+
+ # Indexer Configuration
+ indexer:
+   # Batch processing
+   batch_size: 32
+
+   # Metadata to index
+   include_bbox: true
+   include_page: true
+   include_chunk_type: true
+
+   # Filtering
+   skip_empty_chunks: true
+   min_chunk_length: 10
+
+ # Retriever Configuration
+ retriever:
+   # Search parameters
+   default_top_k: 5
+   similarity_threshold: 0.7
+   max_results: 20
+
+   # Reranking (future)
+   enable_reranking: false
+   rerank_top_k: 10
+
+   # Evidence settings
+   include_evidence: true
+   evidence_snippet_length: 200
+
+ # Generator Configuration
+ generator:
+   # LLM provider: "ollama" (default) or "openai"
+   llm_provider: ollama
+
+   # Ollama settings
+   ollama:
+     base_url: http://localhost:11434
+     model: llama3.2:3b  # Options: llama3.2:3b, llama3.1:8b, mistral
+
+   # OpenAI settings (optional)
+   openai:
+     model: gpt-4o-mini  # Options: gpt-4o-mini, gpt-4o
+     # api_key: ${OPENAI_API_KEY}  # Use env var
+
+   # Generation settings
+   temperature: 0.1
+   max_tokens: 1024
+   timeout: 120
+
+   # Citation settings
+   require_citations: true
+   citation_format: "[{index}]"
+
+   # Abstention settings
+   abstain_on_low_confidence: true
+   confidence_threshold: 0.6
+
+ # Query Processing
+ query:
+   # Query expansion
+   expand_queries: false
+   max_expansions: 3
+
+   # Hybrid search (future)
+   enable_hybrid: false
+   keyword_weight: 0.3
+   semantic_weight: 0.7
+
+ # Metadata Filtering
+ filters:
+   # Supported filter types
+   supported:
+     - document_id
+     - chunk_type
+     - page
+     - confidence_min
+
+   # Default filters (applied to all queries)
+   defaults: {}
+
+ # Performance Settings
+ performance:
+   # Connection pooling
+   max_connections: 10
+
+   # Timeouts
+   embedding_timeout: 60
+   search_timeout: 30
+   generation_timeout: 120
+
+   # Caching
+   query_cache_enabled: true
+   query_cache_ttl: 3600  # Seconds
+
+ # Logging
+ logging:
+   level: INFO
+   include_queries: false  # Log user queries (privacy consideration)
+   include_latency: true
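The hybrid-search block above is marked "future", but the weights already pin down the intended combination: a weighted sum of a keyword (sparse) score and a semantic (dense) score. A sketch under the assumption that both scores are pre-normalized to [0, 1]:

```python
# Hybrid scoring sketch; assumes both inputs are normalized to [0, 1].
def hybrid_score(
    keyword_score: float,
    semantic_score: float,
    keyword_weight: float = 0.3,   # query.keyword_weight above
    semantic_weight: float = 0.7,  # query.semantic_weight above
) -> float:
    """Blend sparse and dense relevance into one ranking score."""
    return keyword_weight * keyword_score + semantic_weight * semantic_score

print(hybrid_score(0.9, 0.4))  # ~0.55
```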
configs/rag.yaml ADDED
@@ -0,0 +1,201 @@
+ # RAG (Retrieval-Augmented Generation) Configuration
+ # SPARKNET Document Intelligence Integration
+
+ # =============================================================================
+ # Vector Store Settings
+ # =============================================================================
+ vector_store:
+   # Store type: "chroma" (default) or "memory" (for testing)
+   type: chroma
+
+   # ChromaDB settings
+   chroma:
+     # Persistence directory for vector store
+     persist_directory: "./.sparknet/chroma_db"
+
+     # Collection name for document chunks
+     collection_name: "sparknet_documents"
+
+     # Distance metric: "cosine" (default), "l2", or "ip"
+     distance_metric: cosine
+
+     # Anonymized telemetry (set to false to disable)
+     anonymized_telemetry: false
+
+ # =============================================================================
+ # Embedding Settings
+ # =============================================================================
+ embeddings:
+   # Provider: "ollama" (default, local) or "openai" (cloud, requires API key)
+   provider: ollama
+
+   # Ollama settings (local, privacy-preserving)
+   ollama:
+     # Model name for embeddings
+     # Recommended: nomic-embed-text (768 dims) or mxbai-embed-large (1024 dims)
+     model: nomic-embed-text
+
+     # Ollama server URL
+     base_url: "http://localhost:11434"
+
+     # Request timeout in seconds
+     timeout: 30
+
+   # OpenAI settings (cloud, disabled by default)
+   openai:
+     # IMPORTANT: OpenAI is disabled by default for privacy
+     # Set to true only if you explicitly need cloud embeddings
+     enabled: false
+
+     # Model name (if enabled)
+     model: text-embedding-3-small
+
+     # API key (from environment variable OPENAI_API_KEY)
+     # Never store API keys in config files
+     api_key_env: OPENAI_API_KEY
+
+   # Caching settings
+   cache:
+     # Enable embedding cache for faster re-processing
+     enabled: true
+
+     # Maximum cache entries
+     max_entries: 10000
+
+ # =============================================================================
+ # Indexer Settings
+ # =============================================================================
+ indexer:
+   # Batch size for embedding generation
+   batch_size: 32
+
+   # Include bounding box metadata
+   include_bbox: true
+
+   # Include page numbers
+   include_page: true
+
+   # Include chunk type labels
+   include_chunk_type: true
+
+   # Skip empty chunks
+   skip_empty_chunks: true
+
+   # Minimum chunk text length (characters)
+   min_chunk_length: 10
+
+ # =============================================================================
+ # Retriever Settings
+ # =============================================================================
+ retriever:
+   # Default number of results to return
+   default_top_k: 5
+
+   # Maximum results to return
+   max_results: 20
+
+   # Minimum similarity score (0.0 - 1.0)
+   # Chunks below this threshold are filtered out
+   similarity_threshold: 0.5
+
+   # Enable result reranking (experimental)
+   enable_reranking: false
+
+   # Number of results to rerank
+   rerank_top_k: 10
+
+   # Include evidence references in results
+   include_evidence: true
+
+   # Maximum snippet length in evidence
+   evidence_snippet_length: 200
+
+ # =============================================================================
+ # Generator Settings (Answer Generation)
+ # =============================================================================
+ generator:
+   # LLM provider for answer generation: "ollama" (default) or "openai"
+   provider: ollama
+
+   # Ollama settings (local)
+   ollama:
+     # Model for answer generation
+     # Recommended: llama3.2, mistral, or phi3
+     model: llama3.2
+
+     # Ollama server URL
+     base_url: "http://localhost:11434"
+
+     # Request timeout in seconds
+     timeout: 60
+
+     # Generation parameters
+     temperature: 0.1
+     max_tokens: 1024
+
+   # OpenAI settings (cloud, disabled by default)
+   openai:
+     enabled: false
+     model: gpt-4o-mini
+     api_key_env: OPENAI_API_KEY
+     temperature: 0.1
+     max_tokens: 1024
+
+   # Confidence settings
+   min_confidence: 0.5
+
+   # Abstention policy
+   # When true, the system will refuse to answer if confidence is too low
+   abstain_on_low_confidence: true
+   abstain_threshold: 0.3
+
+   # Maximum context length for LLM
+   max_context_length: 8000
+
+   # Require citations in answers
+   require_citations: true
+
+ # =============================================================================
+ # Document Intelligence Integration
+ # =============================================================================
+ document_intelligence:
+   # Parser settings
+   parser:
+     render_dpi: 200
+     max_pages: null  # null = no limit
+
+   # Extraction settings
+   extraction:
+     min_field_confidence: 0.5
+     abstain_on_low_confidence: true
+
+   # Grounding settings
+   grounding:
+     enable_crops: true
+     crop_output_dir: "./.sparknet/crops"
+
+ # =============================================================================
+ # Performance Settings
+ # =============================================================================
+ performance:
+   # Number of parallel workers for batch processing
+   num_workers: 4
+
+   # Chunk processing batch size
+   chunk_batch_size: 100
+
+   # Enable async processing where supported
+   async_enabled: true
+
+ # =============================================================================
+ # Logging Settings
+ # =============================================================================
+ logging:
+   # Log level: DEBUG, INFO, WARNING, ERROR
+   level: INFO
+
+   # Log RAG queries and results
+   log_queries: false
+
+   # Log embedding operations
+   log_embeddings: false
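Note that this file stores the *name* of the environment variable (`api_key_env`) rather than the key itself. A resolver sketch showing how that indirection might be honored while keeping the cloud path off by default; the function name is hypothetical, not part of the framework.

```python
# Hypothetical resolver for the api_key_env convention above.
import os
from typing import Optional

def resolve_api_key(openai_cfg: dict) -> Optional[str]:
    """Return the API key only when the OpenAI provider is explicitly enabled."""
    if not openai_cfg.get("enabled", False):
        return None  # privacy default: never read the key while disabled
    return os.environ.get(openai_cfg.get("api_key_env", "OPENAI_API_KEY"))

print(resolve_api_key({"enabled": False}))  # None
```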
demo/README.md ADDED
@@ -0,0 +1,185 @@
+ # SPARKNET Demo Application
+
+ An interactive Streamlit demo showcasing SPARKNET's document intelligence capabilities.
+
+ ## Features
+
+ - **📄 Document Processing**: Upload and process documents with OCR
+ - **🔍 Field Extraction**: Extract structured data with evidence grounding
+ - **💬 RAG Q&A**: Interactive question answering with citations
+ - **🏷️ Classification**: Automatic document type detection
+ - **📊 Analytics**: Processing statistics and insights
+ - **🔬 Live Processing**: Real-time pipeline visualization
+ - **📊 Document Comparison**: Compare multiple documents
+
+ ## Quick Start
+
+ ### 1. Install Dependencies
+
+ ```bash
+ # From project root
+ pip install -r demo/requirements.txt
+
+ # Or install all SPARKNET dependencies
+ pip install -r requirements.txt
+ ```
+
+ ### 2. Start Ollama (Optional, for live processing)
+
+ ```bash
+ ollama serve
+
+ # Pull required models
+ ollama pull llama3.2:3b
+ ollama pull nomic-embed-text
+ ```
+
+ ### 3. Run the Demo
+
+ ```bash
+ # From project root
+ streamlit run demo/app.py
+
+ # Or with a custom port
+ streamlit run demo/app.py --server.port 8501
+ ```
+
+ ### 4. Open in Browser
+
+ Navigate to http://localhost:8501
+
+ ## Demo Pages
+
+ | Page | Description |
+ |------|-------------|
+ | **Home** | Overview and feature cards |
+ | **Document Processing** | Upload/select documents for OCR processing |
+ | **Field Extraction** | Extract structured fields with evidence |
+ | **RAG Q&A** | Ask questions about indexed documents |
+ | **Classification** | Classify document types |
+ | **Analytics** | View processing statistics |
+ | **Live Processing** | Watch the pipeline in real time |
+ | **Interactive RAG** | Chat-style document Q&A |
+ | **Document Comparison** | Compare documents side by side |
+
+ ## Sample Documents
+
+ The demo uses patent pledge documents from the `Dataset/` folder:
+
+ - Apple 11.11.2011.pdf
+ - IBM 11.01.2005.pdf
+ - Google 08.02.2012.pdf
+ - And more...
+
+ ## Screenshots
+
+ ### Home Page
+ ```
+ ┌─────────────────────────────────────────┐
+ │ 🔥 SPARKNET │
+ │ Agentic Document Intelligence Platform │
+ ├─────────────────────────────────────────┤
+ │ [Doc Processing] [Extraction] [RAG] │
+ │ │
+ │ Feature cards with gradients... │
+ └─────────────────────────────────────────┘
+ ```
+
+ ### RAG Q&A
+ ```
+ ┌─────────────────────────────────────────┐
+ │ 💬 Ask a question... │
+ ├─────────────────────────────────────────┤
+ │ User: What patents are covered? │
+ │ │
+ │ Assistant: Based on the documents... │
+ │ [📚 View Sources] │
+ │ [1] Apple - Page 1: "..." │
+ │ [2] IBM - Page 2: "..." │
+ └─────────────────────────────────────────┘
+ ```
+
+ ## Configuration
+
+ ### Environment Variables
+
+ ```bash
+ # Ollama URL (default: http://localhost:11434)
+ export OLLAMA_BASE_URL=http://localhost:11434
+
+ # ChromaDB path (default: ./data/vectorstore)
+ export CHROMA_PERSIST_DIR=./data/vectorstore
+ ```
+
+ ### Streamlit Config
+
+ Create `.streamlit/config.toml`:
+
+ ```toml
+ [theme]
+ primaryColor = "#FF6B6B"
+ backgroundColor = "#FFFFFF"
+ secondaryBackgroundColor = "#F0F2F6"
+ textColor = "#262730"
+
+ [server]
+ maxUploadSize = 50
+ ```
+
+ ## Development
+
+ ### Adding New Pages
+
+ 1. Create a new file in `demo/pages/`:
+    ```
+    demo/pages/4_🆕_New_Feature.py
+    ```
+
+ 2. Follow the naming convention: `{order}_{emoji}_{name}.py`
+
+ 3. Import project modules:
+    ```python
+    import sys
+    from pathlib import Path
+    PROJECT_ROOT = Path(__file__).parent.parent.parent
+    sys.path.insert(0, str(PROJECT_ROOT))
+    ```
+
+ ### Customizing Styles
+
+ Edit the CSS in `app.py`:
+
+ ```python
+ st.markdown("""
+ <style>
+ .main-header { ... }
+ .evidence-box { ... }
+ </style>
+ """, unsafe_allow_html=True)
+ ```
+
+ ## Troubleshooting
+
+ ### "ModuleNotFoundError: No module named 'src'"
+
+ Make sure you're running from the project root:
+ ```bash
+ cd /path/to/SPARKNET
+ streamlit run demo/app.py
+ ```
+
+ ### Ollama Not Connected
+
+ 1. Check if Ollama is running: `curl http://localhost:11434/api/tags`
+ 2. Start Ollama: `ollama serve`
+
+ ### ChromaDB Errors
+
+ Install ChromaDB:
+ ```bash
+ pip install chromadb
+ ```
+
+ ## License
+
+ Part of the SPARKNET project. See main LICENSE file.
demo/app.py ADDED
@@ -0,0 +1,944 @@
+ """
+ SPARKNET Demo Application
+
+ A Streamlit-based demo showcasing:
+ - Document Processing Pipeline
+ - Field Extraction with Evidence
+ - RAG Search and Q&A
+ - Document Classification
+ - Evidence Visualization
+ """
+
+ import streamlit as st
+ import sys
+ import os
+ from pathlib import Path
+ import json
+ import time
+ from datetime import datetime
+
+ # Add project root to path
+ PROJECT_ROOT = Path(__file__).parent.parent
+ sys.path.insert(0, str(PROJECT_ROOT))
+
+ # Page configuration
+ st.set_page_config(
+    page_title="SPARKNET Document Intelligence",
+    page_icon="🔥",
+    layout="wide",
+    initial_sidebar_state="expanded",
+ )
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+    .main-header {
+        font-size: 2.5rem;
+        font-weight: bold;
+        background: linear-gradient(90deg, #FF6B6B, #4ECDC4);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        margin-bottom: 0.5rem;
+    }
+    .sub-header {
+        color: #666;
+        font-size: 1.1rem;
+        margin-bottom: 2rem;
+    }
+    .metric-card {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 10px;
+        padding: 1rem;
+        color: white;
+    }
+    .evidence-box {
+        background-color: #f0f7ff;
+        border-left: 4px solid #4ECDC4;
+        padding: 1rem;
+        margin: 0.5rem 0;
+        border-radius: 0 8px 8px 0;
+    }
+    .chunk-card {
+        background-color: #fafafa;
+        border: 1px solid #e0e0e0;
+        border-radius: 8px;
+        padding: 1rem;
+        margin: 0.5rem 0;
+    }
+    .confidence-high { color: #22c55e; font-weight: bold; }
+    .confidence-medium { color: #eab308; font-weight: bold; }
+    .confidence-low { color: #ef4444; font-weight: bold; }
+    .stTabs [data-baseweb="tab-list"] {
+        gap: 8px;
+    }
+    .stTabs [data-baseweb="tab"] {
+        padding: 10px 20px;
+        background-color: #f0f2f6;
+        border-radius: 8px;
+    }
+ </style>
+ """, unsafe_allow_html=True)
+
+
+ def get_sample_documents():
+    """Get list of sample documents from Dataset folder."""
+    dataset_path = PROJECT_ROOT / "Dataset"
+    if dataset_path.exists():
+        return sorted([f.name for f in dataset_path.glob("*.pdf")])
+    return []
+
+
+ def format_confidence(confidence: float) -> str:
+    """Format confidence with color coding."""
+    if confidence >= 0.8:
+        return f'<span class="confidence-high">{confidence:.1%}</span>'
+    elif confidence >= 0.6:
+        return f'<span class="confidence-medium">{confidence:.1%}</span>'
+    else:
+        return f'<span class="confidence-low">{confidence:.1%}</span>'
+
+
+ def render_header():
+    """Render the main header."""
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        st.markdown('<div class="main-header">🔥 SPARKNET</div>', unsafe_allow_html=True)
+        st.markdown('<div class="sub-header">Agentic Document Intelligence Platform</div>', unsafe_allow_html=True)
+    with col2:
+        st.image("https://img.shields.io/badge/version-0.1.0-blue", width=100)
+
+
+ def render_sidebar():
+    """Render the sidebar with navigation."""
+    with st.sidebar:
+        st.markdown("## Navigation")
+
+        page = st.radio(
+            "Select Feature",
+            [
+                "🏠 Home",
+                "📄 Document Processing",
+                "🔍 Field Extraction",
+                "💬 RAG Q&A",
+                "🏷️ Classification",
+                "📊 Analytics",
+            ],
+            label_visibility="collapsed",
+        )
+
+        st.markdown("---")
+        st.markdown("### System Status")
+
+        # Check component status
+        ollama_status = check_ollama_status()
+        st.markdown(f"**Ollama:** {'🟢 Online' if ollama_status else '🔴 Offline'}")
+
+        chromadb_status = check_chromadb_status()
+        st.markdown(f"**ChromaDB:** {'🟢 Ready' if chromadb_status else '🔴 Not initialized'}")
+
+        st.markdown("---")
+        st.markdown("### Sample Documents")
+        docs = get_sample_documents()
+        st.markdown(f"**Available:** {len(docs)} PDFs")
+
+    return page
+
+
+ def check_ollama_status():
+    """Check if Ollama is running."""
+    try:
+        import httpx
+        with httpx.Client(timeout=2.0) as client:
+            resp = client.get("http://localhost:11434/api/tags")
+            return resp.status_code == 200
+    except Exception:  # narrowed from a bare except so Ctrl-C still works
+        return False
+
+
+ def check_chromadb_status():
+    """Check if ChromaDB is available."""
+    try:
+        import chromadb  # noqa: F401 -- imported only to probe availability
+        return True
+    except ImportError:  # narrowed from a bare except
+        return False
+
+
167
+ def render_home_page():
+     """Render the home page."""
+     st.markdown("## Welcome to SPARKNET")
+
+     st.markdown("""
+     SPARKNET is an enterprise-grade **Agentic Document Intelligence Platform** that combines:
+
+     - **📄 Document Processing**: OCR with PaddleOCR/Tesseract, layout detection, semantic chunking
+     - **🔍 RAG Subsystem**: Vector search with ChromaDB, grounded retrieval with citations
+     - **🤖 Multi-Agent System**: ReAct-style agents with tool use and validation
+     - **🏠 Local-First**: Privacy-preserving inference via Ollama
+     - **📎 Evidence Grounding**: Every extraction includes bbox, page, chunk_id references
+     """)
+
+     st.markdown("---")
+
+     # Feature cards
+     col1, col2, col3, col4 = st.columns(4)
+
+     with col1:
+         st.markdown("""
+         <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+                     border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
+             <h3>📄</h3>
+             <h4>Document Processing</h4>
+             <p style="font-size: 0.9rem;">OCR, Layout Detection, Chunking</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+     with col2:
+         st.markdown("""
+         <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+                     border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
+             <h3>🔍</h3>
+             <h4>Field Extraction</h4>
+             <p style="font-size: 0.9rem;">Structured Data with Evidence</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+     with col3:
+         st.markdown("""
+         <div style="background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
+                     border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
+             <h3>💬</h3>
+             <h4>RAG Q&A</h4>
+             <p style="font-size: 0.9rem;">Grounded Answers with Citations</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+     with col4:
+         st.markdown("""
+         <div style="background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
+                     border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
+             <h3>🏷️</h3>
+             <h4>Classification</h4>
+             <p style="font-size: 0.9rem;">Document Type Detection</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+     st.markdown("---")
+
+     # Quick start
+     st.markdown("### Quick Start")
+
+     with st.expander("📚 How to Use This Demo", expanded=True):
+         st.markdown("""
+         1. **Document Processing**: Upload or select a PDF to process with OCR
+         2. **Field Extraction**: Define fields to extract with evidence grounding
+         3. **RAG Q&A**: Ask questions about indexed documents
+         4. **Classification**: Automatically classify document types
+
+         **Sample Documents**: The demo includes real patent documents from major tech companies.
+         """)
+
+     # Sample documents preview
+     st.markdown("### Available Sample Documents")
+     docs = get_sample_documents()
+
+     if docs:
+         cols = st.columns(4)
+         for i, doc in enumerate(docs[:8]):
+             with cols[i % 4]:
+                 company = doc.split()[0] if doc else "Unknown"
+                 st.markdown(f"""
+                 <div style="background: #f8f9fa; border-radius: 8px; padding: 0.8rem;
+                             margin: 0.3rem 0; border: 1px solid #e0e0e0;">
+                     <strong>📄 {company}</strong>
+                     <br><small style="color: #666;">{doc[:30]}...</small>
+                 </div>
+                 """, unsafe_allow_html=True)
+
+
+ def render_document_processing_page():
+     """Render the document processing page."""
+     st.markdown("## 📄 Document Processing Pipeline")
+
+     st.markdown("""
+     Process documents through our intelligent pipeline:
+     **OCR → Layout Detection → Reading Order → Semantic Chunking → Grounding**
+     """)
+
+     # Document selection
+     col1, col2 = st.columns([2, 1])
+
+     with col1:
+         upload_option = st.radio(
+             "Document Source",
+             ["Select from samples", "Upload new document"],
+             horizontal=True,
+         )
+
+         if upload_option == "Select from samples":
+             docs = get_sample_documents()
+             if docs:
+                 selected_doc = st.selectbox("Select a document", docs)
+                 doc_path = PROJECT_ROOT / "Dataset" / selected_doc
+             else:
+                 st.warning("No sample documents found")
+                 doc_path = None
+         else:
+             uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
+             if uploaded_file:
+                 # Save temporarily
+                 temp_path = PROJECT_ROOT / "data" / "temp" / uploaded_file.name
+                 temp_path.parent.mkdir(parents=True, exist_ok=True)
+                 with open(temp_path, "wb") as f:
+                     f.write(uploaded_file.read())
+                 doc_path = temp_path
+             else:
+                 doc_path = None
+
+     with col2:
+         st.markdown("### Processing Options")
+         ocr_engine = st.selectbox("OCR Engine", ["paddleocr", "tesseract"])
+         max_pages = st.slider("Max Pages", 1, 20, 5)
+         render_dpi = st.selectbox("Render DPI", [150, 200, 300], index=2)
+
+     st.markdown("---")
+
+     if doc_path and st.button("🚀 Process Document", type="primary"):
+         process_document_demo(doc_path, ocr_engine, max_pages, render_dpi)
+
+
+ def process_document_demo(doc_path, ocr_engine, max_pages, render_dpi):
+     """Demo document processing."""
+
+     progress_bar = st.progress(0)
+     status_text = st.empty()
+
+     # Simulate processing stages
+     stages = [
+         ("Loading document...", 0.1),
+         ("Running OCR extraction...", 0.3),
+         ("Detecting layout regions...", 0.5),
+         ("Reconstructing reading order...", 0.7),
+         ("Creating semantic chunks...", 0.9),
+         ("Finalizing...", 1.0),
+     ]
+
+     for stage_text, progress in stages:
+         status_text.text(stage_text)
+         progress_bar.progress(progress)
+         time.sleep(0.5)
+
+     status_text.text("✅ Processing complete!")
+
+     # Try actual processing
+     try:
+         from src.document.pipeline import process_document, PipelineConfig
+         from src.document.ocr import OCRConfig
+
+         config = PipelineConfig(
+             ocr=OCRConfig(engine=ocr_engine),
+             render_dpi=render_dpi,
+             max_pages=max_pages,
+         )
+
+         with st.spinner("Running actual document processing..."):
+             result = process_document(str(doc_path), config=config)
+
+         # Display results
+         render_processing_results(result)
+
+     except Exception as e:
+         st.warning(f"Live processing unavailable: {e}")
+         st.info("Showing demo results instead...")
+         render_demo_processing_results(str(doc_path))
+
+
+ def render_processing_results(result):
+     """Render actual processing results."""
+
+     # Metrics
+     col1, col2, col3, col4 = st.columns(4)
+
+     with col1:
+         st.metric("Pages", result.metadata.num_pages)
+     with col2:
+         st.metric("Chunks", result.metadata.total_chunks)
+     with col3:
+         st.metric("Characters", f"{result.metadata.total_characters:,}")
+     with col4:
+         conf = result.metadata.ocr_confidence_avg or 0
+         st.metric("OCR Confidence", f"{conf:.1%}")
+
+     st.markdown("---")
+
+     # Tabs for different views
+     tab1, tab2, tab3 = st.tabs(["📝 Extracted Text", "📦 Chunks", "🗺️ Layout"])
+
+     with tab1:
+         st.markdown("### Full Extracted Text")
+         st.text_area(
+             "Document Text",
+             result.full_text[:5000] + "..." if len(result.full_text) > 5000 else result.full_text,
+             height=400,
+         )
+
+     with tab2:
+         st.markdown("### Document Chunks")
+         for i, chunk in enumerate(result.chunks[:10]):
+             with st.expander(f"Chunk {i+1}: {chunk.chunk_type.value} (Page {chunk.page + 1})"):
+                 st.markdown(f"**ID:** `{chunk.chunk_id}`")
+                 st.markdown(f"**Confidence:** {format_confidence(chunk.confidence)}", unsafe_allow_html=True)
+                 st.markdown(f"**BBox:** ({chunk.bbox.x_min:.0f}, {chunk.bbox.y_min:.0f}) → ({chunk.bbox.x_max:.0f}, {chunk.bbox.y_max:.0f})")
+                 st.markdown("**Text:**")
+                 st.text(chunk.text[:500])
+
+     with tab3:
+         st.markdown("### Layout Regions")
+         if result.layout_regions:
+             layout_data = []
+             for r in result.layout_regions:
+                 layout_data.append({
+                     "Type": r.layout_type.value,
+                     "Page": r.page + 1,
+                     "Confidence": f"{r.confidence:.1%}",
+                     "Position": f"({r.bbox.x_min:.0f}, {r.bbox.y_min:.0f})",
+                 })
+             st.dataframe(layout_data, width='stretch')
+         else:
+             st.info("No layout regions detected")
+
+
+ def render_demo_processing_results(doc_path):
+     """Render demo processing results when actual processing unavailable."""
+
+     doc_name = Path(doc_path).name
+
+     # Simulated metrics
+     col1, col2, col3, col4 = st.columns(4)
+
+     with col1:
+         st.metric("Pages", 12)
+     with col2:
+         st.metric("Chunks", 47)
+     with col3:
+         st.metric("Characters", "15,234")
+     with col4:
+         st.metric("OCR Confidence", "94.2%")
+
+     st.markdown("---")
+
+     # Demo chunks
+     demo_chunks = [
+         {
+             "type": "title",
+             "page": 1,
+             "confidence": 0.98,
+             "text": f"PATENT PLEDGE - {doc_name.split()[0]}",
+             "bbox": "(100, 50) → (700, 100)",
+         },
+         {
+             "type": "text",
+             "page": 1,
+             "confidence": 0.95,
+             "text": "This Patent Pledge is made by the undersigned company to promote innovation and reduce patent-related barriers...",
+             "bbox": "(100, 150) → (700, 300)",
+         },
+         {
+             "type": "text",
+             "page": 1,
+             "confidence": 0.92,
+             "text": "The company hereby pledges not to assert any patent claims against any party making, using, or selling products...",
+             "bbox": "(100, 320) → (700, 500)",
+         },
+     ]
+
+     tab1, tab2 = st.tabs(["📝 Extracted Text", "📦 Chunks"])
+
+     with tab1:
+         st.markdown("### Full Extracted Text")
+         demo_text = f"""
+ PATENT PLEDGE - {doc_name.split()[0]}
+
+ This Patent Pledge is made by the undersigned company to promote innovation
+ and reduce patent-related barriers in the technology industry.
+
+ DEFINITIONS:
+ 1. "Covered Patents" means all patents and patent applications owned by
+    the Pledgor that cover fundamental technologies.
+ 2. "Open Source Software" means software distributed under licenses
+    approved by the Open Source Initiative.
+
+ PLEDGE:
+ The company hereby pledges not to assert any Covered Patents against
+ any party making, using, selling, or distributing Open Source Software.
+
+ This pledge is irrevocable and shall remain in effect for the life
+ of all Covered Patents.
+
+ [Document continues with legal terms and conditions...]
+ """
+         st.text_area("Document Text", demo_text, height=400)
+
+     with tab2:
+         st.markdown("### Document Chunks")
+         for i, chunk in enumerate(demo_chunks):
+             with st.expander(f"Chunk {i+1}: {chunk['type']} (Page {chunk['page']})"):
+                 st.markdown(f"**Confidence:** {format_confidence(chunk['confidence'])}", unsafe_allow_html=True)
+                 st.markdown(f"**BBox:** {chunk['bbox']}")
+                 st.markdown("**Text:**")
+                 st.text(chunk["text"])
+
+
+ def render_extraction_page():
+     """Render the field extraction page."""
+     st.markdown("## 🔍 Field Extraction with Evidence")
+
+     st.markdown("""
+     Extract structured fields from documents with **evidence grounding**.
+     Every extracted value includes its source location (page, bbox, chunk_id).
+     """)
+
+     col1, col2 = st.columns([2, 1])
+
+     with col1:
+         # Document selection (guard against an empty Dataset folder so the
+         # button handler below never references an undefined name)
+         docs = get_sample_documents()
+         selected_doc = None
+         if docs:
+             selected_doc = st.selectbox("Select Document", docs, key="extract_doc")
+
+         st.markdown("### Fields to Extract")
+
+         # Predefined schemas
+         schema_type = st.selectbox(
+             "Extraction Schema",
+             ["Patent/Legal Document", "Invoice", "Contract", "Custom"],
+         )
+
+         if schema_type == "Patent/Legal Document":
+             default_fields = ["document_title", "company_name", "effective_date", "key_terms", "parties_involved"]
+         elif schema_type == "Invoice":
+             default_fields = ["invoice_number", "date", "total_amount", "vendor_name", "line_items"]
+         elif schema_type == "Contract":
+             default_fields = ["contract_title", "parties", "effective_date", "term_length", "key_obligations"]
+         else:
+             default_fields = ["field_1", "field_2"]
+
+         fields = st.multiselect(
+             "Select fields to extract",
+             default_fields,
+             default=default_fields[:3],
+         )
+
+     with col2:
+         st.markdown("### Extraction Options")
+         validate = st.checkbox("Validate with Critic", value=True)
+         include_evidence = st.checkbox("Include Evidence", value=True)
+         confidence_threshold = st.slider("Min Confidence", 0.0, 1.0, 0.7)
+
+     st.markdown("---")
+
+     if selected_doc and fields and st.button("🔍 Extract Fields", type="primary"):
+         extract_fields_demo(selected_doc, fields, validate, include_evidence)
+
+
+ def extract_fields_demo(doc_name, fields, validate, include_evidence):
+     """Demo field extraction."""
+
+     with st.spinner("Extracting fields..."):
+         time.sleep(1.5)
+
+     st.success("✅ Extraction complete!")
+
+     # Demo results
+     company = doc_name.split()[0] if doc_name else "Company"
+
+     demo_extractions = {
+         "document_title": {
+             "value": f"{company} Patent Non-Assertion Pledge",
+             "confidence": 0.96,
+             "page": 1,
+             "evidence": f"Found in header: '{company} Patent Non-Assertion Pledge' at position (100, 50)",
+         },
+         "company_name": {
+             "value": company,
+             "confidence": 0.98,
+             "page": 1,
+             "evidence": f"Identified as pledgor: '{company}' mentioned 15 times throughout document",
+         },
+         "effective_date": {
+             "value": doc_name.split()[-1].replace(".pdf", "") if len(doc_name.split()) > 1 else "N/A",
+             "confidence": 0.85,
+             "page": 1,
+             "evidence": "Date found in document header",
+         },
+         "key_terms": {
+             "value": "Patent pledge, Open source, Non-assertion, Royalty-free",
+             "confidence": 0.89,
+             "page": 2,
+             "evidence": "Key terms identified from definitions section",
+         },
+         "parties_involved": {
+             "value": f"{company}, Open Source Community",
+             "confidence": 0.82,
+             "page": 1,
+             "evidence": "Parties identified from pledge declaration",
+         },
+     }
+
+     # Display results
+     st.markdown("### Extracted Fields")
+
+     for field in fields:
+         if field in demo_extractions:
+             data = demo_extractions[field]
+
+             col1, col2 = st.columns([3, 1])
+
+             with col1:
+                 st.markdown(f"""
+                 <div class="chunk-card">
+                     <strong>{field.replace('_', ' ').title()}</strong>
+                     <p style="font-size: 1.2rem; margin: 0.5rem 0;">{data['value']}</p>
+                 </div>
+                 """, unsafe_allow_html=True)
+
+             with col2:
+                 st.markdown(f"**Confidence:** {format_confidence(data['confidence'])}", unsafe_allow_html=True)
+                 st.markdown(f"**Page:** {data['page']}")
+
+             if include_evidence:
+                 st.markdown(f"""
+                 <div class="evidence-box">
+                     📎 <strong>Evidence:</strong> {data['evidence']}
+                 </div>
+                 """, unsafe_allow_html=True)
+
+             st.markdown("")
+
+     # Validation results
+     if validate:
+         st.markdown("---")
+         st.markdown("### Validation Results")
+
+         col1, col2, col3 = st.columns(3)
+         with col1:
+             st.metric("Fields Validated", len(fields))
+         with col2:
+             st.metric("Valid", len(fields) - 1)
+         with col3:
+             st.metric("Uncertain", 1)
+
+         st.info("💡 Critic validation: All fields have supporting evidence in the document.")
+
+
+ def render_rag_page():
+     """Render the RAG Q&A page."""
+     st.markdown("## 💬 RAG Question Answering")
+
+     st.markdown("""
+     Ask questions about indexed documents. Answers include **citations** pointing to
+     the exact source chunks with page numbers and text snippets.
+     """)
+
+     # Index status
+     col1, col2 = st.columns([2, 1])
+
+     with col1:
+         st.markdown("### Ask a Question")
+
+         # Preset questions
+         preset_questions = [
+             "What is the main purpose of this document?",
+             "What patents are covered by this pledge?",
+             "What are the key terms and definitions?",
+             "Who are the parties involved?",
+             "What are the conditions for the pledge?",
+         ]
+
+         question_mode = st.radio(
+             "Question Mode",
+             ["Select preset", "Custom question"],
+             horizontal=True,
+         )
+
+         if question_mode == "Select preset":
+             question = st.selectbox("Select a question", preset_questions)
+         else:
+             question = st.text_input("Enter your question")
+
+         col_a, col_b = st.columns(2)
+         with col_a:
+             top_k = st.slider("Number of sources", 1, 10, 5)
+         with col_b:
+             show_confidence = st.checkbox("Show confidence scores", value=True)
+
+     with col2:
+         st.markdown("### Index Status")
+         st.markdown("""
+         - **Documents indexed:** 3
+         - **Total chunks:** 147
+         - **Embedding model:** nomic-embed-text
+         - **Vector dimension:** 768
+         """)
+
+     st.markdown("---")
+
+     if question and st.button("🔍 Get Answer", type="primary"):
+         rag_query_demo(question, top_k, show_confidence)
+
+
+ def rag_query_demo(question, top_k, show_confidence):
+     """Demo RAG query."""
+
+     with st.spinner("Searching documents and generating answer..."):
+         time.sleep(1.5)
+
+     # Demo answer based on question
+     demo_answers = {
+         "purpose": {
+             "answer": "The main purpose of this document is to establish a **Patent Non-Assertion Pledge** where the company commits not to assert certain patent claims against parties using, making, or distributing Open Source Software. This pledge aims to promote innovation and reduce patent-related barriers in the technology industry.",
+             "confidence": 0.92,
+             "citations": [
+                 {"index": 1, "page": 1, "snippet": "This Patent Pledge is made to promote innovation and reduce patent-related barriers...", "confidence": 0.95},
+                 {"index": 2, "page": 1, "snippet": "The company hereby pledges not to assert any patent claims against any party...", "confidence": 0.91},
+             ],
+         },
+         "patents": {
+             "answer": "The pledge covers **all patents and patent applications** owned by the Pledgor that relate to fundamental technologies used in Open Source Software. Specifically, these are referred to as 'Covered Patents' in the document, defined as patents that cover essential features or functionalities.",
+             "confidence": 0.88,
+             "citations": [
+                 {"index": 1, "page": 2, "snippet": "'Covered Patents' means all patents and patent applications owned by the Pledgor...", "confidence": 0.93},
+                 {"index": 2, "page": 2, "snippet": "Patents covering fundamental technologies essential to Open Source implementations...", "confidence": 0.85},
+             ],
+         },
+         "default": {
+             "answer": "Based on the available documents, this appears to be a **Patent Pledge** document from a major technology company. The document establishes terms for patent non-assertion related to Open Source Software, with specific definitions and conditions outlined in the legal text.",
+             "confidence": 0.75,
+             "citations": [
+                 {"index": 1, "page": 1, "snippet": "Patent Pledge document establishing non-assertion terms...", "confidence": 0.80},
+             ],
+         },
+     }
+
+     # Select answer based on question keywords
+     if "purpose" in question.lower() or "main" in question.lower():
+         result = demo_answers["purpose"]
+     elif "patent" in question.lower() and "cover" in question.lower():
+         result = demo_answers["patents"]
+     else:
+         result = demo_answers["default"]
+
+     # Display answer
+     st.markdown("### Answer")
+
+     st.markdown(f"""
+     <div style="background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+                 border-radius: 12px; padding: 1.5rem; margin: 1rem 0;">
+         {result['answer']}
+     </div>
+     """, unsafe_allow_html=True)
+
+     if show_confidence:
+         st.markdown(f"**Overall Confidence:** {format_confidence(result['confidence'])}", unsafe_allow_html=True)
+
+     # Citations
+     st.markdown("### 📚 Citations")
+
+     for citation in result["citations"][:top_k]:
+         # Build the optional confidence suffix outside the outer f-string;
+         # reusing double quotes inside an f-string is a syntax error before Python 3.12
+         conf_part = f" - Confidence: {citation['confidence']:.0%}" if show_confidence else ""
+         st.markdown(f"""
+         <div class="evidence-box">
+             <strong>[{citation['index']}] Page {citation['page']}</strong>{conf_part}
+             <br>
+             <em>"{citation['snippet']}"</em>
+         </div>
+         """, unsafe_allow_html=True)
+
+
+ def render_classification_page():
+     """Render the classification page."""
+     st.markdown("## 🏷️ Document Classification")
+
+     st.markdown("""
+     Automatically classify documents into predefined categories with confidence scores
+     and reasoning explanations.
+     """)
+
+     docs = get_sample_documents()
+     selected_doc = None  # guard: stays None when no sample documents exist
+
+     col1, col2 = st.columns([2, 1])
+
+     with col1:
+         if docs:
+             selected_doc = st.selectbox("Select Document to Classify", docs, key="classify_doc")
+
+         st.markdown("### Document Categories")
+         categories = [
+             "📜 Legal/Patent Document",
+             "📑 Contract/Agreement",
+             "📊 Financial Report",
+             "📋 Technical Specification",
+             "📄 General Business Document",
+         ]
+         st.markdown("\n".join([f"- {cat}" for cat in categories]))
+
+     with col2:
+         st.markdown("### Classification Options")
+         detailed_reasoning = st.checkbox("Show detailed reasoning", value=True)
+         multi_label = st.checkbox("Allow multiple categories", value=False)
+
+     st.markdown("---")
+
+     if selected_doc and st.button("🏷️ Classify Document", type="primary"):
+         classify_document_demo(selected_doc, detailed_reasoning)
+
+
+ def classify_document_demo(doc_name, detailed_reasoning):
+     """Demo document classification."""
+
+     with st.spinner("Analyzing document..."):
+         time.sleep(1.0)
+
+     st.success("✅ Classification complete!")
+
+     # Demo classification results
+     col1, col2 = st.columns([2, 1])
+
+     with col1:
+         st.markdown("### Primary Classification")
+         st.markdown("""
+         <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+                     border-radius: 12px; padding: 1.5rem; color: white; text-align: center;">
+             <h2 style="margin: 0;">📜 Legal/Patent Document</h2>
+             <p style="font-size: 1.2rem; margin: 0.5rem 0;">Patent Non-Assertion Pledge</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+     with col2:
+         st.markdown("### Confidence Scores")
+         st.markdown(f"**Legal/Patent:** {format_confidence(0.94)}", unsafe_allow_html=True)
+         st.markdown(f"**Contract:** {format_confidence(0.72)}", unsafe_allow_html=True)
+         st.markdown(f"**Technical:** {format_confidence(0.15)}", unsafe_allow_html=True)
+         st.markdown(f"**Financial:** {format_confidence(0.08)}", unsafe_allow_html=True)
+
+     if detailed_reasoning:
+         st.markdown("---")
+         st.markdown("### Classification Reasoning")
+
+         st.markdown("""
+         <div class="evidence-box">
+             <strong>Why Legal/Patent Document?</strong>
+             <ul>
+                 <li>Contains legal terminology: "pledge", "assert", "patent claims", "royalty-free"</li>
+                 <li>Structured as a formal legal declaration</li>
+                 <li>References specific patent-related definitions</li>
+                 <li>Contains commitment/obligation language</li>
+             </ul>
+         </div>
+         """, unsafe_allow_html=True)
+
+         st.markdown("""
+         <div class="chunk-card">
+             <strong>Key Indicators Found:</strong>
+             <br>
+             • "Patent Pledge" - Document title indicator (weight: 0.35)<br>
+             • "hereby pledges" - Legal commitment language (weight: 0.25)<br>
+             • "Covered Patents" - Patent-specific terminology (weight: 0.20)<br>
+             • "Open Source Software" - Tech/IP context (weight: 0.15)
+         </div>
+         """, unsafe_allow_html=True)
+
+
+ def render_analytics_page():
+     """Render the analytics page."""
+     st.markdown("## 📊 Processing Analytics")
+
+     st.markdown("View statistics and insights about document processing.")
+
+     # Summary metrics
+     col1, col2, col3, col4 = st.columns(4)
+
+     with col1:
+         st.metric("Documents Processed", 24, delta="+3 today")
+     with col2:
+         st.metric("Total Chunks", 1247, delta="+156")
+     with col3:
+         st.metric("Avg. Confidence", "91.3%", delta="+2.1%")
+     with col4:
+         st.metric("Questions Answered", 89, delta="+12")
+
+     st.markdown("---")
+
+     # Charts (import at chart-section level so both columns can use pandas)
+     import pandas as pd
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         st.markdown("### Document Types Processed")
+         chart_data = pd.DataFrame({
+             "Type": ["Patent/Legal", "Contract", "Technical", "Financial", "Other"],
+             "Count": [12, 5, 4, 2, 1],
+         })
+         st.bar_chart(chart_data.set_index("Type"))
+
+     with col2:
+         st.markdown("### Processing Performance")
+         perf_data = pd.DataFrame({
+             "Stage": ["OCR", "Layout", "Chunking", "Indexing", "Retrieval"],
+             "Avg Time (s)": [2.3, 0.8, 0.5, 1.2, 0.3],
+         })
+         st.bar_chart(perf_data.set_index("Stage"))
+
+     st.markdown("---")
+
+     # Recent activity
+     st.markdown("### Recent Activity")
+
+     activities = [
+         {"time": "2 min ago", "action": "Processed", "document": "IBM N_A.pdf", "chunks": 42},
+         {"time": "15 min ago", "action": "Indexed", "document": "Apple 11.11.2011.pdf", "chunks": 67},
+         {"time": "1 hour ago", "action": "Queried", "document": "RAG Collection", "chunks": 5},
+         {"time": "2 hours ago", "action": "Classified", "document": "Google 08.02.2012.pdf", "chunks": 0},
+     ]
+
+     for activity in activities:
+         # Precompute the optional suffix; nesting double quotes inside the
+         # f-string below is a syntax error before Python 3.12
+         chunk_part = f" ({activity['chunks']} chunks)" if activity["chunks"] > 0 else ""
+         st.markdown(f"""
+         <div class="chunk-card">
+             <strong>{activity['time']}</strong> - {activity['action']} <em>{activity['document']}</em>{chunk_part}
+         </div>
+         """, unsafe_allow_html=True)
+
+
+ def main():
+     """Main application."""
+     render_header()
+     page = render_sidebar()
+
+     # Route to appropriate page
+     if page == "🏠 Home":
+         render_home_page()
+     elif page == "📄 Document Processing":
+         render_document_processing_page()
+     elif page == "🔍 Field Extraction":
+         render_extraction_page()
+     elif page == "💬 RAG Q&A":
+         render_rag_page()
+     elif page == "🏷️ Classification":
+         render_classification_page()
+     elif page == "📊 Analytics":
+         render_analytics_page()
+
+     # Footer
+     st.markdown("---")
+     st.markdown(
+         "<div style='text-align: center; color: #666;'>"
+         "🔥 SPARKNET Document Intelligence Platform | Built with Streamlit"
+         "</div>",
+         unsafe_allow_html=True,
+     )
+
+
+ if __name__ == "__main__":
+     main()
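
The thresholds hard-coded in `format_confidence` (at least 0.8 for high, at least 0.6 for medium, anything below for low) drive all of the confidence color coding on these pages. A minimal sanity-check sketch, assuming the module is importable as `app` when run from the `demo/` directory (a hypothetical import path, not part of this commit):

    from app import format_confidence  # assumption: demo/ is the working directory

    assert "confidence-high" in format_confidence(0.85)
    assert "confidence-medium" in format_confidence(0.65)
    assert "confidence-low" in format_confidence(0.40)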
demo/llm_providers.py ADDED
@@ -0,0 +1,339 @@
+ """
+ Free LLM Providers for SPARKNET
+
+ Supports multiple free-tier LLM providers:
+ 1. HuggingFace Inference API (free, no payment required)
+ 2. Groq (free tier - very fast)
+ 3. Google Gemini (free tier)
+ 4. Local/Offline mode (simulated responses)
+ """
+
+ import os
+ import requests
+ from typing import Optional, Tuple, List
+ from dataclasses import dataclass
+ from loguru import logger
+
+
+ @dataclass
+ class LLMResponse:
+     text: str
+     model: str
+     provider: str
+     success: bool
+     error: Optional[str] = None
+
+
+ class HuggingFaceProvider:
+     """
+     HuggingFace Inference API - FREE tier available.
+
+     Models that work well on free tier:
+     - microsoft/DialoGPT-medium
+     - google/flan-t5-base
+     - mistralai/Mistral-7B-Instruct-v0.2 (may need Pro for heavy use)
+     - HuggingFaceH4/zephyr-7b-beta
+     """
+
+     API_URL = "https://api-inference.huggingface.co/models/"
+
+     # Free-tier friendly models
+     MODELS = {
+         "chat": "HuggingFaceH4/zephyr-7b-beta",
+         "chat_small": "microsoft/DialoGPT-medium",
+         "instruct": "google/flan-t5-large",
+         "embed": "sentence-transformers/all-MiniLM-L6-v2",
+     }
+
+     def __init__(self, api_token: Optional[str] = None):
+         """
+         Initialize HuggingFace provider.
+
+         Args:
+             api_token: HF token (optional but recommended for higher rate limits).
+                 Get a free token at: https://huggingface.co/settings/tokens
+         """
+         self.api_token = api_token or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
+         self.headers = {}
+         if self.api_token:
+             self.headers["Authorization"] = f"Bearer {self.api_token}"
+
+     def generate(self, prompt: str, model: Optional[str] = None, max_tokens: int = 500) -> LLMResponse:
+         """Generate text using the HuggingFace Inference API."""
+         model = model or self.MODELS["chat"]
+         url = f"{self.API_URL}{model}"
+
+         payload = {
+             "inputs": prompt,
+             "parameters": {
+                 "max_new_tokens": max_tokens,
+                 "temperature": 0.7,
+                 "do_sample": True,
+                 "return_full_text": False,
+             }
+         }
+
+         try:
+             response = requests.post(url, headers=self.headers, json=payload, timeout=60)
+
+             if response.status_code == 503:
+                 # Model is loading
+                 return LLMResponse(
+                     text="Model is loading, please try again in a moment...",
+                     model=model,
+                     provider="huggingface",
+                     success=False,
+                     error="Model loading"
+                 )
+
+             response.raise_for_status()
+             result = response.json()
+
+             if isinstance(result, list) and len(result) > 0:
+                 text = result[0].get("generated_text", "")
+             else:
+                 text = str(result)
+
+             return LLMResponse(
+                 text=text,
+                 model=model,
+                 provider="huggingface",
+                 success=True
+             )
+
+         except Exception as e:
+             logger.error(f"HuggingFace API error: {e}")
+             return LLMResponse(
+                 text="",
+                 model=model,
+                 provider="huggingface",
+                 success=False,
+                 error=str(e)
+             )
+
+     def embed(self, texts: List[str], model: Optional[str] = None) -> Tuple[List[List[float]], Optional[str]]:
+         """Generate embeddings using HuggingFace."""
+         model = model or self.MODELS["embed"]
+         url = f"{self.API_URL}{model}"
+
+         payload = {
+             "inputs": texts,
+             "options": {"wait_for_model": True}
+         }
+
+         try:
+             response = requests.post(url, headers=self.headers, json=payload, timeout=60)
+             response.raise_for_status()
+             embeddings = response.json()
+             return embeddings, None
+         except Exception as e:
+             logger.error(f"HuggingFace embed error: {e}")
+             return [], str(e)
+
+
+ class GroqProvider:
+     """
+     Groq - FREE tier with very fast inference.
+
+     Free tier includes:
+     - 14,400 requests/day for smaller models
+     - Very fast inference
+
+     Get a free API key at: https://console.groq.com/keys
+     """
+
+     API_URL = "https://api.groq.com/openai/v1/chat/completions"
+
+     MODELS = {
+         "fast": "llama-3.1-8b-instant",      # Fastest
+         "smart": "llama-3.3-70b-versatile",  # Best quality
+         "small": "gemma2-9b-it",             # Good balance
+     }
+
+     def __init__(self, api_key: Optional[str] = None):
+         self.api_key = api_key or os.environ.get("GROQ_API_KEY")
+         if not self.api_key:
+             logger.warning("No Groq API key found. Get a free key at: https://console.groq.com/keys")
+
+     def generate(self, prompt: str, model: Optional[str] = None, max_tokens: int = 500) -> LLMResponse:
+         """Generate text using the Groq API."""
+         if not self.api_key:
+             return LLMResponse(
+                 text="",
+                 model="",
+                 provider="groq",
+                 success=False,
+                 error="No Groq API key configured"
+             )
+
+         model = model or self.MODELS["fast"]
+
+         headers = {
+             "Authorization": f"Bearer {self.api_key}",
+             "Content-Type": "application/json"
+         }
+
+         payload = {
+             "model": model,
+             "messages": [{"role": "user", "content": prompt}],
+             "max_tokens": max_tokens,
+             "temperature": 0.7,
+         }
+
+         try:
+             response = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
+             response.raise_for_status()
+             result = response.json()
+
+             text = result["choices"][0]["message"]["content"]
+
+             return LLMResponse(
+                 text=text,
+                 model=model,
+                 provider="groq",
+                 success=True
+             )
+
+         except Exception as e:
+             logger.error(f"Groq API error: {e}")
+             return LLMResponse(
+                 text="",
+                 model=model,
+                 provider="groq",
+                 success=False,
+                 error=str(e)
+             )
+
+
+ class OfflineProvider:
+     """
+     Offline/Demo mode - no API required.
+
+     Provides simulated responses for demonstration purposes.
+     """
+
+     def __init__(self):
+         pass
+
+     def generate(self, prompt: str, context: str = "", **kwargs) -> LLMResponse:
+         """Generate a simulated response based on context."""
+
+         # Extract key information from context if provided
+         if context:
+             # Simple extractive response
+             sentences = context.split('.')
+             relevant = [s.strip() for s in sentences if len(s.strip()) > 20][:3]
+
+             if relevant:
+                 response = f"Based on the documents, {relevant[0].lower()}."
+                 if len(relevant) > 1:
+                     response += f" Additionally, {relevant[1].lower()}."
+             else:
+                 response = "Based on the available documents, I found relevant information but cannot generate a detailed response in offline mode."
+         else:
+             response = "I'm running in offline demo mode. To get AI-powered responses, please configure a free LLM provider (HuggingFace or Groq)."
+
+         return LLMResponse(
+             text=response,
+             model="offline",
+             provider="offline",
+             success=True
+         )
+
+     def embed(self, texts: List[str]) -> Tuple[List[List[float]], Optional[str]]:
+         """Generate simple bag-of-words style embeddings for demo."""
+         import hashlib
+
+         embeddings = []
+         for text in texts:
+             # Create deterministic pseudo-embeddings based on text hash:
+             # repeat the 32 SHA-256 bytes 12 times for a 384-dim vector (same as MiniLM)
+             hash_bytes = hashlib.sha256(text.encode()).digest()
+             embedding = [((b % 200) - 100) / 100.0 for b in hash_bytes * 12][:384]
+             embeddings.append(embedding)
+
+         return embeddings, None
+
+
+ class UnifiedLLMProvider:
+     """
+     Unified interface for all LLM providers.
+
+     Automatically selects the best available provider.
+     """
+
+     def __init__(self):
+         self.providers = {}
+         self.active_provider = None
+         self.active_embed_provider = None
+
+         # Try to initialize providers in order of preference
+         self._init_providers()
+
+     def _init_providers(self):
+         """Initialize available providers."""
+
+         # Check for Groq (fastest, generous free tier)
+         groq_key = os.environ.get("GROQ_API_KEY")
+         if groq_key:
+             self.providers["groq"] = GroqProvider(groq_key)
+             self.active_provider = "groq"
+             logger.info("Using Groq provider (free tier)")
+
+         # Check for HuggingFace (always available, even without token)
+         hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
+         self.providers["huggingface"] = HuggingFaceProvider(hf_token)
+         if not self.active_provider:
+             self.active_provider = "huggingface"
+             logger.info("Using HuggingFace provider")
+
+         # HuggingFace for embeddings (always free)
+         self.active_embed_provider = "huggingface"
+
+         # Offline fallback
+         self.providers["offline"] = OfflineProvider()
+
+         logger.info(f"LLM Provider: {self.active_provider}, Embed Provider: {self.active_embed_provider}")
+
+     def generate(self, prompt: str, **kwargs) -> LLMResponse:
+         """Generate text using the best available provider."""
+         provider = self.providers.get(self.active_provider)
+
+         if provider:
+             response = provider.generate(prompt, **kwargs)
+             if response.success:
+                 return response
+
+         # Fallback to offline
+         return self.providers["offline"].generate(prompt, **kwargs)
+
+     def embed(self, texts: List[str]) -> Tuple[List[List[float]], Optional[str]]:
+         """Generate embeddings using the best available provider."""
+         if self.active_embed_provider == "huggingface":
+             embeddings, error = self.providers["huggingface"].embed(texts)
+             if not error:
+                 return embeddings, None
+
+         # Fallback to offline embeddings
+         return self.providers["offline"].embed(texts)
+
+     def get_status(self) -> dict:
+         """Get status of all providers."""
+         return {
+             "active_llm": self.active_provider,
+             "active_embed": self.active_embed_provider,
+             "available_providers": list(self.providers.keys()),
+             "groq_configured": "groq" in self.providers and self.providers["groq"].api_key is not None,
+             "huggingface_configured": self.providers["huggingface"].api_token is not None,
+         }
+
+
+ # Global instance
+ _llm_provider: Optional[UnifiedLLMProvider] = None
+
+
+ def get_llm_provider() -> UnifiedLLMProvider:
+     """Get or create the unified LLM provider."""
+     global _llm_provider
+     if _llm_provider is None:
+         _llm_provider = UnifiedLLMProvider()
+     return _llm_provider
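
A minimal usage sketch for the unified provider, assuming `demo/` is on `sys.path` so the module imports as `llm_providers` (the import path and prompt text are assumptions; the calls match the definitions above):

    from llm_providers import get_llm_provider  # assumption: demo/ is on sys.path

    provider = get_llm_provider()
    print(provider.get_status())  # active_llm, active_embed, configured keys

    resp = provider.generate("Summarize the patent pledge in one sentence.")
    if resp.success:
        print(f"[{resp.provider}/{resp.model}] {resp.text[:200]}")

    # embed() falls back to deterministic hash vectors if HuggingFace is unreachable
    vectors, err = provider.embed(["patent pledge", "open source"])
    print(len(vectors), "vectors", err or "ok")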
demo/pages/1_🔬_Live_Processing.py ADDED
@@ -0,0 +1,714 @@
+ """
+ Live Document Processing Demo - SPARKNET
+
+ Real-time document processing with integrated state management and auto-indexing.
+ """
+
+ import streamlit as st
+ import sys
+ from pathlib import Path
+ import time
+ import io
+ import base64
+ from datetime import datetime
+ import hashlib
+
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
+ sys.path.insert(0, str(PROJECT_ROOT))
+ sys.path.insert(0, str(PROJECT_ROOT / "demo"))
+
+ # Import state manager and RAG config
+ from state_manager import (
+     get_state_manager,
+     ProcessedDocument as StateDocument,
+     generate_doc_id,
+     render_global_status_bar,
+ )
+ from rag_config import (
+     get_unified_rag_system,
+     auto_index_processed_document,
+     check_ollama,
+ )
+
+ st.set_page_config(page_title="Live Processing - SPARKNET", page_icon="🔬", layout="wide")
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+     .stage-card {
+         background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+         padding: 15px;
+         border-radius: 10px;
+         margin: 10px 0;
+         border-left: 4px solid #4ECDC4;
+     }
+     .stage-active {
+         border-left-color: #ffc107;
+         animation: pulse 1s infinite;
+     }
+     .stage-done {
+         border-left-color: #28a745;
+     }
+     .stage-error {
+         border-left-color: #dc3545;
+     }
+     @keyframes pulse {
+         0% { opacity: 1; }
+         50% { opacity: 0.7; }
+         100% { opacity: 1; }
+     }
+     .metric-card {
+         background: #161b22;
+         border-radius: 8px;
+         padding: 12px;
+         text-align: center;
+         border: 1px solid #30363d;
+     }
+     .metric-value {
+         font-size: 24px;
+         font-weight: bold;
+         color: #4ECDC4;
+     }
+     .metric-label {
+         font-size: 11px;
+         color: #8b949e;
+         text-transform: uppercase;
+     }
+     .action-btn {
+         margin: 5px;
+     }
+     .nav-card {
+         background: #0d1117;
+         border-radius: 10px;
+         padding: 15px;
+         margin: 10px 0;
+         border: 1px solid #30363d;
+         cursor: pointer;
+     }
+     .nav-card:hover {
+         border-color: #4ECDC4;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize state manager
+ state_manager = get_state_manager()
+
+
+ def process_document_actual(file_bytes: bytes, filename: str, options: dict) -> dict:
+     """
+     Process document using the actual document processing pipeline.
+
+     Returns processing results with all extracted data.
+     """
+     import tempfile
+     import os
+
+     # Create temp file
+     suffix = Path(filename).suffix
+     with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+         tmp.write(file_bytes)
+         tmp_path = tmp.name
+
+     try:
+         # Try to use actual document processor
+         try:
+             from src.document.pipeline.processor import (
+                 DocumentProcessor,
+                 PipelineConfig,
+             )
+             from src.document.ocr import OCRConfig
+             from src.document.layout import LayoutConfig
+             from src.document.chunking.chunker import ChunkerConfig
+
+             # Configure chunking with table preservation options
+             chunker_config = ChunkerConfig(
+                 preserve_table_structure=options.get("preserve_tables", True),
+                 detect_table_headers=options.get("detect_headers", True),
+                 chunk_tables=True,
+                 chunk_figures=True,
+                 include_captions=True,
+             )
+
+             # Configure layout detection
+             layout_config = LayoutConfig(
+                 method="rule_based",
+                 detect_tables=True,
+                 detect_figures=True,
+                 detect_headers=True,
+                 detect_titles=True,
+                 detect_lists=True,
+                 min_confidence=0.3,      # Lower threshold to detect more regions
+                 heading_font_ratio=1.1,  # More sensitive heading detection
+             )
+
+             # Configure pipeline with all options
+             config = PipelineConfig(
+                 ocr=OCRConfig(engine=options.get("ocr_engine", "paddleocr")),
+                 layout=layout_config,
+                 chunking=chunker_config,
+                 max_pages=options.get("max_pages", 10),
+                 include_ocr_regions=True,
+                 include_layout_regions=options.get("enable_layout", True),
+                 generate_full_text=True,
+             )
+
+             processor = DocumentProcessor(config)
+             processor.initialize()
+
+             # Process document
+             result = processor.process(tmp_path)
+
+             # Convert to dict format for state
+             chunks_list = []
+             for chunk in result.chunks:
+                 chunks_list.append({
+                     "chunk_id": chunk.chunk_id,
+                     "text": chunk.text,
+                     "page": chunk.page,
+                     "chunk_type": chunk.chunk_type.value,
+                     "confidence": chunk.confidence,
+                     "bbox": chunk.bbox.to_xyxy() if chunk.bbox else None,
+                 })
+
+             ocr_regions = []
+             for region in result.ocr_regions:
+                 ocr_regions.append({
+                     "text": region.text,
+                     "confidence": region.confidence,
+                     "page": region.page,
+                     "bbox": region.bbox.to_xyxy() if region.bbox else None,
+                 })
+
+             layout_regions = []
+             for region in result.layout_regions:
+                 layout_regions.append({
+                     "id": region.id,
+                     "type": region.type.value,
+                     "confidence": region.confidence,
+                     "page": region.page,
+                     "bbox": region.bbox.to_xyxy() if region.bbox else None,
+                 })
+
+             return {
+                 "success": True,
+                 "raw_text": result.full_text,
+                 "chunks": chunks_list,
+                 "ocr_regions": ocr_regions,
+                 "layout_regions": layout_regions,
+                 "page_count": result.metadata.num_pages,
+                 "ocr_confidence": result.metadata.ocr_confidence_avg or 0.0,
+                 "layout_confidence": result.metadata.layout_confidence_avg or 0.0,
+             }
+
+         except Exception as e:
+             # Fallback: use simple text extraction
+             return process_document_fallback(file_bytes, filename, options, str(e))
+
+     finally:
+         # Cleanup
+         if os.path.exists(tmp_path):
+             os.unlink(tmp_path)
+
+
+ def process_document_fallback(file_bytes: bytes, filename: str, options: dict, reason: str) -> dict:
+     """
+     Fallback document processing using simple text extraction.
+     """
+     text = ""
+     page_count = 1
+
+     suffix = Path(filename).suffix.lower()
+
+     # Try PyMuPDF for PDFs
+     if suffix == ".pdf":
+         try:
+             import fitz
+             pdf_stream = io.BytesIO(file_bytes)
+             doc = fitz.open(stream=pdf_stream, filetype="pdf")
+             page_count = len(doc)
+             max_pages = min(options.get("max_pages", 5), page_count)
+
+             text_parts = []
+             for page_num in range(max_pages):
+                 page = doc[page_num]
+                 text_parts.append(f"--- Page {page_num + 1} ---\n{page.get_text()}")
+             text = "\n\n".join(text_parts)
+             doc.close()
+         except Exception as pdf_e:
+             text = f"PDF extraction failed: {pdf_e}"
+
+     elif suffix in [".txt", ".md"]:
+         try:
+             text = file_bytes.decode("utf-8")
+         except UnicodeDecodeError:
+             text = file_bytes.decode("latin-1", errors="ignore")
+
+     else:
+         text = f"Unsupported file type: {suffix}"
+
+     # Simple fixed-size chunking with overlap
+     chunk_size = 500
+     overlap = 50
+     chunks = []
+
+     for i in range(0, len(text), chunk_size - overlap):
+         chunk_text = text[i:i + chunk_size]
+         if len(chunk_text.strip()) > 20:
+             chunks.append({
+                 "chunk_id": f"chunk_{len(chunks)}",
+                 "text": chunk_text,
+                 "page": 0,
+                 "chunk_type": "text",
+                 "confidence": 0.9,
+                 "bbox": None,
+             })
+
+     return {
+         "success": True,
+         "raw_text": text,
+         "chunks": chunks,
+         "ocr_regions": [],
+         "layout_regions": [],
+         "page_count": page_count,
+         "ocr_confidence": 0.9,
+         "layout_confidence": 0.0,
+         "fallback_reason": reason,
+     }
+
+
+ def get_page_images(file_bytes: bytes, filename: str, max_pages: int = 5) -> list:
+     """Extract page images from PDF for visualization."""
+     images = []
+     suffix = Path(filename).suffix.lower()
+
+     if suffix == ".pdf":
+         try:
+             import fitz
+             pdf_stream = io.BytesIO(file_bytes)
+             doc = fitz.open(stream=pdf_stream, filetype="pdf")
+             page_count = min(len(doc), max_pages)
+
+             for page_num in range(page_count):
+                 page = doc[page_num]
+                 pix = page.get_pixmap(dpi=100)
+                 img_bytes = pix.tobytes("png")
+                 images.append({
+                     "page": page_num,
+                     "data": base64.b64encode(img_bytes).decode(),
+                     "width": pix.width,
+                     "height": pix.height,
+                 })
+             doc.close()
+         except Exception:
+             # Page previews are optional; return whatever was rendered
+             pass
+
+     return images
+
+
+ # Header
+ st.markdown("# 🔬 Live Document Processing")
+ st.markdown("Process documents in real-time with auto-indexing to RAG")
+
+ # Global status bar
+ render_global_status_bar()
+
+ st.markdown("---")
+
+ # Main content
+ col_upload, col_status = st.columns([2, 1])
+
+ with col_upload:
+     st.markdown("### 📤 Upload Document")
+
+     uploaded_file = st.file_uploader(
+         "Choose a document",
+         type=["pdf", "txt", "md"],
+         help="Upload PDF, TXT, or MD files for processing"
+     )
+
+     # Or select from existing files
+     docs_path = PROJECT_ROOT / "Dataset"
+     existing_docs = sorted([f.name for f in docs_path.glob("*.pdf")]) if docs_path.exists() else []
+
+     if existing_docs:
+         st.markdown("**Or select from samples:**")
+         selected_sample = st.selectbox("Sample documents", ["-- Select --"] + existing_docs)
+
+ with col_status:
+     st.markdown("### 📊 System Status")
+
+     ollama_ok, models = check_ollama()
+     rag_system = get_unified_rag_system()
+
+     status_cols = st.columns(2)
+     with status_cols[0]:
+         if ollama_ok:
+             st.success(f"Ollama ({len(models)})")
+         else:
+             st.error("Ollama Offline")
+     with status_cols[1]:
+         if rag_system["status"] == "ready":
+             st.success("RAG Ready")
+         else:
+             st.error("RAG Error")
+
+     # State summary
+     summary = state_manager.get_summary()
+     st.metric("Processed Docs", summary["total_documents"])
+     st.metric("Indexed Chunks", summary["total_indexed_chunks"])
+
+ st.markdown("---")
+
+ # Processing options
+ st.markdown("### ⚙️ Processing Options")
+
+ opt_cols = st.columns(4)
+ with opt_cols[0]:
+     ocr_engine = st.radio("OCR Engine", ["paddleocr", "tesseract"], horizontal=True,
+                           help="PaddleOCR is faster and more accurate for most documents")
+ with opt_cols[1]:
+     max_pages = st.slider("Max pages", 1, 50, 10, help="Maximum number of pages to process")
+ with opt_cols[2]:
+     enable_layout = st.checkbox("Layout detection", value=True,
+                                 help="Detect tables, figures, headings and other layout elements")
+ with opt_cols[3]:
+     auto_index = st.checkbox("Auto-index to RAG", value=True,
+                              help="Automatically index processed documents for RAG queries")
+
+ # Advanced options (collapsed by default)
+ with st.expander("🔧 Advanced Options", expanded=False):
+     adv_cols = st.columns(3)
+     with adv_cols[0]:
+         preserve_tables = st.checkbox("Preserve table structure", value=True,
+                                       help="Convert tables to markdown format with structure")
+     with adv_cols[1]:
+         detect_headers = st.checkbox("Detect table headers", value=True,
+                                      help="Automatically identify header rows in tables")
+     with adv_cols[2]:
+         generate_embeddings = st.checkbox("Generate embeddings", value=True,
+                                           help="Create embeddings for semantic search")
+
+ # Determine what to process
+ file_to_process = None
+ file_bytes = None
+ filename = None
+
+ if uploaded_file is not None:
+     file_bytes = uploaded_file.read()
+     filename = uploaded_file.name
+     file_to_process = "upload"
+ elif existing_docs and selected_sample != "-- Select --":
+     file_path = docs_path / selected_sample
+     file_bytes = file_path.read_bytes()
+     filename = selected_sample
+     file_to_process = "sample"
+
+ # Process button
+ if file_to_process and st.button("🚀 Start Processing", type="primary", use_container_width=True):
+
+     # Generate document ID from the filename and a content hash
+     content_hash = hashlib.md5(file_bytes[:1000]).hexdigest()[:8]
+     doc_id = generate_doc_id(filename, content_hash)
+
+     # Start processing in state manager
+     state_manager.start_processing(doc_id, filename)
+
+     # Pipeline stages
+     stages = [
+         ("loading", "📄 Loading Document", "Reading and preparing document..."),
+         ("ocr", f"🔍 {ocr_engine.upper()} Extraction", "Extracting text from document..."),
+         ("layout", "📐 Layout Detection", "Identifying document structure..."),
+         ("chunking", "✂️ Semantic Chunking", "Creating meaningful text chunks..."),
+         ("indexing", "📚 RAG Indexing", "Adding to vector store..."),
+     ]
+
+     # Progress container
+     progress_container = st.container()
+     results_container = st.container()
+
+     with progress_container:
+         progress_bar = st.progress(0)
+         status_text = st.empty()
+
+         # Metrics row
+         metric_cols = st.columns(5)
+         metric_placeholders = {
+             "pages": metric_cols[0].empty(),
+             "ocr_regions": metric_cols[1].empty(),
+             "layout_regions": metric_cols[2].empty(),
+             "chunks": metric_cols[3].empty(),
+             "confidence": metric_cols[4].empty(),
+         }
+
+     processing_start = time.time()
+     processing_result = None
+     error_msg = None
+
+     try:
+         # Stage 1: Loading
+         status_text.markdown("**📄 Loading document...**")
+         state_manager.update_processing(doc_id, "loading", 0.1, "Loading document...")
+         progress_bar.progress(10)
+         time.sleep(0.3)
+
+         # Get page images for visualization
+         page_images = get_page_images(file_bytes, filename, max_pages)
+         metric_placeholders["pages"].metric("Pages", len(page_images) if page_images else "N/A")
+
+         # Stages 2-3: OCR + layout
+         status_text.markdown(f"**🔍 Running {ocr_engine.upper()}...**")
+         state_manager.update_processing(doc_id, "ocr", 0.3, f"Running {ocr_engine}...")
+         progress_bar.progress(30)
+
+         # Actual processing with all options
+         options = {
+             "ocr_engine": ocr_engine,
+             "max_pages": max_pages,
+             "enable_layout": enable_layout,
+             "preserve_tables": preserve_tables,
+             "detect_headers": detect_headers,
+             "generate_embeddings": generate_embeddings,
+         }
+         processing_result = process_document_actual(file_bytes, filename, options)
+
+         # Update metrics
+         metric_placeholders["pages"].metric("Pages", processing_result.get("page_count", 0))
+         metric_placeholders["ocr_regions"].metric("OCR Regions", len(processing_result.get("ocr_regions", [])))
+
+         status_text.markdown("**📐 Layout detection...**")
+         state_manager.update_processing(doc_id, "layout", 0.5, "Detecting layout...")
+         progress_bar.progress(50)
+         time.sleep(0.2)
+
+         metric_placeholders["layout_regions"].metric("Layout Regions", len(processing_result.get("layout_regions", [])))
+
+         # Stage 4: Chunking
+         status_text.markdown("**✂️ Creating chunks...**")
+         state_manager.update_processing(doc_id, "chunking", 0.7, "Creating chunks...")
+         progress_bar.progress(70)
+         time.sleep(0.2)
+
+         chunks = processing_result.get("chunks", [])
+         metric_placeholders["chunks"].metric("Chunks", len(chunks))
+         metric_placeholders["confidence"].metric(
+             "Confidence",
+             f"{processing_result.get('ocr_confidence', 0) * 100:.0f}%"
+         )
+
+         # Stage 5: RAG indexing
+         indexed_count = 0
+         if auto_index and rag_system["status"] == "ready" and chunks:
+             status_text.markdown("**📚 Indexing to RAG...**")
+             state_manager.update_processing(doc_id, "indexing", 0.9, "Indexing to RAG...")
+             progress_bar.progress(90)
+
+             # Auto-index
+             index_result = auto_index_processed_document(
+                 doc_id=doc_id,
+                 text=processing_result.get("raw_text", ""),
+                 chunks=chunks,
+                 metadata={"filename": filename, "source": file_to_process}
+             )
+
+             if index_result["success"]:
+                 indexed_count = index_result["num_chunks"]
+                 state_manager.mark_indexed(doc_id, indexed_count)
+
+         # Complete
+         progress_bar.progress(100)
+         processing_time = time.time() - processing_start
+
+         # Add to state manager
+         state_doc = StateDocument(
+             doc_id=doc_id,
+             filename=filename,
+             file_type=Path(filename).suffix[1:].upper(),
+             raw_text=processing_result.get("raw_text", ""),
+             chunks=chunks,
+             page_count=processing_result.get("page_count", 1),
+             page_images=[img["data"] for img in page_images],
+             ocr_regions=processing_result.get("ocr_regions", []),
+             layout_data={"regions": processing_result.get("layout_regions", [])},
+             indexed=indexed_count > 0,
+             indexed_chunks=indexed_count,
+             processing_time=processing_time,
+         )
+         state_manager.add_document(state_doc)
+         state_manager.complete_processing(doc_id, success=True)
+         state_manager.set_active_document(doc_id)
+
+         status_text.success(f"✅ Processing complete in {processing_time:.2f}s!")
+
+     except Exception as e:
+         error_msg = str(e)
+         state_manager.complete_processing(doc_id, success=False, error=error_msg)
+         status_text.error(f"❌ Processing failed: {error_msg}")
+
+     # Results
+     if processing_result and processing_result.get("success"):
+         with results_container:
+             st.markdown("---")
+             st.markdown("### 📋 Processing Results")
+
+             # Summary cards
+             sum_cols = st.columns(5)
+             sum_cols[0].markdown(f"""
+             <div class="metric-card">
+                 <div class="metric-value">{processing_result.get('page_count', 0)}</div>
+                 <div class="metric-label">Pages</div>
+             </div>
+             """, unsafe_allow_html=True)
+             sum_cols[1].markdown(f"""
+             <div class="metric-card">
+                 <div class="metric-value">{len(processing_result.get('ocr_regions', []))}</div>
+                 <div class="metric-label">OCR Regions</div>
+             </div>
+             """, unsafe_allow_html=True)
+             sum_cols[2].markdown(f"""
+             <div class="metric-card">
+                 <div class="metric-value">{len(processing_result.get('layout_regions', []))}</div>
+                 <div class="metric-label">Layout Regions</div>
+             </div>
+             """, unsafe_allow_html=True)
+             sum_cols[3].markdown(f"""
+             <div class="metric-card">
+                 <div class="metric-value">{len(chunks)}</div>
+                 <div class="metric-label">Chunks</div>
+             </div>
+             """, unsafe_allow_html=True)
+             sum_cols[4].markdown(f"""
+             <div class="metric-card">
+                 <div class="metric-value">{indexed_count}</div>
+                 <div class="metric-label">Indexed</div>
+             </div>
+             """, unsafe_allow_html=True)
+
+             # Show fallback warning prominently if the fallback path was used
+             if processing_result.get("fallback_reason"):
+                 st.error(f"⚠️ **Fallback Mode**: Document processor failed, using simple text extraction. Layout detection unavailable. Reason: {processing_result['fallback_reason']}")
+
+             # Tabs for detailed results
+             tab_text, tab_chunks, tab_layout, tab_pages = st.tabs([
+                 "📝 Extracted Text",
+                 "📦 Chunks",
+                 "🗺️ Layout",
+                 "📄 Pages"
+             ])
+
+             with tab_text:
+                 text_preview = processing_result.get("raw_text", "")[:5000]
+                 if len(processing_result.get("raw_text", "")) > 5000:
+                     text_preview += "\n\n... [truncated] ..."
+                 st.text_area("Full Text", text_preview, height=400)
+
+                 if processing_result.get("fallback_reason"):
+                     st.warning(f"Using fallback extraction: {processing_result['fallback_reason']}")
+
+             with tab_chunks:
+                 for i, chunk in enumerate(chunks[:20]):
+                     chunk_type = chunk.get("chunk_type", "text")
+                     conf = chunk.get("confidence", 0)
+                     color = "#4ECDC4" if conf > 0.8 else "#ffc107" if conf > 0.6 else "#dc3545"
+
+                     with st.expander(f"[{i+1}] {chunk_type.upper()} - {chunk.get('text', '')[:50]}..."):
+                         col1, col2, col3 = st.columns([2, 1, 1])
+                         col1.markdown(f"**Chunk ID:** `{chunk.get('chunk_id', 'N/A')}`")
+                         col2.markdown(f"**Page:** {chunk.get('page', 0) + 1}")
+                         col3.markdown(f"**Confidence:** <span style='color:{color}'>{conf:.0%}</span>", unsafe_allow_html=True)
+                         st.code(chunk.get("text", ""), language=None)
+
+                 if len(chunks) > 20:
+                     st.info(f"Showing 20 of {len(chunks)} chunks")
+
+             with tab_layout:
+                 layout_regions = processing_result.get("layout_regions", [])
+                 if layout_regions:
+                     # Group by type
+                     by_type = {}
+                     for r in layout_regions:
+                         t = r.get("type", "unknown")
+                         by_type[t] = by_type.get(t, 0) + 1
+
+                     st.markdown("**Detected Region Types:**")
+                     type_cols = st.columns(min(len(by_type), 6))
+                     for i, (rtype, count) in enumerate(by_type.items()):
635
+ type_cols[i % 6].metric(rtype.title(), count)
636
+
637
+ st.markdown("**Regions:**")
638
+ for r in layout_regions[:15]:
639
+ conf = r.get("confidence", 0)
640
+ color = "#4ECDC4" if conf > 0.8 else "#ffc107" if conf > 0.6 else "#dc3545"
641
+ st.markdown(f"- **{r.get('type', 'unknown').upper()}** (page {r.get('page', 0) + 1}) - Confidence: <span style='color:{color}'>{conf:.0%}</span>", unsafe_allow_html=True)
642
+ else:
643
+ # Provide helpful message based on cause
644
+ if processing_result.get("fallback_reason"):
645
+ st.warning("Layout detection unavailable - document processor is using fallback mode. Check the error message above.")
646
+ elif not enable_layout:
647
+ st.info("Layout detection is disabled. Enable it in the options above.")
648
+ else:
649
+ st.info("No layout regions detected. The document may have minimal structure or the OCR results didn't contain enough text patterns for layout analysis.")
650
+
651
+ with tab_pages:
652
+ if page_images:
653
+ for img_data in page_images:
654
+ st.markdown(f"**Page {img_data['page'] + 1}** ({img_data['width']}x{img_data['height']})")
655
+ st.image(
656
+ f"data:image/png;base64,{img_data['data']}",
657
+ use_container_width=True
658
+ )
659
+ else:
660
+ st.info("Page images not available")
661
+
662
+ # Navigation to other modules
663
+ st.markdown("---")
664
+ st.markdown("### 🔗 Continue With This Document")
665
+
666
+ nav_cols = st.columns(3)
667
+
668
+ with nav_cols[0]:
669
+ st.markdown("""
670
+ <div class="nav-card">
671
+ <h4>💬 Interactive RAG</h4>
672
+ <p style="color: #8b949e;">Ask questions about this document using the RAG system.</p>
673
+ </div>
674
+ """, unsafe_allow_html=True)
675
+ if st.button("Go to Interactive RAG", key="nav_rag", use_container_width=True):
676
+ st.switch_page("pages/2_💬_Interactive_RAG.py")
677
+
678
+ with nav_cols[1]:
679
+ st.markdown("""
680
+ <div class="nav-card">
681
+ <h4>📄 Document Viewer</h4>
682
+ <p style="color: #8b949e;">View chunks, layout, and visual annotations.</p>
683
+ </div>
684
+ """, unsafe_allow_html=True)
685
+ if st.button("Go to Document Viewer", key="nav_viewer", use_container_width=True):
686
+ st.switch_page("pages/5_📄_Document_Viewer.py")
687
+
688
+ with nav_cols[2]:
689
+ st.markdown("""
690
+ <div class="nav-card">
691
+ <h4>🎯 Evidence Viewer</h4>
692
+ <p style="color: #8b949e;">Inspect OCR regions and evidence grounding.</p>
693
+ </div>
694
+ """, unsafe_allow_html=True)
695
+ if st.button("Go to Evidence Viewer", key="nav_evidence", use_container_width=True):
696
+ st.switch_page("pages/4_🎯_Evidence_Viewer.py")
697
+
698
+ # Show recent processed documents
699
+ st.markdown("---")
700
+ st.markdown("### 📚 Recently Processed")
701
+
702
+ all_docs = state_manager.get_all_documents()
703
+ if all_docs:
704
+ for doc in reversed(all_docs[-5:]):
705
+ col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
706
+ col1.markdown(f"**{doc.filename}** (`{doc.doc_id[:8]}...`)")
707
+ col2.markdown(f"📄 {doc.page_count} pages")
708
+ col3.markdown(f"📦 {len(doc.chunks)} chunks")
709
+ if doc.indexed:
710
+ col4.success(f"✓ Indexed ({doc.indexed_chunks})")
711
+ else:
712
+ col4.warning("Not indexed")
713
+ else:
714
+ st.info("No documents processed yet. Upload or select a document above.")
demo/pages/2_💬_Interactive_RAG.py ADDED
@@ -0,0 +1,844 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Interactive Multi-Agentic RAG - SPARKNET
3
+
4
+ Query your documents using the unified RAG system with document filtering
5
+ and real-time chunk inspection.
6
+ """
7
+
8
+ import streamlit as st
9
+ import sys
10
+ from pathlib import Path
11
+ import time
12
+ import hashlib
13
+
14
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
15
+ sys.path.insert(0, str(PROJECT_ROOT))
16
+ sys.path.insert(0, str(PROJECT_ROOT / "demo"))
17
+
18
+ # Import unified RAG configuration and state manager
19
+ from rag_config import (
20
+ get_unified_rag_system,
21
+ get_store_stats,
22
+ index_document,
23
+ query_rag,
24
+ check_ollama,
25
+ get_indexed_documents,
26
+ search_similar_chunks,
27
+ )
28
+ from state_manager import (
29
+ get_state_manager,
30
+ render_global_status_bar,
31
+ )
32
+ import re
33
+ from collections import Counter
34
+
35
+
36
+ def clean_filename_for_question(filename: str) -> str:
37
+ """
38
+ Clean a filename to make it suitable for use in a question.
39
+ Handles cases like 'Red_Hat_NA.pdf' -> 'Red Hat' (removing short tokens).
40
+ """
41
+ # Remove extension
42
+ name = Path(filename).stem
43
+
44
+ # Replace separators with spaces
45
+ name = re.sub(r'[_\-\.]+', ' ', name)
46
+
47
+ # Split into words and filter
48
+ words = name.split()
49
+
50
+ # Remove very short tokens (like 'NA', 'V1', etc.) and numbers
51
+ cleaned_words = []
52
+ for word in words:
53
+ # Skip if too short (1-2 chars) unless it's a known acronym
54
+ if len(word) <= 2 and not word.upper() in ['AI', 'ML', 'NLP', 'API', 'UI', 'UX']:
55
+ continue
56
+ # Skip pure numbers or version-like strings
57
+ if re.match(r'^[vV]?\d+$', word):
58
+ continue
59
+ # Skip common file suffixes
60
+ if word.lower() in ['final', 'draft', 'copy', 'new', 'old', 'v1', 'v2']:
61
+ continue
62
+ cleaned_words.append(word)
63
+
64
+ # Join and clean up extra spaces
65
+ result = ' '.join(cleaned_words).strip()
66
+
67
+ # If result is too short, return None
68
+ if len(result) < 3:
69
+ return None
70
+
71
+ return result
72
+
73
+
74
+ def generate_dynamic_questions(state_manager, indexed_docs, max_questions=4):
75
+ """
76
+ Generate dynamic suggested questions based on indexed document content.
77
+
78
+ Analyzes:
79
+ - Document titles and filenames
80
+ - Chunk content for key topics
81
+ - Table presence
82
+ - Document types
83
+ - Detected entities and keywords
84
+ """
85
+ questions = []
86
+
87
+ # Get all indexed documents from state manager
88
+ all_docs = state_manager.get_all_documents()
89
+ indexed_doc_list = [d for d in all_docs if d.indexed]
90
+
91
+ if not indexed_doc_list and not indexed_docs:
92
+ # No documents indexed - return generic questions
93
+ return [
94
+ "What is the main topic of this document?",
95
+ "Summarize the key points",
96
+ "What are the main findings?",
97
+ "List the important details",
98
+ ]
99
+
100
+ # Collect document info
101
+ doc_names = []
102
+ all_text_samples = []
103
+ has_tables = False
104
+ has_figures = False
105
+ doc_types = set()
106
+
107
+ for doc in indexed_doc_list:
108
+ doc_names.append(doc.filename)
109
+ doc_types.add(doc.file_type.lower())
110
+
111
+ # Sample text from chunks
112
+ for chunk in doc.chunks[:10]: # First 10 chunks
113
+ chunk_text = chunk.get('text', '') if isinstance(chunk, dict) else str(chunk)
114
+ all_text_samples.append(chunk_text[:500])
115
+
116
+ # Check for tables
117
+ chunk_type = chunk.get('chunk_type', '') if isinstance(chunk, dict) else ''
118
+ if 'table' in chunk_type.lower():
119
+ has_tables = True
120
+ if 'figure' in chunk_type.lower() or 'chart' in chunk_type.lower():
121
+ has_figures = True
122
+
123
+ # Also check indexed_docs from RAG system
124
+ for doc_info in indexed_docs[:5]:
125
+ if isinstance(doc_info, dict):
126
+ doc_names.append(doc_info.get('filename', doc_info.get('doc_id', '')))
127
+
128
+ # Extract key topics from text samples
129
+ combined_text = ' '.join(all_text_samples).lower()
130
+
131
+ # Extract potential topics (simple keyword extraction)
132
+ stop_words = {
133
+ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
134
+ 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
135
+ 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
136
+ 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
137
+ 'this', 'that', 'these', 'those', 'it', 'its', 'as', 'if', 'when',
138
+ 'than', 'so', 'no', 'not', 'only', 'own', 'same', 'too', 'very',
139
+ 'just', 'also', 'now', 'here', 'there', 'where', 'why', 'how', 'all',
140
+ 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
141
+ 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between',
142
+ 'under', 'again', 'further', 'then', 'once', 'any', 'about', 'which', 'who',
143
+ 'page', 'document', 'file', 'section', 'chapter', 'figure', 'table',
144
+ }
145
+
146
+ # Extract words (3+ chars, not numbers)
147
+ words = re.findall(r'\b[a-z]{3,}\b', combined_text)
148
+ meaningful_words = [w for w in words if w not in stop_words and len(w) > 3]
149
+ word_freq = Counter(meaningful_words)
150
+ top_topics = [word for word, count in word_freq.most_common(15) if count > 2]
151
+
152
+ # Generate questions based on top topics (prioritize content-based questions)
153
+ if top_topics:
154
+ topic = top_topics[0]
155
+ questions.append(f"What does the document say about {topic}?")
156
+
157
+ if len(top_topics) > 1:
158
+ questions.append(f"Explain the {top_topics[1]} mentioned in the document")
159
+
160
+ if len(top_topics) > 2:
161
+ questions.append(f"How are {top_topics[0]} and {top_topics[2]} related?")
162
+
163
+ # Generate questions based on clean document names (only if name is meaningful)
164
+ for name in doc_names[:2]:
165
+ clean_name = clean_filename_for_question(name)
166
+ if clean_name and len(clean_name) > 5:
167
+ questions.append(f"Summarize the {clean_name} document")
168
+ break # Only use one document name question
169
+
170
+ # Add table-specific question if tables detected
171
+ if has_tables:
172
+ questions.append("What data is presented in the tables?")
173
+
174
+ # Add figure-specific question if figures detected
175
+ if has_figures:
176
+ questions.append("What do the figures and charts show?")
177
+
178
+ # Add document-type specific questions
179
+ if 'pdf' in doc_types:
180
+ questions.append("What are the main conclusions?")
181
+ if 'docx' in doc_types or 'doc' in doc_types:
182
+ questions.append("What recommendations are made?")
183
+ if 'xlsx' in doc_types or 'xls' in doc_types:
184
+ questions.append("What trends are visible in the data?")
185
+
186
+ # Add content-aware generic questions
187
+ generic_questions = [
188
+ "Summarize the key points in this document",
189
+ "What are the main findings discussed?",
190
+ "What methodology or approach is described?",
191
+ "What are the important takeaways?",
192
+ "List the main topics covered",
193
+ "What problems or challenges are mentioned?",
194
+ ]
195
+
196
+ # Fill remaining slots with generic questions
197
+ for q in generic_questions:
198
+ if len(questions) >= max_questions:
199
+ break
200
+ if q not in questions:
201
+ questions.append(q)
202
+
203
+ # Ensure we have unique questions and limit to max
204
+ seen = set()
205
+ unique_questions = []
206
+ for q in questions:
207
+ q_lower = q.lower()
208
+ if q_lower not in seen:
209
+ seen.add(q_lower)
210
+ unique_questions.append(q)
211
+ if len(unique_questions) >= max_questions:
212
+ break
213
+
214
+ # Fallback if we don't have enough
215
+ while len(unique_questions) < max_questions:
216
+ fallback = [
217
+ "What is this document about?",
218
+ "Summarize the key points",
219
+ "What are the main findings?",
220
+ "What conclusions are drawn?",
221
+ ]
222
+ for q in fallback:
223
+ if q not in unique_questions:
224
+ unique_questions.append(q)
225
+ break
226
+ if len(unique_questions) >= max_questions:
227
+ break
228
+
229
+ return unique_questions[:max_questions]
230
+
231
+ st.set_page_config(
232
+ page_title="Interactive RAG - SPARKNET",
233
+ page_icon="💬",
234
+ layout="wide"
235
+ )
236
+
237
+ # Custom CSS
238
+ st.markdown("""
239
+ <style>
240
+ .chat-message {
241
+ padding: 15px;
242
+ border-radius: 12px;
243
+ margin: 10px 0;
244
+ }
245
+ .user-message {
246
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
247
+ color: white;
248
+ }
249
+ .assistant-message {
250
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
251
+ color: #eee;
252
+ }
253
+ .source-card {
254
+ background: #0d1117;
255
+ border-radius: 8px;
256
+ padding: 12px;
257
+ margin: 8px 0;
258
+ border-left: 4px solid #4ECDC4;
259
+ }
260
+ .source-header {
261
+ font-size: 11px;
262
+ color: #888;
263
+ margin-bottom: 6px;
264
+ }
265
+ .source-text {
266
+ font-size: 13px;
267
+ color: #c9d1d9;
268
+ font-family: monospace;
269
+ }
270
+ .metric-box {
271
+ background: #161b22;
272
+ border-radius: 8px;
273
+ padding: 10px;
274
+ text-align: center;
275
+ }
276
+ .metric-value {
277
+ font-size: 20px;
278
+ font-weight: bold;
279
+ color: #4ECDC4;
280
+ }
281
+ .metric-label {
282
+ font-size: 10px;
283
+ color: #888;
284
+ text-transform: uppercase;
285
+ }
286
+ .pipeline-bar {
287
+ display: flex;
288
+ justify-content: center;
289
+ gap: 5px;
290
+ padding: 10px;
291
+ background: #0d1117;
292
+ border-radius: 8px;
293
+ margin: 10px 0;
294
+ }
295
+ .pipeline-step {
296
+ padding: 5px 12px;
297
+ border-radius: 15px;
298
+ font-size: 11px;
299
+ background: #21262d;
300
+ color: #8b949e;
301
+ }
302
+ .pipeline-step.active {
303
+ background: linear-gradient(90deg, #4ECDC4, #44a08d);
304
+ color: white;
305
+ }
306
+ .pipeline-step.done {
307
+ background: #238636;
308
+ color: white;
309
+ }
310
+ .doc-filter-card {
311
+ background: #161b22;
312
+ border-radius: 8px;
313
+ padding: 10px;
314
+ margin: 5px 0;
315
+ border: 1px solid #30363d;
316
+ }
317
+ .doc-filter-card.selected {
318
+ border-color: #4ECDC4;
319
+ }
320
+ .chunk-preview {
321
+ background: #0d1117;
322
+ border-radius: 6px;
323
+ padding: 8px;
324
+ margin: 4px 0;
325
+ font-size: 12px;
326
+ font-family: monospace;
327
+ max-height: 100px;
328
+ overflow-y: auto;
329
+ }
330
+ </style>
331
+ """, unsafe_allow_html=True)
332
+
333
+
334
+ def get_chunk_color(index: int) -> str:
335
+ """Get distinct color for citations."""
336
+ colors = [
337
+ "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
338
+ "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
339
+ ]
340
+ return colors[index % len(colors)]
341
+
342
+
343
+ # Initialize state manager
344
+ state_manager = get_state_manager()
345
+
346
+ # Get system status
347
+ rag_system = get_unified_rag_system()
348
+ ollama_ok, models = check_ollama()
349
+ stats = get_store_stats()
350
+ indexed_docs = get_indexed_documents()
351
+
352
+ # Session state
353
+ if "messages" not in st.session_state:
354
+ st.session_state.messages = []
355
+ if "quick_indexed" not in st.session_state:
356
+ st.session_state.quick_indexed = []
357
+ if "doc_filter" not in st.session_state:
358
+ st.session_state.doc_filter = None # None = all documents
359
+
360
+
361
+ # Header
362
+ st.markdown("# 💬 Interactive RAG Chat")
363
+ st.markdown("Ask questions about your indexed documents with multi-agent pipeline")
364
+
365
+ # Global status bar
366
+ render_global_status_bar()
367
+
368
+ # Pipeline indicator
369
+ st.markdown("""
370
+ <div class="pipeline-bar">
371
+ <span class="pipeline-step">📝 Query</span>
372
+ <span>→</span>
373
+ <span class="pipeline-step">🎯 Plan</span>
374
+ <span>→</span>
375
+ <span class="pipeline-step">🔍 Retrieve</span>
376
+ <span>→</span>
377
+ <span class="pipeline-step">📊 Rerank</span>
378
+ <span>→</span>
379
+ <span class="pipeline-step">💬 Generate</span>
380
+ <span>→</span>
381
+ <span class="pipeline-step">✅ Validate</span>
382
+ </div>
383
+ """, unsafe_allow_html=True)
384
+
385
+ # Status bar
386
+ cols = st.columns(5)
387
+ with cols[0]:
388
+ if ollama_ok:
389
+ st.success(f"Ollama ({len(models)})")
390
+ else:
391
+ st.error("Ollama Offline")
392
+ with cols[1]:
393
+ if rag_system["status"] == "ready":
394
+ st.success("RAG Ready")
395
+ else:
396
+ st.error("RAG Error")
397
+ with cols[2]:
398
+ st.info(f"{rag_system.get('llm_model', 'N/A').split(':')[0]}")
399
+ with cols[3]:
400
+ chunk_count = stats.get('total_chunks', 0)
401
+ if chunk_count > 0:
402
+ st.success(f"{chunk_count} Chunks")
403
+ else:
404
+ st.warning("0 Chunks")
405
+ with cols[4]:
406
+ st.info(f"{rag_system.get('embed_model', 'N/A').split(':')[0]}")
407
+
408
+ if rag_system["status"] == "error":
409
+ with st.expander("RAG Error Details"):
410
+ st.code(rag_system["error"])
411
+
412
+ st.markdown("---")
413
+
414
+ # Sidebar
415
+ with st.sidebar:
416
+ st.markdown("## 📚 Document Filter")
417
+
418
+ if indexed_docs:
419
+ st.markdown(f"**{len(indexed_docs)} documents indexed**")
420
+
421
+ # All documents option
422
+ if st.button(
423
+ "All Documents",
424
+ key="filter_all",
425
+ type="primary" if st.session_state.doc_filter is None else "secondary",
426
+ use_container_width=True
427
+ ):
428
+ st.session_state.doc_filter = None
429
+ st.rerun()
430
+
431
+ st.markdown("---")
432
+ st.markdown("**Filter by document:**")
433
+
434
+ # Document list
435
+ for doc in indexed_docs[:10]:
436
+ doc_id = doc.get("document_id", "unknown")
437
+ chunk_count = doc.get("chunk_count", 0)
438
+ is_selected = st.session_state.doc_filter == doc_id
439
+
440
+ if st.button(
441
+ f"📄 {doc_id[:20]}... ({chunk_count})",
442
+ key=f"filter_{doc_id}",
443
+ type="primary" if is_selected else "secondary",
444
+ use_container_width=True
445
+ ):
446
+ st.session_state.doc_filter = doc_id
447
+ st.rerun()
448
+
449
+ if len(indexed_docs) > 10:
450
+ st.caption(f"... and {len(indexed_docs) - 10} more")
451
+
452
+ # Show selected filter
453
+ if st.session_state.doc_filter:
454
+ st.markdown("---")
455
+ st.info(f"Filtering: {st.session_state.doc_filter[:25]}...")
456
+ if st.button("Clear Filter"):
457
+ st.session_state.doc_filter = None
458
+ st.rerun()
459
+ else:
460
+ st.info("No documents indexed yet")
461
+
462
+ st.markdown("---")
463
+ st.markdown("## 📤 Quick Index")
464
+ st.caption("Index text directly without leaving this page")
465
+
466
+ quick_text = st.text_area("Paste text:", height=120, key="quick_text",
467
+ placeholder="Paste document text here...")
468
+ quick_name = st.text_input("Name:", value="quick_doc", key="quick_name")
469
+
470
+ if st.button("📥 Index Now", type="primary", use_container_width=True,
471
+ disabled=(rag_system["status"] != "ready")):
472
+ if quick_text.strip():
473
+ with st.spinner("Indexing..."):
474
+ doc_id = f"{quick_name}_{hashlib.md5(quick_text[:50].encode()).hexdigest()[:8]}"
475
+ result = index_document(
476
+ text=quick_text,
477
+ document_id=doc_id,
478
+ metadata={"filename": quick_name, "source": "quick_index"}
479
+ )
480
+ if result["success"]:
481
+ st.session_state.quick_indexed.append(quick_name)
482
+ st.success(f"{result['num_chunks']} chunks indexed!")
483
+ st.rerun()
484
+ else:
485
+ st.error(f"Error: {result['error']}")
486
+ else:
487
+ st.warning("Enter some text first")
488
+
489
+ # Recently indexed
490
+ if st.session_state.quick_indexed:
491
+ st.markdown("---")
492
+ st.markdown("### Recently Indexed")
493
+ for doc in st.session_state.quick_indexed[-5:]:
494
+ st.caption(f"• {doc}")
495
+
496
+ st.markdown("---")
497
+ st.markdown("### Options")
498
+
499
+ show_sources = st.checkbox("Show sources", value=True)
500
+ show_metrics = st.checkbox("Show metrics", value=True)
501
+ show_chunk_preview = st.checkbox("Show chunk preview", value=False)
502
+
503
+ if st.button("Clear Chat"):
504
+ st.session_state.messages = []
505
+ st.rerun()
506
+
507
+ # Main chat area
508
+ if stats.get('total_chunks', 0) == 0:
509
+ st.warning("No documents indexed yet!")
510
+ st.markdown("""
511
+ **To get started:**
512
+ 1. Use the **Quick Index** in the sidebar to paste and index text
513
+ 2. Or go to **🔬 Live Processing** page to upload and process documents
514
+
515
+ Once you've indexed some content, come back here to ask questions!
516
+ """)
517
+
518
+ # Sample text for quick start
519
+ with st.expander("Try with sample text"):
520
+ sample = """SPARKNET is a multi-agentic document intelligence framework.
521
+ It uses RAG (Retrieval-Augmented Generation) for document Q&A.
522
+
523
+ Key features:
524
+ - PDF, TXT, MD document processing
525
+ - Visual chunk segmentation
526
+ - Hybrid retrieval (dense + sparse)
527
+ - Cross-encoder reranking
528
+ - Grounded answer generation with citations
529
+ - Hallucination detection and validation
530
+
531
+ The system uses multiple specialized agents:
532
+ 1. Query Planner - analyzes and decomposes queries
533
+ 2. Retriever - performs hybrid search
534
+ 3. Reranker - scores relevance with cross-encoder
535
+ 4. Synthesizer - generates grounded answers
536
+ 5. Critic - validates for hallucination"""
537
+
538
+ st.code(sample, language=None)
539
+ if st.button("Index This Sample"):
540
+ result = index_document(
541
+ text=sample,
542
+ document_id="sparknet_sample",
543
+ metadata={"filename": "sparknet_sample", "source": "sample"}
544
+ )
545
+ if result["success"]:
546
+ st.success(f"Indexed {result['num_chunks']} chunks!")
547
+ st.rerun()
548
+
549
+ # Navigation
550
+ col1, col2 = st.columns(2)
551
+ with col1:
552
+ if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
553
+ st.switch_page("pages/1_🔬_Live_Processing.py")
554
+ with col2:
555
+ if st.button("📄 Go to Document Viewer", use_container_width=True):
556
+ st.switch_page("pages/5_📄_Document_Viewer.py")
557
+
558
+ else:
559
+ # Check if we need to process a pending user message (from sample question click)
560
+ pending_query = None
561
+ if st.session_state.messages and st.session_state.messages[-1]["role"] == "user":
562
+ # Check if there's no assistant response after the last user message
563
+ pending_query = st.session_state.messages[-1]["content"]
564
+
565
+ # Display chat history (except pending query which we'll process below)
566
+ messages_to_display = st.session_state.messages[:-1] if pending_query else st.session_state.messages
567
+ for msg in messages_to_display:
568
+ with st.chat_message(msg["role"]):
569
+ st.markdown(msg["content"])
570
+
571
+ if msg["role"] == "assistant" and "metadata" in msg:
572
+ meta = msg["metadata"]
573
+
574
+ # Metrics
575
+ if show_metrics and meta:
576
+ m_cols = st.columns(4)
577
+ with m_cols[0]:
578
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{meta.get("latency_ms", 0):.0f}ms</div><div class="metric-label">Latency</div></div>', unsafe_allow_html=True)
579
+ with m_cols[1]:
580
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{meta.get("num_sources", 0)}</div><div class="metric-label">Sources</div></div>', unsafe_allow_html=True)
581
+ with m_cols[2]:
582
+ conf = meta.get("confidence", 0)
583
+ color = "#4ECDC4" if conf > 0.6 else "#ffc107" if conf > 0.3 else "#dc3545"
584
+ st.markdown(f'<div class="metric-box"><div class="metric-value" style="color:{color}">{conf:.0%}</div><div class="metric-label">Confidence</div></div>', unsafe_allow_html=True)
585
+ with m_cols[3]:
586
+ val = "✓" if meta.get("validated") else "?"
587
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{val}</div><div class="metric-label">Validated</div></div>', unsafe_allow_html=True)
588
+
589
+ # Sources
590
+ if show_sources and "citations" in msg and msg["citations"]:
591
+ with st.expander(f"Sources ({len(msg['citations'])})"):
592
+ for i, cite in enumerate(msg["citations"]):
593
+ color = get_chunk_color(i)
594
+ st.markdown(f"""
595
+ <div class="source-card" style="border-left-color: {color};">
596
+ <div class="source-header">
597
+ <strong>[{cite.get('index', i+1)}]</strong> • Relevance: {cite.get('relevance_score', 0):.0%}
598
+ </div>
599
+ <div class="source-text">{cite.get('text_snippet', '')[:300]}...</div>
600
+ </div>
601
+ """, unsafe_allow_html=True)
602
+
603
+ # Show current filter
604
+ if st.session_state.doc_filter:
605
+ st.info(f"Searching in: **{st.session_state.doc_filter}** — [Clear filter in sidebar]")
606
+
607
+ # Process pending query from sample question click
608
+ if pending_query:
609
+ with st.chat_message("user"):
610
+ st.markdown(pending_query)
611
+
612
+ with st.chat_message("assistant"):
613
+ if rag_system["status"] != "ready":
614
+ st.error("RAG system not ready")
615
+ st.session_state.messages.append({"role": "assistant", "content": "RAG system not ready"})
616
+ else:
617
+ # Show progress
618
+ progress = st.progress(0)
619
+ status = st.empty()
620
+
621
+ stages = ["Planning", "Retrieving", "Reranking", "Generating", "Validating"]
622
+ for i, stage in enumerate(stages):
623
+ status.markdown(f"**{stage}...**")
624
+ progress.progress((i + 1) * 20)
625
+ time.sleep(0.15)
626
+
627
+ # Build filters for document
628
+ filters = None
629
+ if st.session_state.doc_filter:
630
+ filters = {"document_id": st.session_state.doc_filter}
631
+
632
+ # Query RAG
633
+ response, error = query_rag(pending_query, filters=filters)
634
+
635
+ progress.empty()
636
+ status.empty()
637
+
638
+ if error:
639
+ st.error(f"Error: {error}")
640
+ st.session_state.messages.append({"role": "assistant", "content": f"Error: {error}"})
641
+ elif response:
642
+ # Display answer
643
+ st.markdown(response.answer)
644
+
645
+ # Build metadata
646
+ metadata = {
647
+ "latency_ms": response.latency_ms,
648
+ "num_sources": response.num_sources,
649
+ "confidence": response.confidence,
650
+ "validated": response.validated,
651
+ }
652
+
653
+ # Display metrics
654
+ if show_metrics:
655
+ m_cols = st.columns(4)
656
+ with m_cols[0]:
657
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{metadata.get("latency_ms", 0):.0f}ms</div><div class="metric-label">Latency</div></div>', unsafe_allow_html=True)
658
+ with m_cols[1]:
659
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{metadata.get("num_sources", 0)}</div><div class="metric-label">Sources</div></div>', unsafe_allow_html=True)
660
+ with m_cols[2]:
661
+ conf = metadata.get("confidence", 0)
662
+ color = "#4ECDC4" if conf > 0.6 else "#ffc107" if conf > 0.3 else "#dc3545"
663
+ st.markdown(f'<div class="metric-box"><div class="metric-value" style="color:{color}">{conf:.0%}</div><div class="metric-label">Confidence</div></div>', unsafe_allow_html=True)
664
+ with m_cols[3]:
665
+ val = "✓" if metadata.get("validated") else "?"
666
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{val}</div><div class="metric-label">Validated</div></div>', unsafe_allow_html=True)
667
+
668
+ # Build citations list
669
+ citations = []
670
+ if hasattr(response, 'citations') and response.citations:
671
+ for i, cite in enumerate(response.citations):
672
+ citations.append({
673
+ "index": i + 1,
674
+ "text_snippet": cite.text_snippet if hasattr(cite, 'text_snippet') else str(cite),
675
+ "relevance_score": cite.relevance_score if hasattr(cite, 'relevance_score') else 0.0,
676
+ })
677
+
678
+ # Store message with metadata
679
+ st.session_state.messages.append({
680
+ "role": "assistant",
681
+ "content": response.answer,
682
+ "metadata": metadata,
683
+ "citations": citations,
684
+ })
685
+ else:
686
+ st.warning("No response from RAG system")
687
+ st.session_state.messages.append({"role": "assistant", "content": "No response from RAG system"})
688
+
689
+ # Chat input
690
+ if prompt := st.chat_input("Ask about your documents..."):
691
+ # Add user message
692
+ st.session_state.messages.append({"role": "user", "content": prompt})
693
+
694
+ with st.chat_message("user"):
695
+ st.markdown(prompt)
696
+
697
+ with st.chat_message("assistant"):
698
+ if rag_system["status"] != "ready":
699
+ st.error("RAG system not ready")
700
+ st.session_state.messages.append({"role": "assistant", "content": "RAG system not ready"})
701
+ else:
702
+ # Show progress
703
+ progress = st.progress(0)
704
+ status = st.empty()
705
+
706
+ stages = ["Planning", "Retrieving", "Reranking", "Generating", "Validating"]
707
+ for i, stage in enumerate(stages):
708
+ status.markdown(f"**{stage}...**")
709
+ progress.progress((i + 1) * 20)
710
+ time.sleep(0.15)
711
+
712
+ # Build filters for document
713
+ filters = None
714
+ if st.session_state.doc_filter:
715
+ filters = {"document_id": st.session_state.doc_filter}
716
+
717
+ # Query RAG
718
+ response, error = query_rag(prompt, filters=filters)
719
+
720
+ progress.empty()
721
+ status.empty()
722
+
723
+ if error:
724
+ st.error(f"Error: {error}")
725
+ st.session_state.messages.append({"role": "assistant", "content": f"Error: {error}"})
726
+ elif response:
727
+ # Display answer
728
+ st.markdown(response.answer)
729
+
730
+ # Build metadata
731
+ metadata = {
732
+ "latency_ms": response.latency_ms,
733
+ "num_sources": response.num_sources,
734
+ "confidence": response.confidence,
735
+ "validated": response.validated,
736
+ }
737
+
738
+ # Display metrics
739
+ if show_metrics:
740
+ m_cols = st.columns(4)
741
+ with m_cols[0]:
742
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{response.latency_ms:.0f}ms</div><div class="metric-label">Latency</div></div>', unsafe_allow_html=True)
743
+ with m_cols[1]:
744
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{response.num_sources}</div><div class="metric-label">Sources</div></div>', unsafe_allow_html=True)
745
+ with m_cols[2]:
746
+ conf_color = "#4ECDC4" if response.confidence > 0.6 else "#ffc107" if response.confidence > 0.3 else "#dc3545"
747
+ st.markdown(f'<div class="metric-box"><div class="metric-value" style="color:{conf_color}">{response.confidence:.0%}</div><div class="metric-label">Confidence</div></div>', unsafe_allow_html=True)
748
+ with m_cols[3]:
749
+ val_icon = "✓" if response.validated else "?"
750
+ st.markdown(f'<div class="metric-box"><div class="metric-value">{val_icon}</div><div class="metric-label">Validated</div></div>', unsafe_allow_html=True)
751
+
752
+ # Display sources
753
+ citations = []
754
+ if show_sources and response.citations:
755
+ with st.expander(f"Sources ({len(response.citations)})"):
756
+ for i, cite in enumerate(response.citations):
757
+ color = get_chunk_color(i)
758
+ citations.append({
759
+ "index": cite.index,
760
+ "relevance_score": cite.relevance_score,
761
+ "text_snippet": cite.text_snippet,
762
+ })
763
+ st.markdown(f"""
764
+ <div class="source-card" style="border-left-color: {color};">
765
+ <div class="source-header">
766
+ <strong>[{cite.index}]</strong> • Relevance: {cite.relevance_score:.0%}
767
+ </div>
768
+ <div class="source-text">{cite.text_snippet[:300]}...</div>
769
+ </div>
770
+ """, unsafe_allow_html=True)
771
+
772
+ # Chunk preview (semantic search)
773
+ if show_chunk_preview:
774
+ with st.expander("Chunk Preview (Top Matches)"):
775
+ chunks = search_similar_chunks(
776
+ prompt,
777
+ top_k=5,
778
+ doc_filter=st.session_state.doc_filter
779
+ )
780
+ for i, chunk in enumerate(chunks):
781
+ sim = chunk.get("similarity", 0)
782
+ color = "#4ECDC4" if sim > 0.7 else "#ffc107" if sim > 0.5 else "#8b949e"
783
+ st.markdown(f"""
784
+ <div class="chunk-preview" style="border-left: 3px solid {color};">
785
+ <div style="font-size: 10px; color: #8b949e;">
786
+ Similarity: <span style="color: {color};">{sim:.0%}</span> |
787
+ Doc: {chunk.get('document_id', 'N/A')[:15]}...
788
+ </div>
789
+ <div style="margin-top: 4px;">{chunk.get('text', '')[:200]}...</div>
790
+ </div>
791
+ """, unsafe_allow_html=True)
792
+
793
+ # Save to history
794
+ st.session_state.messages.append({
795
+ "role": "assistant",
796
+ "content": response.answer,
797
+ "citations": citations,
798
+ "metadata": metadata,
799
+ })
800
+
801
+ # Dynamic suggested questions based on document content
802
+ st.markdown("---")
803
+ st.markdown("### 💡 Try asking")
804
+
805
+ # Get indexed documents for question generation
806
+ indexed_docs = get_indexed_documents()
807
+ state_manager = get_state_manager()
808
+
809
+ # Generate dynamic questions based on document content
810
+ dynamic_questions = generate_dynamic_questions(state_manager, indexed_docs, max_questions=4)
811
+
812
+ # Display as clickable buttons
813
+ sample_cols = st.columns(len(dynamic_questions))
814
+ for i, q in enumerate(dynamic_questions):
815
+ with sample_cols[i]:
816
+ # Truncate long questions for button display
817
+ display_q = q if len(q) <= 35 else q[:32] + "..."
818
+ if st.button(display_q, key=f"sample_{i}", use_container_width=True,
819
+ disabled=(stats.get('total_chunks', 0) == 0),
820
+ help=q if len(q) > 35 else None):
821
+ st.session_state.messages.append({"role": "user", "content": q})
822
+ st.rerun()
823
+
824
+ # Show hint about dynamic questions
825
+ if stats.get('total_chunks', 0) > 0:
826
+ st.caption("📌 Questions are generated based on your indexed documents")
827
+
828
+ # Architecture info
829
+ with st.expander("Multi-Agent RAG Architecture"):
830
+ st.markdown("""
831
+ ```
832
+ Query → [Query Planner] → [Retriever] → [Reranker] → [Synthesizer] → [Critic] → Answer
833
+ ↓ ↓ ↓ ↓ ↓
834
+ Decompose Dense+Sparse Cross-Encoder Grounded Hallucination
835
+ & Expand + RRF Fusion Scoring Citations Detection
836
+ ```
837
+
838
+ **Agents:**
839
+ - **Query Planner**: Analyzes intent, decomposes complex queries, expands terms
840
+ - **Retriever**: Hybrid search combining dense (embedding) and sparse (BM25) retrieval
841
+ - **Reranker**: Cross-encoder scoring for precision, diversity via MMR
842
+ - **Synthesizer**: Generates grounded answers with proper citations
843
+ - **Critic**: Validates for hallucination, checks citation accuracy
844
+ """)
demo/pages/3_📊_Document_Comparison.py ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Comparison - SPARKNET
3
+
4
+ Compare documents using semantic similarity, structure analysis,
5
+ and content comparison with real embedding-based similarity.
6
+ """
7
+
8
+ import streamlit as st
9
+ import sys
10
+ from pathlib import Path
11
+ import pandas as pd
12
+
13
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
14
+ sys.path.insert(0, str(PROJECT_ROOT))
15
+ sys.path.insert(0, str(PROJECT_ROOT / "demo"))
16
+
17
+ from state_manager import (
18
+ get_state_manager,
19
+ render_global_status_bar,
20
+ )
21
+ from rag_config import (
22
+ get_indexed_documents,
23
+ compute_document_similarity,
24
+ search_similar_chunks,
25
+ check_ollama,
26
+ get_unified_rag_system,
27
+ )
28
+
29
+ st.set_page_config(page_title="Document Comparison - SPARKNET", page_icon="📊", layout="wide")
30
+
31
+ # Custom CSS
32
+ st.markdown("""
33
+ <style>
34
+ .comparison-card {
35
+ background: #161b22;
36
+ border-radius: 10px;
37
+ padding: 15px;
38
+ margin: 10px 0;
39
+ border: 1px solid #30363d;
40
+ }
41
+ .doc-header {
42
+ font-size: 16px;
43
+ font-weight: bold;
44
+ color: #4ECDC4;
45
+ margin-bottom: 10px;
46
+ }
47
+ .similarity-badge {
48
+ display: inline-block;
49
+ padding: 8px 16px;
50
+ border-radius: 20px;
51
+ font-weight: bold;
52
+ font-size: 18px;
53
+ }
54
+ .sim-high {
55
+ background: linear-gradient(90deg, #4ECDC4 0%, #44a08d 100%);
56
+ color: white;
57
+ }
58
+ .sim-med {
59
+ background: linear-gradient(90deg, #ffc107 0%, #ff8800 100%);
60
+ color: black;
61
+ }
62
+ .sim-low {
63
+ background: linear-gradient(90deg, #dc3545 0%, #c82333 100%);
64
+ color: white;
65
+ }
66
+ .chunk-match {
67
+ background: #0d1117;
68
+ border-radius: 8px;
69
+ padding: 10px;
70
+ margin: 8px 0;
71
+ border-left: 4px solid;
72
+ }
73
+ .diff-added {
74
+ background: rgba(78, 205, 196, 0.1);
75
+ border-left-color: #4ECDC4;
76
+ }
77
+ .diff-removed {
78
+ background: rgba(220, 53, 69, 0.1);
79
+ border-left-color: #dc3545;
80
+ }
81
+ .diff-common {
82
+ background: rgba(139, 148, 158, 0.1);
83
+ border-left-color: #8b949e;
84
+ }
85
+ .metric-card {
86
+ background: #161b22;
87
+ border-radius: 8px;
88
+ padding: 15px;
89
+ text-align: center;
90
+ }
91
+ .metric-value {
92
+ font-size: 32px;
93
+ font-weight: bold;
94
+ }
95
+ .metric-label {
96
+ font-size: 11px;
97
+ color: #8b949e;
98
+ text-transform: uppercase;
99
+ }
100
+ </style>
101
+ """, unsafe_allow_html=True)
102
+
103
+
104
+ def get_similarity_class(sim: float) -> str:
105
+ """Get CSS class based on similarity."""
106
+ if sim >= 0.7:
107
+ return "sim-high"
108
+ elif sim >= 0.4:
109
+ return "sim-med"
110
+ return "sim-low"
111
+
112
+
113
+ def get_similarity_color(sim: float) -> str:
114
+ """Get color based on similarity."""
115
+ if sim >= 0.7:
116
+ return "#4ECDC4"
117
+ elif sim >= 0.4:
118
+ return "#ffc107"
119
+ return "#dc3545"
120
+
121
+
122
+ # Initialize state manager
123
+ state_manager = get_state_manager()
124
+ rag_system = get_unified_rag_system()
125
+
126
+ # Header
127
+ st.markdown("# 📊 Document Comparison")
128
+ st.markdown("Compare documents using semantic similarity, structure analysis, and content comparison")
129
+
130
+ # Global status bar
131
+ render_global_status_bar()
132
+
133
+ st.markdown("---")
134
+
135
+ # Get documents
136
+ all_docs = state_manager.get_all_documents()
137
+ indexed_docs = get_indexed_documents()
138
+
139
+ if not all_docs and not indexed_docs:
140
+ st.warning("No documents available for comparison")
141
+ st.markdown("""
142
+ ### Getting Started
143
+
144
+ To compare documents:
145
+ 1. Go to **Live Processing** to upload and process documents
146
+ 2. Process at least 2 documents
147
+ 3. Come back here to compare them
148
+
149
+ Features:
150
+ - **Semantic Similarity**: Compare documents using embedding-based similarity
151
+ - **Structure Analysis**: Compare document structure (pages, chunks, regions)
152
+ - **Content Comparison**: Find similar passages between documents
153
+ """)
154
+
155
+ if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
156
+ st.switch_page("pages/1_🔬_Live_Processing.py")
157
+
158
+ else:
159
+ # Build document options
160
+ doc_options = {}
161
+ for doc in all_docs:
162
+ doc_options[f"{doc.filename} (State)"] = {"id": doc.doc_id, "source": "state", "doc": doc}
163
+ for doc in indexed_docs:
164
+ doc_id = doc.get("document_id", "unknown")
165
+ if doc_id not in [d["id"] for d in doc_options.values()]:
166
+ doc_options[f"{doc_id} (RAG)"] = {"id": doc_id, "source": "rag", "doc": doc}
167
+
168
+ if len(doc_options) < 2:
169
+ st.warning("Need at least 2 documents for comparison. Process more documents first.")
170
+ else:
171
+ # Document selection
172
+ st.markdown("### Select Documents to Compare")
173
+
174
+ col1, col2 = st.columns(2)
175
+ with col1:
176
+ doc1_name = st.selectbox("Document 1", list(doc_options.keys()), index=0)
177
+ with col2:
178
+ remaining = [k for k in doc_options.keys() if k != doc1_name]
179
+ doc2_name = st.selectbox("Document 2", remaining, index=0 if remaining else None)
180
+
181
+ doc1_info = doc_options.get(doc1_name)
182
+ doc2_info = doc_options.get(doc2_name)
183
+
184
+ # Comparison type
185
+ comparison_type = st.radio(
186
+ "Comparison Type",
187
+ ["Semantic Similarity", "Structure Analysis", "Content Comparison"],
188
+ horizontal=True,
189
+ )
190
+
191
+ if st.button("🔍 Compare Documents", type="primary", use_container_width=True):
192
+ st.markdown("---")
193
+
194
+ if comparison_type == "Semantic Similarity":
195
+ st.markdown("### Semantic Similarity Analysis")
196
+
197
+ with st.spinner("Computing document embeddings and similarity..."):
198
+ # Use the compute_document_similarity function from rag_config
199
+ if rag_system["status"] == "ready":
200
+ result = compute_document_similarity(doc1_info["id"], doc2_info["id"])
201
+
202
+ if result.get("error"):
203
+ st.warning(f"Could not compute similarity: {result['error']}")
204
+ # Use fallback based on text overlap
205
+ if doc1_info["source"] == "state" and doc2_info["source"] == "state":
206
+ doc1 = doc1_info["doc"]
207
+ doc2 = doc2_info["doc"]
208
+ # Simple word overlap
209
+ words1 = set(doc1.raw_text.lower().split())
210
+ words2 = set(doc2.raw_text.lower().split())
211
+ overlap = len(words1 & words2) / max(len(words1 | words2), 1)
212
+ similarity = overlap
213
+ else:
214
+ similarity = 0.5 # Default fallback
215
+ else:
216
+ similarity = result.get("similarity", 0)
217
+ else:
218
+ st.error("RAG system not ready for similarity computation")
219
+ similarity = 0.5
220
+
221
+ # Display similarity score
222
+ sim_class = get_similarity_class(similarity)
223
+ sim_color = get_similarity_color(similarity)
224
+
225
+ st.markdown(f"""
226
+ <div style="text-align: center; padding: 30px;">
227
+ <div class="similarity-badge {sim_class}">
228
+ {similarity:.0%} Similarity
229
+ </div>
230
+ <p style="color: #8b949e; margin-top: 15px;">
231
+ Based on embedding-based semantic similarity
232
+ </p>
233
+ </div>
234
+ """, unsafe_allow_html=True)
235
+
236
+ # Similarity interpretation
237
+ if similarity >= 0.7:
238
+ st.success("These documents are highly similar in content and meaning.")
239
+ elif similarity >= 0.4:
240
+ st.warning("These documents have moderate similarity - some shared topics.")
241
+ else:
242
+ st.info("These documents are quite different in content.")
243
+
244
+ # Document details
245
+ col1, col2 = st.columns(2)
246
+
247
+ with col1:
248
+ st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
249
+ if doc1_info["source"] == "state":
250
+ doc = doc1_info["doc"]
251
+ st.metric("Pages", doc.page_count)
252
+ st.metric("Chunks", len(doc.chunks))
253
+ st.metric("Characters", f"{len(doc.raw_text):,}")
254
+ else:
255
+ doc = doc1_info["doc"]
256
+ st.metric("Chunks", doc.get("chunk_count", "N/A"))
257
+
258
+ with col2:
259
+ st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
260
+ if doc2_info["source"] == "state":
261
+ doc = doc2_info["doc"]
262
+ st.metric("Pages", doc.page_count)
263
+ st.metric("Chunks", len(doc.chunks))
264
+ st.metric("Characters", f"{len(doc.raw_text):,}")
265
+ else:
266
+ doc = doc2_info["doc"]
267
+ st.metric("Chunks", doc.get("chunk_count", "N/A"))
268
+
269
+ elif comparison_type == "Structure Analysis":
270
+ st.markdown("### Document Structure Comparison")
271
+
272
+ col1, col2 = st.columns(2)
273
+
274
+ # Get structure data
275
+ def get_structure(info):
276
+ if info["source"] == "state":
277
+ doc = info["doc"]
278
+ return {
279
+ "Pages": doc.page_count,
280
+ "Chunks": len(doc.chunks),
281
+ "OCR Regions": len(doc.ocr_regions),
282
+ "Layout Regions": len(doc.layout_data.get("regions", [])),
283
+ "Characters": len(doc.raw_text),
284
+ "Words": len(doc.raw_text.split()),
285
+ }
286
+ else:
287
+ doc = info["doc"]
288
+ return {
289
+ "Chunks": doc.get("chunk_count", 0),
290
+ "Source": doc.get("source_path", "N/A"),
291
+ }
292
+
293
+ struct1 = get_structure(doc1_info)
294
+ struct2 = get_structure(doc2_info)
295
+
296
+ with col1:
297
+ st.markdown(f"#### 📄 {doc1_name.split(' (')[0]}")
298
+ for key, value in struct1.items():
299
+ if isinstance(value, int) and value > 1000:
300
+ st.metric(key, f"{value:,}")
301
+ else:
302
+ st.metric(key, value)
303
+
304
+ with col2:
305
+ st.markdown(f"#### 📄 {doc2_name.split(' (')[0]}")
306
+ for key, value in struct2.items():
307
+ if isinstance(value, int) and value > 1000:
308
+ st.metric(key, f"{value:,}")
309
+ else:
310
+ st.metric(key, value)
311
+
312
+ # Structure comparison chart
313
+ st.markdown("---")
314
+ st.markdown("### Comparison Chart")
315
+
316
+ common_keys = [k for k in struct1.keys() if k in struct2 and isinstance(struct1[k], (int, float))]
317
+ if common_keys:
318
+ comparison_df = pd.DataFrame({
319
+ "Metric": common_keys,
320
+ doc1_name.split(' (')[0]: [struct1[k] for k in common_keys],
321
+ doc2_name.split(' (')[0]: [struct2[k] for k in common_keys],
322
+ })
323
+ st.bar_chart(comparison_df.set_index("Metric"))
324
+
325
+ # Chunk type comparison (if available)
326
+ if doc1_info["source"] == "state" and doc2_info["source"] == "state":
327
+ st.markdown("---")
328
+ st.markdown("### Chunk Type Distribution")
329
+
330
+ def get_chunk_types(doc):
331
+ types = {}
332
+ for chunk in doc.chunks:
333
+ t = chunk.get("chunk_type", "unknown")
334
+ types[t] = types.get(t, 0) + 1
335
+ return types
336
+
337
+ types1 = get_chunk_types(doc1_info["doc"])
338
+ types2 = get_chunk_types(doc2_info["doc"])
339
+
340
+ all_types = set(types1.keys()) | set(types2.keys())
341
+
342
+ type_df = pd.DataFrame({
343
+ "Type": list(all_types),
344
+ doc1_name.split(' (')[0]: [types1.get(t, 0) for t in all_types],
345
+ doc2_name.split(' (')[0]: [types2.get(t, 0) for t in all_types],
346
+ })
347
+ st.dataframe(type_df, width='stretch', hide_index=True)
348
+
349
+ else: # Content Comparison
350
+ st.markdown("### Content Comparison")
351
+
352
+ if doc1_info["source"] == "state" and doc2_info["source"] == "state":
353
+ doc1 = doc1_info["doc"]
354
+ doc2 = doc2_info["doc"]
355
+
356
+ # Word overlap analysis
357
+ words1 = set(doc1.raw_text.lower().split())
358
+ words2 = set(doc2.raw_text.lower().split())
359
+
360
+ common_words = words1 & words2
361
+ only_doc1 = words1 - words2
362
+ only_doc2 = words2 - words1
363
+
364
+ # Metrics
365
+ metric_cols = st.columns(4)
366
+ metric_cols[0].markdown(f"""
367
+ <div class="metric-card">
368
+ <div class="metric-value" style="color: #4ECDC4;">{len(common_words):,}</div>
369
+ <div class="metric-label">Common Words</div>
370
+ </div>
371
+ """, unsafe_allow_html=True)
372
+ metric_cols[1].markdown(f"""
373
+ <div class="metric-card">
374
+ <div class="metric-value" style="color: #FF6B6B;">{len(only_doc1):,}</div>
375
+ <div class="metric-label">Only in Doc 1</div>
376
+ </div>
377
+ """, unsafe_allow_html=True)
378
+ metric_cols[2].markdown(f"""
379
+ <div class="metric-card">
380
+ <div class="metric-value" style="color: #45B7D1;">{len(only_doc2):,}</div>
381
+ <div class="metric-label">Only in Doc 2</div>
382
+ </div>
383
+ """, unsafe_allow_html=True)
384
+
385
+ overlap_pct = len(common_words) / max(len(words1 | words2), 1)
386
+ metric_cols[3].markdown(f"""
387
+ <div class="metric-card">
388
+ <div class="metric-value" style="color: #ffc107;">{overlap_pct:.0%}</div>
389
+ <div class="metric-label">Word Overlap</div>
390
+ </div>
391
+ """, unsafe_allow_html=True)
392
+
393
+ # Similar passages
394
+ st.markdown("---")
395
+ st.markdown("### Similar Passages")
396
+
397
+ # Find similar chunks between documents
398
+ with st.spinner("Finding similar passages..."):
399
+ similar_passages = []
400
+
401
+ # Compare first 10 chunks from doc1 against doc2
402
+ for i, chunk1 in enumerate(doc1.chunks[:10]):
403
+ text1 = chunk1.get("text", "")
404
+ words_c1 = set(text1.lower().split())
405
+
406
+ best_match = None
407
+ best_score = 0
408
+
409
+ for j, chunk2 in enumerate(doc2.chunks):
410
+ text2 = chunk2.get("text", "")
411
+ words_c2 = set(text2.lower().split())
412
+
413
+ # Jaccard similarity
414
+ if words_c1 and words_c2:
415
+ score = len(words_c1 & words_c2) / len(words_c1 | words_c2)
416
+ if score > best_score and score > 0.3:
417
+ best_score = score
418
+ best_match = {
419
+ "doc1_chunk": i,
420
+ "doc2_chunk": j,
421
+ "doc1_text": text1[:200],
422
+ "doc2_text": text2[:200],
423
+ "similarity": score,
424
+ }
425
+
426
+ if best_match:
427
+ similar_passages.append(best_match)
428
+
429
+ if similar_passages:
430
+ # Sort by similarity
431
+ similar_passages.sort(key=lambda x: x["similarity"], reverse=True)
432
+
433
+ for i, match in enumerate(similar_passages[:5]):
434
+ sim_color = get_similarity_color(match["similarity"])
435
+ with st.expander(f"Match {i+1} - Similarity: {match['similarity']:.0%}"):
436
+ col1, col2 = st.columns(2)
437
+ with col1:
438
+ st.markdown(f"**{doc1_name.split(' (')[0]}** (Chunk {match['doc1_chunk']+1})")
439
+ st.markdown(f"""
440
+ <div class="chunk-match diff-common">
441
+ {match['doc1_text']}...
442
+ </div>
443
+ """, unsafe_allow_html=True)
444
+ with col2:
445
+ st.markdown(f"**{doc2_name.split(' (')[0]}** (Chunk {match['doc2_chunk']+1})")
446
+ st.markdown(f"""
447
+ <div class="chunk-match diff-common">
448
+ {match['doc2_text']}...
449
+ </div>
450
+ """, unsafe_allow_html=True)
451
+ else:
452
+ st.info("No significantly similar passages found between documents")
453
+
454
+ # Key terms comparison
455
+ st.markdown("---")
456
+ st.markdown("### Key Terms Comparison")
457
+
458
+ # Get most frequent words (simple approach)
459
+ from collections import Counter
460
+
461
+ def get_top_words(text, n=20):
462
+ words = text.lower().split()
463
+ # Filter out common words
464
+ stopwords = {"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
465
+ "have", "has", "had", "do", "does", "did", "will", "would", "could",
466
+ "should", "may", "might", "must", "and", "or", "but", "if", "then",
467
+ "so", "to", "of", "in", "for", "on", "with", "at", "by", "from",
468
+ "this", "that", "these", "those", "it", "its"}
469
+ words = [w for w in words if len(w) > 3 and w not in stopwords]
470
+ return Counter(words).most_common(n)
471
+
472
+ top1 = get_top_words(doc1.raw_text)
473
+ top2 = get_top_words(doc2.raw_text)
474
+
475
+ col1, col2 = st.columns(2)
476
+ with col1:
477
+ st.markdown(f"**Top terms in {doc1_name.split(' (')[0]}:**")
478
+ for word, count in top1[:10]:
479
+ in_doc2 = word in [w for w, c in top2]
480
+ color = "#4ECDC4" if in_doc2 else "#8b949e"
481
+ st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
482
+
483
+ with col2:
484
+ st.markdown(f"**Top terms in {doc2_name.split(' (')[0]}:**")
485
+ for word, count in top2[:10]:
486
+ in_doc1 = word in [w for w, c in top1]
487
+ color = "#4ECDC4" if in_doc1 else "#8b949e"
488
+ st.markdown(f"<span style='color: {color};'>• {word}</span> ({count})", unsafe_allow_html=True)
489
+
490
+ else:
491
+ st.info("Content comparison requires both documents to be in processed state")
492
+
493
+ # Export options
494
+ st.markdown("---")
495
+ st.markdown("### Export Comparison")
496
+
497
+ export_cols = st.columns(3)
498
+ with export_cols[0]:
499
+ if st.button("📄 Export as JSON", use_container_width=True):
500
+ import json
501
+ export_data = {
502
+ "document1": doc1_name,
503
+ "document2": doc2_name,
504
+ "comparison_type": comparison_type,
505
+ }
506
+ st.json(export_data)
507
+ with export_cols[1]:
508
+ st.button("📊 Export as CSV", disabled=True, use_container_width=True)
509
+ with export_cols[2]:
510
+ st.button("📋 Export as PDF", disabled=True, use_container_width=True)
511
+
512
+ # Navigation
513
+ st.markdown("---")
514
+ st.markdown("### Navigation")
515
+ nav_cols = st.columns(4)
516
+
517
+ with nav_cols[0]:
518
+ if st.button("🔬 Live Processing", use_container_width=True):
519
+ st.switch_page("pages/1_🔬_Live_Processing.py")
520
+ with nav_cols[1]:
521
+ if st.button("💬 Interactive RAG", use_container_width=True):
522
+ st.switch_page("pages/2_💬_Interactive_RAG.py")
523
+ with nav_cols[2]:
524
+ if st.button("🎯 Evidence Viewer", use_container_width=True):
525
+ st.switch_page("pages/4_🎯_Evidence_Viewer.py")
526
+ with nav_cols[3]:
527
+ if st.button("📄 Document Viewer", use_container_width=True):
528
+ st.switch_page("pages/5_📄_Document_Viewer.py")
demo/pages/4_🎯_Evidence_Viewer.py ADDED
@@ -0,0 +1,529 @@
1
+ """
2
+ Evidence Viewer - SPARKNET
3
+
4
+ Visualize extracted OCR regions, layout, and evidence grounding with
5
+ confidence-based coloring and interactivity.
6
+ """
7
+
8
+ import streamlit as st
9
+ import sys
10
+ from pathlib import Path
11
+ import base64
12
+
13
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
14
+ sys.path.insert(0, str(PROJECT_ROOT))
15
+ sys.path.insert(0, str(PROJECT_ROOT / "demo"))
16
+
17
+ from state_manager import (
18
+ get_state_manager,
19
+ render_global_status_bar,
20
+ )
21
+ from rag_config import (
22
+ get_indexed_documents,
23
+ get_chunks_for_document,
24
+ check_ollama,
25
+ )
26
+
27
+ st.set_page_config(page_title="Evidence Viewer - SPARKNET", page_icon="🎯", layout="wide")
28
+
29
+ # Custom CSS with confidence-based colors
30
+ st.markdown("""
31
+ <style>
32
+ .region-card {
33
+ background: #161b22;
34
+ border-radius: 10px;
35
+ padding: 12px;
36
+ margin: 8px 0;
37
+ border-left: 4px solid;
38
+ transition: transform 0.2s;
39
+ }
40
+ .region-card:hover {
41
+ transform: translateX(4px);
42
+ }
43
+ .confidence-high {
44
+ border-left-color: #4ECDC4 !important;
45
+ background: linear-gradient(90deg, rgba(78,205,196,0.1) 0%, transparent 100%);
46
+ }
47
+ .confidence-med {
48
+ border-left-color: #ffc107 !important;
49
+ background: linear-gradient(90deg, rgba(255,193,7,0.1) 0%, transparent 100%);
50
+ }
51
+ .confidence-low {
52
+ border-left-color: #dc3545 !important;
53
+ background: linear-gradient(90deg, rgba(220,53,69,0.1) 0%, transparent 100%);
54
+ }
55
+ .region-header {
56
+ display: flex;
57
+ justify-content: space-between;
58
+ align-items: center;
59
+ margin-bottom: 8px;
60
+ }
61
+ .region-type {
62
+ font-weight: bold;
63
+ text-transform: uppercase;
64
+ font-size: 12px;
65
+ }
66
+ .region-conf {
67
+ font-size: 14px;
68
+ font-weight: bold;
69
+ }
70
+ .region-text {
71
+ font-family: 'Monaco', 'Menlo', monospace;
72
+ font-size: 13px;
73
+ color: #c9d1d9;
74
+ line-height: 1.5;
75
+ }
76
+ .region-meta {
77
+ font-size: 10px;
78
+ color: #8b949e;
79
+ margin-top: 8px;
80
+ }
81
+ .bbox-display {
82
+ background: #0d1117;
83
+ padding: 4px 8px;
84
+ border-radius: 4px;
85
+ font-family: monospace;
86
+ font-size: 11px;
87
+ }
88
+ .page-thumbnail {
89
+ border: 2px solid #30363d;
90
+ border-radius: 8px;
91
+ padding: 10px;
92
+ background: #0d1117;
93
+ }
94
+ .page-thumbnail.active {
95
+ border-color: #4ECDC4;
96
+ }
97
+ .stats-card {
98
+ background: #161b22;
99
+ border-radius: 8px;
100
+ padding: 15px;
101
+ text-align: center;
102
+ }
103
+ .stats-value {
104
+ font-size: 28px;
105
+ font-weight: bold;
106
+ color: #4ECDC4;
107
+ }
108
+ .stats-label {
109
+ font-size: 11px;
110
+ color: #8b949e;
111
+ text-transform: uppercase;
112
+ }
113
+ .copy-btn {
114
+ background: #21262d;
115
+ border: none;
116
+ padding: 4px 8px;
117
+ border-radius: 4px;
118
+ font-size: 11px;
119
+ cursor: pointer;
120
+ }
121
+ </style>
122
+ """, unsafe_allow_html=True)
123
+
124
+
125
+ def get_confidence_class(conf: float) -> str:
126
+ """Get CSS class based on confidence."""
127
+ if conf >= 0.8:
128
+ return "confidence-high"
129
+ elif conf >= 0.6:
130
+ return "confidence-med"
131
+ return "confidence-low"
132
+
133
+
134
+ def get_confidence_color(conf: float) -> str:
135
+ """Get color based on confidence."""
136
+ if conf >= 0.8:
137
+ return "#4ECDC4"
138
+ elif conf >= 0.6:
139
+ return "#ffc107"
140
+ return "#dc3545"
141
+
142
+
143
+ def get_type_color(region_type: str) -> str:
144
+ """Get color for region type."""
145
+ colors = {
146
+ "title": "#FF6B6B",
147
+ "heading": "#FF8E6B",
148
+ "paragraph": "#4ECDC4",
149
+ "text": "#45B7D1",
150
+ "list": "#96CEB4",
151
+ "table": "#FFEAA7",
152
+ "figure": "#DDA0DD",
153
+ "header": "#98D8C8",
154
+ "footer": "#8b949e",
155
+ }
156
+ return colors.get(region_type.lower(), "#666")
157
+
158
+
159
+ # Initialize state manager
160
+ state_manager = get_state_manager()
161
+
162
+ # Header
163
+ st.markdown("# 🎯 Evidence Viewer")
164
+ st.markdown("Visualize OCR regions, layout structure, and evidence grounding with confidence scoring")
165
+
166
+ # Global status bar
167
+ render_global_status_bar()
168
+
169
+ st.markdown("---")
170
+
171
+ # Get documents from state
172
+ all_docs = state_manager.get_all_documents()
173
+ indexed_docs = get_indexed_documents()
174
+
175
+ # Sidebar for document selection
176
+ with st.sidebar:
177
+ st.markdown("## 📚 Select Document")
178
+
179
+ if all_docs:
180
+ doc_options = {f"{d.filename} ({len(d.ocr_regions)} regions)": d.doc_id for d in all_docs}
181
+ selected_doc_name = st.selectbox("Processed Documents", list(doc_options.keys()))
182
+ selected_doc_id = doc_options.get(selected_doc_name)
183
+
184
+ if selected_doc_id:
185
+ state_manager.set_active_document(selected_doc_id)
186
+ else:
187
+ st.info("No documents processed yet")
188
+ selected_doc_id = None
189
+
190
+ st.markdown("---")
191
+ st.markdown("## 🎨 Display Options")
192
+
193
+ show_ocr = st.checkbox("Show OCR Regions", value=True)
194
+ show_layout = st.checkbox("Show Layout Regions", value=True)
195
+ show_bbox = st.checkbox("Show Bounding Boxes", value=True)
196
+
197
+ st.markdown("---")
198
+ st.markdown("## 🎚️ Filters")
199
+
200
+ min_confidence = st.slider("Min Confidence", 0.0, 1.0, 0.0, 0.1)
201
+
202
+ region_types = ["All", "title", "heading", "paragraph", "text", "list", "table", "figure"]
203
+ selected_type = st.selectbox("Region Type", region_types)
204
+
205
+ # Main content
206
+ active_doc = state_manager.get_active_document()
207
+
208
+ if active_doc:
209
+ # Document header
210
+ col1, col2 = st.columns([3, 1])
211
+ with col1:
212
+ st.markdown(f"## 📄 {active_doc.filename}")
213
+ st.caption(f"ID: `{active_doc.doc_id}` | {active_doc.page_count} pages")
214
+ with col2:
215
+ if active_doc.indexed:
216
+ st.success("Indexed")
217
+ else:
218
+ st.warning("Not indexed")
219
+
220
+ # Statistics cards
221
+ stat_cols = st.columns(5)
222
+
223
+ # Calculate stats
224
+ ocr_regions = active_doc.ocr_regions
225
+ layout_regions = active_doc.layout_data.get("regions", [])
226
+
227
+ avg_ocr_conf = sum(r.get("confidence", 0) for r in ocr_regions) / len(ocr_regions) if ocr_regions else 0
228
+ high_conf_count = len([r for r in ocr_regions if r.get("confidence", 0) >= 0.8])
229
+ med_conf_count = len([r for r in ocr_regions if 0.6 <= r.get("confidence", 0) < 0.8])
230
+ low_conf_count = len([r for r in ocr_regions if r.get("confidence", 0) < 0.6])
231
+
232
+ stat_cols[0].markdown(f"""
233
+ <div class="stats-card">
234
+ <div class="stats-value">{len(ocr_regions)}</div>
235
+ <div class="stats-label">OCR Regions</div>
236
+ </div>
237
+ """, unsafe_allow_html=True)
238
+
239
+ stat_cols[1].markdown(f"""
240
+ <div class="stats-card">
241
+ <div class="stats-value">{len(layout_regions)}</div>
242
+ <div class="stats-label">Layout Regions</div>
243
+ </div>
244
+ """, unsafe_allow_html=True)
245
+
246
+ stat_cols[2].markdown(f"""
247
+ <div class="stats-card">
248
+ <div class="stats-value" style="color: #4ECDC4;">{avg_ocr_conf:.0%}</div>
249
+ <div class="stats-label">Avg Confidence</div>
250
+ </div>
251
+ """, unsafe_allow_html=True)
252
+
253
+ stat_cols[3].markdown(f"""
254
+ <div class="stats-card">
255
+ <div class="stats-value" style="color: #4ECDC4;">{high_conf_count}</div>
256
+ <div class="stats-label">High Conf (&gt;80%)</div>
257
+ </div>
258
+ """, unsafe_allow_html=True)
259
+
260
+ stat_cols[4].markdown(f"""
261
+ <div class="stats-card">
262
+ <div class="stats-value" style="color: #dc3545;">{low_conf_count}</div>
263
+ <div class="stats-label">Low Conf (&lt;60%)</div>
264
+ </div>
265
+ """, unsafe_allow_html=True)
266
+
267
+ st.markdown("---")
268
+
269
+ # Main view - Page images and regions
270
+ tab_regions, tab_pages, tab_export = st.tabs(["📋 Regions", "📄 Page View", "📥 Export"])
271
+
272
+ with tab_regions:
273
+ # Filter regions
274
+ filtered_ocr = ocr_regions
275
+ if min_confidence > 0:
276
+ filtered_ocr = [r for r in filtered_ocr if r.get("confidence", 0) >= min_confidence]
277
+
278
+ # Page selector
279
+ pages = sorted(set(r.get("page", 0) for r in filtered_ocr))
280
+ if pages:
281
+ selected_page = st.selectbox(
282
+ "Select Page",
283
+ pages,
284
+ format_func=lambda x: f"Page {x + 1} ({len([r for r in filtered_ocr if r.get('page') == x])} regions)"
285
+ )
286
+
287
+ page_regions = [r for r in filtered_ocr if r.get("page") == selected_page]
288
+
289
+ st.markdown(f"### OCR Regions on Page {selected_page + 1}")
290
+ st.caption(f"Showing {len(page_regions)} regions (filtered by confidence >= {min_confidence:.0%})")
291
+
292
+ # Display regions with confidence coloring
293
+ for i, region in enumerate(page_regions):
294
+ conf = region.get("confidence", 0)
295
+ conf_class = get_confidence_class(conf)
296
+ conf_color = get_confidence_color(conf)
297
+ text = region.get("text", "")
298
+ bbox = region.get("bbox")
299
+
300
+ st.markdown(f"""
301
+ <div class="region-card {conf_class}">
302
+ <div class="region-header">
303
+ <span class="region-type" style="color: {conf_color};">Region {i + 1}</span>
304
+ <span class="region-conf" style="color: {conf_color};">{conf:.0%}</span>
305
+ </div>
306
+ <div class="region-text">{text}</div>
307
+ <div class="region-meta">
308
+ {f'<span class="bbox-display">Bbox: ({bbox[0]:.0f}, {bbox[1]:.0f}) - ({bbox[2]:.0f}, {bbox[3]:.0f})</span>' if bbox and show_bbox else ''}
309
+ </div>
310
+ </div>
311
+ """, unsafe_allow_html=True)
312
+
313
+ # Copy button
314
+ col1, col2 = st.columns([4, 1])
315
+ with col2:
316
+ if st.button("📋 Copy", key=f"copy_{i}"):
317
+ st.toast(f"Copied region {i+1} text!")
318
+
319
+ else:
320
+ st.info("No OCR regions available for this document")
321
+
322
+ # Layout regions
323
+ if show_layout and layout_regions:
324
+ st.markdown("---")
325
+ st.markdown("### Layout Regions")
326
+
327
+ # Group by type
328
+ by_type = {}
329
+ for r in layout_regions:
330
+ rtype = r.get("type", "unknown")
331
+ if rtype not in by_type:
332
+ by_type[rtype] = []
333
+ by_type[rtype].append(r)
334
+
335
+ # Type pills
336
+ st.markdown("**Detected types:**")
337
+ type_html = ""
338
+ for rtype, regions in by_type.items():
339
+ color = get_type_color(rtype)
340
+ type_html += f'<span style="background: {color}33; color: {color}; padding: 4px 10px; border-radius: 12px; margin: 4px; display: inline-block; font-size: 12px;">{rtype.title()} ({len(regions)})</span>'
341
+ st.markdown(type_html, unsafe_allow_html=True)
342
+
343
+ # Layout details
344
+ for rtype, regions in by_type.items():
345
+ with st.expander(f"{rtype.title()} ({len(regions)} regions)"):
346
+ for r in regions[:10]:
347
+ conf = r.get("confidence", 0)
348
+ conf_color = get_confidence_color(conf)
349
+ st.markdown(f"""
350
+ <div style="background: #0d1117; padding: 8px; border-radius: 6px; margin: 4px 0; border-left: 3px solid {get_type_color(rtype)};">
351
+ <span style="color: {conf_color};">{conf:.0%}</span> | Page {r.get('page', 0) + 1}
352
+ </div>
353
+ """, unsafe_allow_html=True)
354
+
355
+ with tab_pages:
356
+ st.markdown("### Page Images with Regions")
357
+
358
+ if active_doc.page_images:
359
+ page_select = st.selectbox(
360
+ "Page",
361
+ range(len(active_doc.page_images)),
362
+ format_func=lambda x: f"Page {x + 1}",
363
+ key="page_view_select"
364
+ )
365
+
366
+ if page_select is not None:
367
+ # Display page image
368
+ img_data = active_doc.page_images[page_select]
369
+ st.image(
370
+ f"data:image/png;base64,{img_data}",
371
+ caption=f"Page {page_select + 1}",
372
+ use_container_width=True
373
+ )
374
+
375
+ # Regions on this page
376
+ page_ocr = [r for r in ocr_regions if r.get("page") == page_select]
377
+ page_layout = [r for r in layout_regions if r.get("page") == page_select]
378
+
379
+ col1, col2 = st.columns(2)
380
+ with col1:
381
+ st.metric("OCR Regions", len(page_ocr))
382
+ with col2:
383
+ st.metric("Layout Regions", len(page_layout))
384
+
385
+ st.info("Bounding box overlay visualization will be available in future updates")
386
+ else:
387
+ st.info("No page images available. Process a PDF document to see page images.")
388
+
389
+ with tab_export:
390
+ st.markdown("### Export Evidence Data")
391
+
392
+ export_cols = st.columns(3)
393
+
394
+ with export_cols[0]:
395
+ st.markdown("**OCR Regions JSON**")
396
+ if st.button("📥 Export OCR", use_container_width=True):
397
+ import json
398
+ ocr_json = json.dumps({
399
+ "document_id": active_doc.doc_id,
400
+ "filename": active_doc.filename,
401
+ "ocr_regions": ocr_regions,
402
+ }, indent=2)
403
+ st.download_button(
404
+ "Download JSON",
405
+ ocr_json,
406
+ file_name=f"{active_doc.doc_id}_ocr.json",
407
+ mime="application/json"
408
+ )
409
+
410
+ with export_cols[1]:
411
+ st.markdown("**Layout Regions JSON**")
412
+ if st.button("📥 Export Layout", use_container_width=True):
413
+ import json
414
+ layout_json = json.dumps({
415
+ "document_id": active_doc.doc_id,
416
+ "filename": active_doc.filename,
417
+ "layout_regions": layout_regions,
418
+ }, indent=2)
419
+ st.download_button(
420
+ "Download JSON",
421
+ layout_json,
422
+ file_name=f"{active_doc.doc_id}_layout.json",
423
+ mime="application/json"
424
+ )
425
+
426
+ with export_cols[2]:
427
+ st.markdown("**Full Text**")
428
+ st.download_button(
429
+ "📥 Export Text",
430
+ active_doc.raw_text,
431
+ file_name=f"{active_doc.doc_id}.txt",
432
+ mime="text/plain",
433
+ use_container_width=True
434
+ )
435
+
436
+ # Confidence distribution chart
437
+ st.markdown("---")
438
+ st.markdown("### Confidence Distribution")
439
+
440
+ if ocr_regions:
441
+ import pandas as pd
442
+
443
+ # Build distribution data
444
+ conf_bins = {"High (>80%)": 0, "Medium (60-80%)": 0, "Low (<60%)": 0}
445
+ for r in ocr_regions:
446
+ c = r.get("confidence", 0)
447
+ if c >= 0.8:
448
+ conf_bins["High (>80%)"] += 1
449
+ elif c >= 0.6:
450
+ conf_bins["Medium (60-80%)"] += 1
451
+ else:
452
+ conf_bins["Low (<60%)"] += 1
453
+
454
+ df = pd.DataFrame({
455
+ "Confidence Level": list(conf_bins.keys()),
456
+ "Count": list(conf_bins.values())
457
+ })
458
+ st.bar_chart(df.set_index("Confidence Level"))
459
+
460
+ # Navigation
461
+ st.markdown("---")
462
+ st.markdown("### Actions")
463
+ nav_cols = st.columns(4)
464
+
465
+ with nav_cols[0]:
466
+ if st.button("💬 Query RAG", use_container_width=True):
467
+ st.switch_page("pages/2_💬_Interactive_RAG.py")
468
+ with nav_cols[1]:
469
+ if st.button("📄 Document Viewer", use_container_width=True):
470
+ st.switch_page("pages/5_📄_Document_Viewer.py")
471
+ with nav_cols[2]:
472
+ if st.button("📊 Compare", use_container_width=True):
473
+ st.switch_page("pages/3_📊_Document_Comparison.py")
474
+ with nav_cols[3]:
475
+ if st.button("🔬 Process New", use_container_width=True):
476
+ st.switch_page("pages/1_🔬_Live_Processing.py")
477
+
478
+ else:
479
+ # No document selected
480
+ st.markdown("## No Document Selected")
481
+
482
+ st.markdown("""
483
+ ### Getting Started
484
+
485
+ 1. Go to **Live Processing** to upload and process a document
486
+ 2. Come back here to view OCR regions and evidence grounding
487
+ 3. Use confidence filters to focus on high or low quality regions
488
+
489
+ Evidence viewer shows:
490
+ - OCR extracted text regions with confidence scores
491
+ - Layout detection results (titles, paragraphs, tables, etc.)
492
+ - Bounding box coordinates for each region
493
+ - Page images with region overlays
494
+ """)
495
+
496
+ col1, col2 = st.columns(2)
497
+ with col1:
498
+ if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
499
+ st.switch_page("pages/1_🔬_Live_Processing.py")
500
+ with col2:
501
+ if st.button("📄 Go to Document Viewer", use_container_width=True):
502
+ st.switch_page("pages/5_📄_Document_Viewer.py")
503
+
504
+ # Legend
505
+ st.markdown("---")
506
+ st.markdown("### Confidence Color Legend")
507
+
508
+ legend_cols = st.columns(3)
509
+ with legend_cols[0]:
510
+ st.markdown("""
511
+ <div style="background: rgba(78,205,196,0.2); padding: 10px; border-radius: 8px; border-left: 4px solid #4ECDC4;">
512
+ <strong style="color: #4ECDC4;">High Confidence (>80%)</strong><br>
513
+ <span style="color: #8b949e;">Reliable extraction</span>
514
+ </div>
515
+ """, unsafe_allow_html=True)
516
+ with legend_cols[1]:
517
+ st.markdown("""
518
+ <div style="background: rgba(255,193,7,0.2); padding: 10px; border-radius: 8px; border-left: 4px solid #ffc107;">
519
+ <strong style="color: #ffc107;">Medium Confidence (60-80%)</strong><br>
520
+ <span style="color: #8b949e;">Review recommended</span>
521
+ </div>
522
+ """, unsafe_allow_html=True)
523
+ with legend_cols[2]:
524
+ st.markdown("""
525
+ <div style="background: rgba(220,53,69,0.2); padding: 10px; border-radius: 8px; border-left: 4px solid #dc3545;">
526
+ <strong style="color: #dc3545;">Low Confidence (<60%)</strong><br>
527
+ <span style="color: #8b949e;">Manual verification needed</span>
528
+ </div>
529
+ """, unsafe_allow_html=True)
demo/pages/5_📄_Document_Viewer.py ADDED
@@ -0,0 +1,565 @@
1
+ """
2
+ Document Viewer - SPARKNET
3
+
4
+ View and explore processed documents from the state manager.
5
+ Provides visual chunk segmentation, OCR regions, and layout visualization.
6
+ """
7
+
8
+ import streamlit as st
9
+ import sys
10
+ from pathlib import Path
11
+ import time
12
+ import hashlib
13
+ import base64
14
+ from typing import List, Dict, Any
15
+
16
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
17
+ sys.path.insert(0, str(PROJECT_ROOT))
18
+ sys.path.insert(0, str(PROJECT_ROOT / "demo"))
19
+
20
+ # Import state manager and RAG config
21
+ from state_manager import (
22
+ get_state_manager,
23
+ ProcessedDocument,
24
+ render_global_status_bar,
25
+ )
26
+ from rag_config import (
27
+ get_unified_rag_system,
28
+ get_store_stats,
29
+ get_indexed_documents,
30
+ get_chunks_for_document,
31
+ check_ollama,
32
+ )
33
+
34
+ st.set_page_config(
35
+ page_title="Document Viewer - SPARKNET",
36
+ page_icon="📄",
37
+ layout="wide"
38
+ )
39
+
40
+ # Custom CSS
41
+ st.markdown("""
42
+ <style>
43
+ .chunk-card {
44
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
45
+ border-radius: 10px;
46
+ padding: 12px;
47
+ margin: 8px 0;
48
+ border-left: 4px solid #4ECDC4;
49
+ }
50
+ .chunk-header {
51
+ display: flex;
52
+ justify-content: space-between;
53
+ font-size: 11px;
54
+ color: #8b949e;
55
+ margin-bottom: 6px;
56
+ }
57
+ .chunk-text {
58
+ font-size: 13px;
59
+ line-height: 1.5;
60
+ color: #c9d1d9;
61
+ font-family: 'Monaco', 'Menlo', monospace;
62
+ }
63
+ .ocr-region {
64
+ background: #161b22;
65
+ border-radius: 6px;
66
+ padding: 8px;
67
+ margin: 4px 0;
68
+ border-left: 3px solid;
69
+ }
70
+ .layout-region {
71
+ display: inline-block;
72
+ padding: 4px 8px;
73
+ margin: 3px;
74
+ border-radius: 4px;
75
+ font-size: 11px;
76
+ }
77
+ .doc-card {
78
+ background: #0d1117;
79
+ border-radius: 10px;
80
+ padding: 15px;
81
+ margin: 10px 0;
82
+ border: 1px solid #30363d;
83
+ cursor: pointer;
84
+ transition: border-color 0.2s;
85
+ }
86
+ .doc-card:hover {
87
+ border-color: #4ECDC4;
88
+ }
89
+ .doc-card.active {
90
+ border-color: #4ECDC4;
91
+ border-width: 2px;
92
+ }
93
+ .metric-mini {
94
+ background: #161b22;
95
+ border-radius: 6px;
96
+ padding: 8px;
97
+ text-align: center;
98
+ margin: 4px;
99
+ }
100
+ .metric-mini .value {
101
+ font-size: 18px;
102
+ font-weight: bold;
103
+ color: #4ECDC4;
104
+ }
105
+ .metric-mini .label {
106
+ font-size: 10px;
107
+ color: #8b949e;
108
+ text-transform: uppercase;
109
+ }
110
+ .page-viewer {
111
+ background: #0d1117;
112
+ border-radius: 10px;
113
+ padding: 20px;
114
+ max-height: 600px;
115
+ overflow-y: auto;
116
+ }
117
+ .confidence-high { color: #4ECDC4; }
118
+ .confidence-med { color: #ffc107; }
119
+ .confidence-low { color: #dc3545; }
120
+ </style>
121
+ """, unsafe_allow_html=True)
122
+
123
+
124
+ def get_chunk_color(index: int) -> str:
125
+ """Get distinct color for chunk visualization."""
126
+ colors = [
127
+ "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
128
+ "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
129
+ "#BB8FCE", "#85C1E9", "#F8B500", "#00CED1"
130
+ ]
131
+ return colors[index % len(colors)]
132
+
133
+
134
+ def get_confidence_class(conf: float) -> str:
135
+ """Get confidence CSS class."""
136
+ if conf >= 0.8:
137
+ return "confidence-high"
138
+ elif conf >= 0.6:
139
+ return "confidence-med"
140
+ return "confidence-low"
141
+
142
+
143
+ def get_layout_color(layout_type: str) -> str:
144
+ """Get color for layout type."""
145
+ colors = {
146
+ "title": "#FF6B6B",
147
+ "heading": "#FF8E6B",
148
+ "paragraph": "#4ECDC4",
149
+ "text": "#45B7D1",
150
+ "list": "#96CEB4",
151
+ "table": "#FFEAA7",
152
+ "figure": "#DDA0DD",
153
+ "header": "#98D8C8",
154
+ "footer": "#8b949e",
155
+ }
156
+ return colors.get(layout_type.lower(), "#666")
157
+
158
+
159
+ # Initialize state manager
160
+ state_manager = get_state_manager()
161
+
162
+ # Header
163
+ st.markdown("# 📄 Document Viewer")
164
+ st.markdown("Explore processed documents, chunks, OCR regions, and layout structure")
165
+
166
+ # Global status bar
167
+ render_global_status_bar()
168
+
169
+ st.markdown("---")
170
+
171
+ # Get all documents from state and RAG
172
+ all_state_docs = state_manager.get_all_documents()
173
+ rag_docs = get_indexed_documents()
174
+
175
+ # Sidebar for document selection
176
+ with st.sidebar:
177
+ st.markdown("## 📚 Documents")
178
+
179
+ # Processed documents from state manager
180
+ if all_state_docs:
181
+ st.markdown("### Recently Processed")
182
+ selected_doc_id = None
183
+
184
+ for doc in reversed(all_state_docs[-10:]):
185
+ is_active = state_manager.state.get("active_doc_id") == doc.doc_id
186
+ card_class = "doc-card active" if is_active else "doc-card"
187
+
188
+ if st.button(
189
+ f"📄 {doc.filename[:25]}...",
190
+ key=f"doc_{doc.doc_id}",
191
+ use_container_width=True,
192
+ type="primary" if is_active else "secondary"
193
+ ):
194
+ state_manager.set_active_document(doc.doc_id)
195
+ st.rerun()
196
+
197
+ # Mini stats
198
+ cols = st.columns(3)
199
+ cols[0].caption(f"📄 {doc.page_count}p")
200
+ cols[1].caption(f"📦 {len(doc.chunks)}")
201
+ if doc.indexed:
202
+ cols[2].caption("✓ Indexed")
203
+ st.markdown("---")
204
+ else:
205
+ st.info("No documents processed yet")
206
+ st.markdown("Go to **Live Processing** to process documents")
207
+
208
+ # RAG indexed documents
209
+ if rag_docs:
210
+ st.markdown("### 📊 RAG Index")
211
+ st.caption(f"{len(rag_docs)} documents indexed")
212
+ for doc in rag_docs[:5]:
213
+ st.caption(f"• {doc.get('document_id', 'unknown')[:20]}...")
214
+
215
+ # Main content
216
+ active_doc = state_manager.get_active_document()
217
+
218
+ if active_doc:
219
+ # Document header
220
+ col1, col2 = st.columns([3, 1])
221
+
222
+ with col1:
223
+ st.markdown(f"## 📄 {active_doc.filename}")
224
+ st.caption(f"ID: `{active_doc.doc_id}` | Type: {active_doc.file_type} | Processed: {active_doc.created_at.strftime('%Y-%m-%d %H:%M')}")
225
+
226
+ with col2:
227
+ if active_doc.indexed:
228
+ st.success(f"✓ Indexed ({active_doc.indexed_chunks} chunks)")
229
+ else:
230
+ st.warning("Not indexed")
231
+
232
+ # Summary metrics
233
+ metric_cols = st.columns(6)
234
+ metric_cols[0].markdown(f"""
235
+ <div class="metric-mini">
236
+ <div class="value">{active_doc.page_count}</div>
237
+ <div class="label">Pages</div>
238
+ </div>
239
+ """, unsafe_allow_html=True)
240
+ metric_cols[1].markdown(f"""
241
+ <div class="metric-mini">
242
+ <div class="value">{len(active_doc.chunks)}</div>
243
+ <div class="label">Chunks</div>
244
+ </div>
245
+ """, unsafe_allow_html=True)
246
+ metric_cols[2].markdown(f"""
247
+ <div class="metric-mini">
248
+ <div class="value">{len(active_doc.ocr_regions)}</div>
249
+ <div class="label">OCR Regions</div>
250
+ </div>
251
+ """, unsafe_allow_html=True)
252
+ layout_count = len(active_doc.layout_data.get("regions", []))
253
+ metric_cols[3].markdown(f"""
254
+ <div class="metric-mini">
255
+ <div class="value">{layout_count}</div>
256
+ <div class="label">Layout Regions</div>
257
+ </div>
258
+ """, unsafe_allow_html=True)
259
+ metric_cols[4].markdown(f"""
260
+ <div class="metric-mini">
261
+ <div class="value">{len(active_doc.raw_text):,}</div>
262
+ <div class="label">Characters</div>
263
+ </div>
264
+ """, unsafe_allow_html=True)
265
+ metric_cols[5].markdown(f"""
266
+ <div class="metric-mini">
267
+ <div class="value">{active_doc.processing_time:.1f}s</div>
268
+ <div class="label">Process Time</div>
269
+ </div>
270
+ """, unsafe_allow_html=True)
271
+
272
+ st.markdown("---")
273
+
274
+ # Tabs for different views
275
+ tab_chunks, tab_text, tab_ocr, tab_layout, tab_pages = st.tabs([
276
+ "📦 Chunks",
277
+ "📝 Full Text",
278
+ "🔍 OCR Regions",
279
+ "🗺️ Layout",
280
+ "📄 Page Images"
281
+ ])
282
+
283
+ with tab_chunks:
284
+ st.markdown("### Document Chunks")
285
+
286
+ # Filter options
287
+ filter_cols = st.columns([2, 1, 1])
288
+ with filter_cols[0]:
289
+ search_term = st.text_input("Search in chunks", placeholder="Enter search term...")
290
+ with filter_cols[1]:
291
+ chunk_types = list(set(c.get("chunk_type", "text") for c in active_doc.chunks))
292
+ selected_type = st.selectbox("Filter by type", ["All"] + chunk_types)
293
+ with filter_cols[2]:
294
+ page_filter = st.selectbox("Filter by page", ["All"] + list(range(1, active_doc.page_count + 1)))
295
+
296
+ # Filter chunks
297
+ filtered_chunks = active_doc.chunks
298
+ if search_term:
299
+ filtered_chunks = [c for c in filtered_chunks if search_term.lower() in c.get("text", "").lower()]
300
+ if selected_type != "All":
301
+ filtered_chunks = [c for c in filtered_chunks if c.get("chunk_type") == selected_type]
302
+ if page_filter != "All":
303
+ filtered_chunks = [c for c in filtered_chunks if c.get("page", 0) + 1 == page_filter]
304
+
305
+ st.caption(f"Showing {len(filtered_chunks)} of {len(active_doc.chunks)} chunks")
306
+
307
+ # Display chunks
308
+ for i, chunk in enumerate(filtered_chunks[:30]):
309
+ chunk_type = chunk.get("chunk_type", "text")
310
+ conf = chunk.get("confidence", 0)
311
+ color = get_chunk_color(i)
312
+ conf_class = get_confidence_class(conf)
313
+
314
+ with st.expander(f"[{i+1}] {chunk_type.upper()} - {chunk.get('text', '')[:60]}...", expanded=(i == 0)):
315
+ st.markdown(f"""
316
+ <div class="chunk-card" style="border-left-color: {color};">
317
+ <div class="chunk-header">
318
+ <span>ID: <code>{chunk.get('chunk_id', 'N/A')}</code></span>
319
+ <span>Page {chunk.get('page', 0) + 1}</span>
320
+ <span class="{conf_class}">Confidence: {conf:.0%}</span>
321
+ </div>
322
+ <div class="chunk-text">{chunk.get('text', '')}</div>
323
+ </div>
324
+ """, unsafe_allow_html=True)
325
+
326
+ # Bounding box info
327
+ bbox = chunk.get("bbox")
328
+ if bbox:
329
+ st.caption(f"Bbox: ({bbox[0]:.0f}, {bbox[1]:.0f}) - ({bbox[2]:.0f}, {bbox[3]:.0f})")
330
+
331
+ if len(filtered_chunks) > 30:
332
+ st.info(f"Showing 30 of {len(filtered_chunks)} matching chunks")
333
+
334
+ with tab_text:
335
+ st.markdown("### Extracted Text")
336
+
337
+ # Text display options
338
+ text_cols = st.columns([1, 1, 1])
339
+ with text_cols[0]:
340
+ show_page_markers = st.checkbox("Show page markers", value=True)
341
+ with text_cols[1]:
342
+ font_size = st.slider("Font size", 10, 18, 13)
343
+ with text_cols[2]:
344
+ max_chars = st.slider("Max characters", 5000, 50000, 20000, 1000)
345
+
346
+ text_to_display = active_doc.raw_text[:max_chars]
347
+ if len(active_doc.raw_text) > max_chars:
348
+ text_to_display += f"\n\n... [Truncated - {len(active_doc.raw_text) - max_chars:,} more characters]"
349
+
350
+ st.markdown(f"""
351
+ <div class="page-viewer" style="font-size: {font_size}px;">
352
+ <pre style="white-space: pre-wrap; font-family: monospace; margin: 0;">{text_to_display}</pre>
353
+ </div>
354
+ """, unsafe_allow_html=True)
355
+
356
+ # Download button
357
+ st.download_button(
358
+ "📥 Download Full Text",
359
+ active_doc.raw_text,
360
+ file_name=f"{active_doc.filename}.txt",
361
+ mime="text/plain"
362
+ )
363
+
364
+ with tab_ocr:
365
+ st.markdown("### OCR Regions")
366
+
367
+ if active_doc.ocr_regions:
368
+ # Group by page
369
+ by_page = {}
370
+ for region in active_doc.ocr_regions:
371
+ page = region.get("page", 0)
372
+ if page not in by_page:
373
+ by_page[page] = []
374
+ by_page[page].append(region)
375
+
376
+ # Page selector
377
+ page_select = st.selectbox(
378
+ "Select page",
379
+ sorted(by_page.keys()),
380
+ format_func=lambda x: f"Page {x + 1} ({len(by_page.get(x, []))} regions)"
381
+ )
382
+
383
+ if page_select is not None and page_select in by_page:
384
+ page_regions = by_page[page_select]
385
+
386
+ # Summary
387
+ avg_conf = sum(r.get("confidence", 0) for r in page_regions) / len(page_regions) if page_regions else 0
388
+ conf_class = get_confidence_class(avg_conf)
389
+
390
+ st.markdown(f"**{len(page_regions)} regions** | Average confidence: <span class='{conf_class}'>{avg_conf:.0%}</span>", unsafe_allow_html=True)
391
+
392
+ # Filter by confidence
393
+ min_conf = st.slider("Minimum confidence", 0.0, 1.0, 0.5, 0.1)
394
+ filtered_regions = [r for r in page_regions if r.get("confidence", 0) >= min_conf]
395
+
396
+ for i, region in enumerate(filtered_regions[:50]):
397
+ conf = region.get("confidence", 0)
398
+ conf_class = get_confidence_class(conf)
399
+ color = "#4ECDC4" if conf >= 0.8 else "#ffc107" if conf >= 0.6 else "#dc3545"
400
+
401
+ st.markdown(f"""
402
+ <div class="ocr-region" style="border-left-color: {color};">
403
+ <div style="display: flex; justify-content: space-between; margin-bottom: 4px;">
404
+ <span style="font-size: 11px; color: #8b949e;">Region {i+1}</span>
405
+ <span class="{conf_class}" style="font-size: 11px;">{conf:.0%}</span>
406
+ </div>
407
+ <div style="font-family: monospace; font-size: 12px;">{region.get('text', '')}</div>
408
+ </div>
409
+ """, unsafe_allow_html=True)
410
+
411
+ if len(filtered_regions) > 50:
412
+ st.info(f"Showing 50 of {len(filtered_regions)} regions")
413
+ else:
414
+ st.info("No OCR regions available for this document")
415
+ st.markdown("OCR regions are extracted during document processing with OCR enabled.")
416
+
417
+ with tab_layout:
418
+ st.markdown("### Layout Structure")
419
+
420
+ layout_regions = active_doc.layout_data.get("regions", [])
421
+
422
+ if layout_regions:
423
+ # Group by type
424
+ by_type = {}
425
+ for region in layout_regions:
426
+ rtype = region.get("type", "unknown")
427
+ if rtype not in by_type:
428
+ by_type[rtype] = []
429
+ by_type[rtype].append(region)
430
+
431
+ # Type summary
432
+ st.markdown("**Detected region types:**")
433
+ type_cols = st.columns(min(len(by_type), 6))
434
+ for i, (rtype, regions) in enumerate(by_type.items()):
435
+ color = get_layout_color(rtype)
436
+ type_cols[i % 6].markdown(f"""
437
+ <div class="layout-region" style="background: {color}20; border: 1px solid {color};">
438
+ <strong>{rtype.title()}</strong>: {len(regions)}
439
+ </div>
440
+ """, unsafe_allow_html=True)
441
+
442
+ st.markdown("---")
443
+
444
+ # Layout regions list
445
+ type_filter = st.selectbox("Filter by type", ["All"] + list(by_type.keys()))
446
+
447
+ filtered_layout = layout_regions
448
+ if type_filter != "All":
449
+ filtered_layout = by_type.get(type_filter, [])
450
+
451
+ for i, region in enumerate(filtered_layout[:30]):
452
+ rtype = region.get("type", "unknown")
453
+ conf = region.get("confidence", 0)
454
+ color = get_layout_color(rtype)
455
+ conf_class = get_confidence_class(conf)
456
+
457
+ st.markdown(f"""
458
+ <div style="background: #161b22; border-radius: 6px; padding: 10px; margin: 6px 0; border-left: 3px solid {color};">
459
+ <div style="display: flex; justify-content: space-between;">
460
+ <span><strong style="color: {color};">{rtype.upper()}</strong></span>
461
+ <span>Page {region.get('page', 0) + 1}</span>
462
+ <span class="{conf_class}">{conf:.0%}</span>
463
+ </div>
464
+ </div>
465
+ """, unsafe_allow_html=True)
466
+
467
+ if len(filtered_layout) > 30:
468
+ st.info(f"Showing 30 of {len(filtered_layout)} regions")
469
+ else:
470
+ st.info("No layout regions available for this document")
471
+ st.markdown("Layout regions are extracted during document processing with layout detection enabled.")
472
+
473
+ with tab_pages:
474
+ st.markdown("### Page Images")
475
+
476
+ if active_doc.page_images:
477
+ page_idx = st.selectbox(
478
+ "Select page",
479
+ list(range(len(active_doc.page_images))),
480
+ format_func=lambda x: f"Page {x + 1}"
481
+ )
482
+
483
+ if page_idx is not None and page_idx < len(active_doc.page_images):
484
+ img_data = active_doc.page_images[page_idx]
485
+
486
+ # Display image
487
+ st.image(
488
+ f"data:image/png;base64,{img_data}",
489
+ caption=f"Page {page_idx + 1}",
490
+ use_container_width=True
491
+ )
492
+
493
+ # Overlay options
494
+ st.markdown("**Overlay options:**")
495
+ overlay_cols = st.columns(3)
496
+ with overlay_cols[0]:
497
+ show_chunks = st.checkbox("Show chunk boundaries", value=False)
498
+ with overlay_cols[1]:
499
+ show_ocr = st.checkbox("Show OCR regions", value=False)
500
+ with overlay_cols[2]:
501
+ show_layout = st.checkbox("Show layout regions", value=False)
502
+
503
+ if show_chunks or show_ocr or show_layout:
504
+ st.info("Overlay visualization coming soon - requires image annotation support")
505
+ else:
506
+ st.info("No page images available for this document")
507
+ st.markdown("Page images are extracted from PDF documents during processing.")
508
+
509
+ # Navigation to other modules
510
+ st.markdown("---")
511
+ st.markdown("### 🔗 Actions")
512
+
513
+ nav_cols = st.columns(4)
514
+
515
+ with nav_cols[0]:
516
+ if st.button("💬 Ask Questions", use_container_width=True):
517
+ st.switch_page("pages/2_💬_Interactive_RAG.py")
518
+
519
+ with nav_cols[1]:
520
+ if st.button("🎯 View Evidence", use_container_width=True):
521
+ st.switch_page("pages/4_🎯_Evidence_Viewer.py")
522
+
523
+ with nav_cols[2]:
524
+ if st.button("📊 Compare Documents", use_container_width=True):
525
+ st.switch_page("pages/3_📊_Document_Comparison.py")
526
+
527
+ with nav_cols[3]:
528
+ if st.button("🔬 Process New", use_container_width=True):
529
+ st.switch_page("pages/1_🔬_Live_Processing.py")
530
+
531
+ else:
532
+ # No active document
533
+ st.markdown("## No Document Selected")
534
+
535
+ col1, col2 = st.columns(2)
536
+
537
+ with col1:
538
+ st.markdown("""
539
+ ### Getting Started
540
+
541
+ 1. Go to **Live Processing** to upload and process a document
542
+ 2. Processed documents will appear in the sidebar
543
+ 3. Click on a document to view its details
544
+
545
+ Or select a document from the sidebar if you've already processed some.
546
+ """)
547
+
548
+ if st.button("🔬 Go to Live Processing", type="primary", use_container_width=True):
549
+ st.switch_page("pages/1_🔬_Live_Processing.py")
550
+
551
+ with col2:
552
+ # Show RAG stats
553
+ stats = get_store_stats()
554
+ st.markdown("### RAG Index Status")
555
+ st.metric("Total Indexed Chunks", stats.get("total_chunks", 0))
556
+
557
+ if rag_docs:
558
+ st.markdown("**Indexed Documents:**")
559
+ for doc in rag_docs[:5]:
560
+ doc_id = doc.get("document_id", "unknown")
561
+ chunks = doc.get("chunk_count", 0)
562
+ st.caption(f"• {doc_id[:30]}... ({chunks} chunks)")
563
+
564
+ if len(rag_docs) > 5:
565
+ st.caption(f"... and {len(rag_docs) - 5} more")
demo/rag_config.py ADDED
@@ -0,0 +1,396 @@
1
+ """
2
+ Unified RAG Configuration for SPARKNET Demo
3
+
4
+ This module provides a single source of truth for RAG system configuration,
5
+ ensuring all demo pages use the same vector store, embeddings, and models.
6
+ """
7
+
8
+ import streamlit as st
9
+ from pathlib import Path
10
+ import sys
11
+
12
+ PROJECT_ROOT = Path(__file__).parent.parent
13
+ sys.path.insert(0, str(PROJECT_ROOT))
14
+
15
+ # Configuration constants
16
+ OLLAMA_BASE_URL = "http://localhost:11434"
17
+ VECTOR_STORE_PATH = "data/sparknet_unified_rag"
18
+ COLLECTION_NAME = "sparknet_documents"
19
+
20
+ # Model preferences (in order of preference)
21
+ EMBEDDING_MODELS = ["nomic-embed-text", "mxbai-embed-large:latest", "mxbai-embed-large"]
22
+ LLM_MODELS = ["llama3.2:latest", "llama3.1:8b", "mistral:latest", "qwen2.5:14b", "qwen2.5:32b"]
23
+
24
+
25
+ def check_ollama():
26
+ """Check Ollama availability and get available models."""
27
+ try:
28
+ import httpx
29
+ with httpx.Client(timeout=5.0) as client:
30
+ resp = client.get(f"{OLLAMA_BASE_URL}/api/tags")
31
+ if resp.status_code == 200:
32
+ models = [m["name"] for m in resp.json().get("models", [])]
33
+ return True, models
34
+ except Exception:
35
+ pass
36
+ return False, []
37
+
38
+
39
+ def select_model(available_models: list, preferred_models: list) -> str:
40
+ """Select the best available model from preferences."""
41
+ for model in preferred_models:
42
+ if model in available_models:
43
+ return model
44
+ # Return first preference as fallback
45
+ return preferred_models[0] if preferred_models else "llama3.2:latest"
46
+
47
+
48
+ @st.cache_resource
49
+ def get_unified_rag_system():
50
+ """
51
+ Initialize and return the unified RAG system.
52
+
53
+ This is cached at the Streamlit level so all pages share the same instance.
54
+ """
55
+ try:
56
+ from src.rag.agentic import AgenticRAG, RAGConfig
57
+ from src.rag.store import get_vector_store, VectorStoreConfig, reset_vector_store
58
+ from src.rag.embeddings import get_embedding_adapter, EmbeddingConfig, reset_embedding_adapter
59
+
60
+ # Check Ollama
61
+ ollama_ok, available_models = check_ollama()
62
+ if not ollama_ok:
63
+ return {
64
+ "status": "error",
65
+ "error": "Ollama is not running. Please start Ollama first.",
66
+ "rag": None,
67
+ "store": None,
68
+ "embedder": None,
69
+ }
70
+
71
+ # Select models
72
+ embed_model = select_model(available_models, EMBEDDING_MODELS)
73
+ llm_model = select_model(available_models, LLM_MODELS)
74
+
75
+ # Reset singletons to ensure fresh config
76
+ reset_vector_store()
77
+ reset_embedding_adapter()
78
+
79
+ # Initialize embedding adapter
80
+ embed_config = EmbeddingConfig(
81
+ ollama_model=embed_model,
82
+ ollama_base_url=OLLAMA_BASE_URL,
83
+ )
84
+ embedder = get_embedding_adapter(config=embed_config)
85
+
86
+ # Initialize vector store
87
+ store_config = VectorStoreConfig(
88
+ persist_directory=VECTOR_STORE_PATH,
89
+ collection_name=COLLECTION_NAME,
90
+ similarity_threshold=0.0, # No threshold - let reranker handle filtering
91
+ )
92
+ store = get_vector_store(config=store_config)
93
+
94
+ # Initialize RAG config
95
+ rag_config = RAGConfig(
96
+ model=llm_model,
97
+ base_url=OLLAMA_BASE_URL,
98
+ max_revision_attempts=1,
99
+ enable_query_planning=True,
100
+ enable_reranking=True,
101
+ enable_validation=True,
102
+ retrieval_top_k=10,
103
+ final_top_k=5,
104
+ min_confidence=0.3,
105
+ verbose=False,
106
+ )
107
+
108
+ # Initialize RAG system
109
+ rag = AgenticRAG(
110
+ config=rag_config,
111
+ vector_store=store,
112
+ embedding_adapter=embedder,
113
+ )
114
+
115
+ return {
116
+ "status": "ready",
117
+ "error": None,
118
+ "rag": rag,
119
+ "store": store,
120
+ "embedder": embedder,
121
+ "embed_model": embed_model,
122
+ "llm_model": llm_model,
123
+ "available_models": available_models,
124
+ }
125
+
126
+ except Exception as e:
127
+ import traceback
128
+ return {
129
+ "status": "error",
130
+ "error": f"{str(e)}\n{traceback.format_exc()}",
131
+ "rag": None,
132
+ "store": None,
133
+ "embedder": None,
134
+ }
135
+
136
+
137
+ def get_store_stats():
138
+ """Get current vector store statistics."""
139
+ system = get_unified_rag_system()
140
+ if system["status"] != "ready":
141
+ return {"total_chunks": 0, "status": "error"}
142
+
143
+ try:
144
+ return {
145
+ "total_chunks": system["store"].count(),
146
+ "status": "ready",
147
+ "embed_model": system.get("embed_model", "unknown"),
148
+ "llm_model": system.get("llm_model", "unknown"),
149
+ }
150
+ except Exception:
151
+ return {"total_chunks": 0, "status": "error"}
152
+
153
+
154
+ def index_document(text: str, document_id: str, metadata: dict = None) -> dict:
155
+ """Index a document into the unified RAG system."""
156
+ system = get_unified_rag_system()
157
+ if system["status"] != "ready":
158
+ return {"success": False, "error": system["error"], "num_chunks": 0}
159
+
160
+ try:
161
+ num_chunks = system["rag"].index_text(
162
+ text=text,
163
+ document_id=document_id,
164
+ metadata=metadata or {},
165
+ )
166
+ return {"success": True, "num_chunks": num_chunks, "error": None}
167
+ except Exception as e:
168
+ return {"success": False, "error": str(e), "num_chunks": 0}
169
+
170
+
171
+ def query_rag(question: str, filters: dict = None):
172
+ """Query the unified RAG system."""
173
+ system = get_unified_rag_system()
174
+ if system["status"] != "ready":
175
+ return None, system["error"]
176
+
177
+ try:
178
+ response = system["rag"].query(question, filters=filters)
179
+ return response, None
180
+ except Exception as e:
181
+ return None, str(e)
182
+
183
+
184
+ def clear_index():
185
+ """Clear the vector store index."""
186
+ # Force reinitialization by clearing cache
187
+ get_unified_rag_system.clear()
188
+ return True
189
+
190
+
191
+ def get_indexed_documents() -> list:
192
+ """Get list of indexed document IDs from vector store."""
193
+ system = get_unified_rag_system()
194
+ if system["status"] != "ready":
195
+ return []
196
+
197
+ try:
198
+ # Query ChromaDB for unique document IDs
199
+ store = system["store"]
200
+ collection = store._collection
201
+
202
+ # Get all metadata to extract unique document_ids
203
+ results = collection.get(include=["metadatas"])
204
+ if not results or not results.get("metadatas"):
205
+ return []
206
+
207
+ doc_ids = set()
208
+ doc_info = {}
209
+ for meta in results["metadatas"]:
210
+ doc_id = meta.get("document_id", "unknown")
211
+ if doc_id not in doc_info:
212
+ doc_info[doc_id] = {
213
+ "document_id": doc_id,
214
+ "source_path": meta.get("source_path", ""),
215
+ "chunk_count": 0,
216
+ }
217
+ doc_info[doc_id]["chunk_count"] += 1
218
+
219
+ return list(doc_info.values())
220
+ except Exception as e:
221
+ return []
222
+
223
+
224
+ def get_chunks_for_document(document_id: str) -> list:
225
+ """Get all chunks for a specific document."""
226
+ system = get_unified_rag_system()
227
+ if system["status"] != "ready":
228
+ return []
229
+
230
+ try:
231
+ store = system["store"]
232
+ collection = store._collection
233
+
234
+ # Query for chunks with this document_id
235
+ results = collection.get(
236
+ where={"document_id": document_id},
237
+ include=["documents", "metadatas"]
238
+ )
239
+
240
+ if not results or not results.get("ids"):
241
+ return []
242
+
243
+ chunks = []
244
+ for i, chunk_id in enumerate(results["ids"]):
245
+ chunks.append({
246
+ "chunk_id": chunk_id,
247
+ "text": results["documents"][i] if results.get("documents") else "",
248
+ "metadata": results["metadatas"][i] if results.get("metadatas") else {},
249
+ })
250
+
251
+ return chunks
252
+ except Exception as e:
253
+ return []
254
+
255
+
256
+ def search_similar_chunks(query: str, top_k: int = 5, doc_filter: str = None):
257
+ """Search for similar chunks with optional document filter."""
258
+ system = get_unified_rag_system()
259
+ if system["status"] != "ready":
260
+ return []
261
+
262
+ try:
263
+ embedder = system["embedder"]
264
+ store = system["store"]
265
+
266
+ # Generate query embedding
267
+ query_embedding = embedder.embed_text(query)
268
+
269
+ # Build filter
270
+ filters = None
271
+ if doc_filter:
272
+ filters = {"document_id": doc_filter}
273
+
274
+ # Search
275
+ results = store.search(
276
+ query_embedding=query_embedding,
277
+ top_k=top_k,
278
+ filters=filters,
279
+ )
280
+
281
+ return [
282
+ {
283
+ "chunk_id": r.chunk_id,
284
+ "document_id": r.document_id,
285
+ "text": r.text,
286
+ "similarity": r.similarity,
287
+ "page": r.page,
288
+ "metadata": r.metadata,
289
+ }
290
+ for r in results
291
+ ]
292
+ except Exception as e:
293
+ return []
294
+
295
+
296
+ def compute_document_similarity(doc_id_1: str, doc_id_2: str) -> dict:
297
+ """Compute semantic similarity between two documents."""
298
+ system = get_unified_rag_system()
299
+ if system["status"] != "ready":
300
+ return {"error": "RAG system not ready", "similarity": 0.0}
301
+
302
+ try:
303
+ # Get chunks for both documents
304
+ chunks_1 = get_chunks_for_document(doc_id_1)
305
+ chunks_2 = get_chunks_for_document(doc_id_2)
306
+
307
+ if not chunks_1 or not chunks_2:
308
+ return {"error": "One or both documents not found", "similarity": 0.0}
309
+
310
+ embedder = system["embedder"]
311
+
312
+ # Compute average embeddings for each document
313
+ def avg_embedding(chunks):
314
+ embeddings = []
315
+ for chunk in chunks[:10]: # Limit to first 10 chunks
316
+ emb = embedder.embed_text(chunk["text"])
317
+ embeddings.append(emb)
318
+ if not embeddings:
319
+ return None
320
+ # Average
321
+ import numpy as np
322
+ return np.mean(embeddings, axis=0).tolist()
323
+
324
+ emb1 = avg_embedding(chunks_1)
325
+ emb2 = avg_embedding(chunks_2)
326
+
327
+ if emb1 is None or emb2 is None:
328
+ return {"error": "Could not compute embeddings", "similarity": 0.0}
329
+
330
+ # Compute cosine similarity
331
+ import numpy as np
332
+ emb1 = np.array(emb1)
333
+ emb2 = np.array(emb2)
334
+ similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
335
+
336
+ return {
337
+ "similarity": float(similarity),
338
+ "doc1_chunks": len(chunks_1),
339
+ "doc2_chunks": len(chunks_2),
340
+ "error": None,
341
+ }
342
+ except Exception as e:
343
+ return {"error": str(e), "similarity": 0.0}
344
+
345
+
346
+ def auto_index_processed_document(doc_id: str, text: str, chunks: list, metadata: dict = None):
347
+ """
348
+ Auto-index a processed document with pre-computed chunks.
349
+
350
+ This is called after document processing completes to immediately
351
+ make the document available in RAG.
352
+ """
353
+ system = get_unified_rag_system()
354
+ if system["status"] != "ready":
355
+ return {"success": False, "error": "RAG system not ready", "num_chunks": 0}
356
+
357
+ try:
358
+ store = system["store"]
359
+ embedder = system["embedder"]
360
+
361
+ # Prepare chunks for indexing
362
+ chunk_dicts = []
363
+ embeddings = []
364
+
365
+ for i, chunk in enumerate(chunks):
366
+ chunk_text = chunk.get("text", chunk) if isinstance(chunk, dict) else chunk
367
+
368
+ if len(chunk_text.strip()) < 20:
369
+ continue
370
+
371
+ chunk_id = f"{doc_id}_chunk_{i}"
372
+ chunk_dict = {
373
+ "chunk_id": chunk_id,
374
+ "document_id": doc_id,
375
+ "text": chunk_text,
376
+ "page": chunk.get("page", 0) if isinstance(chunk, dict) else 0,
377
+ "chunk_type": "text",
378
+ "source_path": metadata.get("filename", "") if metadata else "",
379
+ "sequence_index": i,
380
+ }
381
+ chunk_dicts.append(chunk_dict)
382
+
383
+ # Generate embedding
384
+ embedding = embedder.embed_text(chunk_text)
385
+ embeddings.append(embedding)
386
+
387
+ if not chunk_dicts:
388
+ return {"success": False, "error": "No valid chunks to index", "num_chunks": 0}
389
+
390
+ # Add to store
391
+ store.add_chunks(chunk_dicts, embeddings)
392
+
393
+ return {"success": True, "num_chunks": len(chunk_dicts), "error": None}
394
+
395
+ except Exception as e:
396
+ return {"success": False, "error": str(e), "num_chunks": 0}
demo/requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ # SPARKNET Demo Requirements
2
+ # Run: pip install -r demo/requirements.txt
3
+
4
+ # Streamlit
5
+ streamlit>=1.28.0
6
+
7
+ # Data handling
8
+ pandas>=2.0.0
9
+ numpy>=1.24.0
10
+
11
+ # HTTP client (for Ollama checks)
12
+ httpx>=0.25.0
13
+
14
+ # Image handling (optional, for advanced features)
15
+ Pillow>=10.0.0
16
+
17
+ # Charts (optional)
18
+ plotly>=5.18.0
19
+ altair>=5.2.0
demo/state_manager.py ADDED
@@ -0,0 +1,833 @@
1
+ """
2
+ Unified State Manager for SPARKNET Demo
3
+
4
+ Enhanced state management for cross-module communication (Phase 1B):
5
+ - Document processing state tracking
6
+ - Indexed documents registry
7
+ - Cross-module event system (pub/sub)
8
+ - Real-time status updates
9
+ - Evidence highlighting synchronization
10
+ - Document selection synchronization
11
+ - Query/response sharing between modules
12
+ """
13
+
14
+ import streamlit as st
15
+ from pathlib import Path
16
+ from typing import Dict, List, Any, Optional, Callable, Set
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime
19
+ from enum import Enum
20
+ import hashlib
21
+ import json
22
+ import sys
23
+ import time
24
+ from threading import Lock
25
+
26
+ PROJECT_ROOT = Path(__file__).parent.parent
27
+ sys.path.insert(0, str(PROJECT_ROOT))
28
+
29
+
30
+ # ==============================================================================
31
+ # Event System (Phase 1B Enhancement)
32
+ # ==============================================================================
33
+
34
+ class EventType(str, Enum):
35
+ """Cross-module event types for synchronization."""
36
+ DOCUMENT_SELECTED = "document_selected"
37
+ DOCUMENT_PROCESSED = "document_processed"
38
+ DOCUMENT_INDEXED = "document_indexed"
39
+ DOCUMENT_REMOVED = "document_removed"
40
+ CHUNK_SELECTED = "chunk_selected"
41
+ EVIDENCE_HIGHLIGHT = "evidence_highlight"
42
+ RAG_QUERY_STARTED = "rag_query_started"
43
+ RAG_QUERY_COMPLETED = "rag_query_completed"
44
+ PAGE_CHANGED = "page_changed"
45
+ PROCESSING_STARTED = "processing_started"
46
+ PROCESSING_COMPLETED = "processing_completed"
47
+ SYSTEM_STATUS_CHANGED = "system_status_changed"
48
+
49
+
50
+ @dataclass
51
+ class Event:
52
+ """Cross-module event for synchronization."""
53
+ event_type: EventType
54
+ source_module: str
55
+ payload: Dict[str, Any]
56
+ timestamp: datetime = field(default_factory=datetime.now)
57
+ event_id: str = field(default_factory=lambda: hashlib.md5(
58
+ f"{time.time()}".encode()
59
+ ).hexdigest()[:8])
60
+
61
+
62
+ @dataclass
63
+ class EvidenceHighlight:
64
+ """Evidence highlight for cross-module visualization."""
65
+ doc_id: str
66
+ chunk_id: str
67
+ page: int
68
+ bbox: tuple # (x_min, y_min, x_max, y_max)
69
+ text_snippet: str
70
+ confidence: float
71
+ source_query: Optional[str] = None
72
+ highlight_color: str = "#FFE082" # Amber highlight
73
+
74
+
75
+ @dataclass
76
+ class ProcessedDocument:
77
+ """Represents a processed document with all extracted data."""
78
+ doc_id: str
79
+ filename: str
80
+ file_type: str
81
+ raw_text: str
82
+ chunks: List[Dict[str, Any]]
83
+ page_count: int = 1
84
+ page_images: List[str] = field(default_factory=list)  # base64-encoded PNGs, as the viewer pages render them
85
+ ocr_regions: List[Dict[str, Any]] = field(default_factory=list)
86
+ layout_data: Dict[str, Any] = field(default_factory=dict)
87
+ metadata: Dict[str, Any] = field(default_factory=dict)
88
+ indexed: bool = False
89
+ indexed_chunks: int = 0
90
+ processing_time: float = 0.0
91
+ created_at: datetime = field(default_factory=datetime.now)
92
+
93
+ def to_dict(self) -> Dict[str, Any]:
94
+ return {
95
+ "doc_id": self.doc_id,
96
+ "filename": self.filename,
97
+ "file_type": self.file_type,
98
+ "text_length": len(self.raw_text),
99
+ "chunk_count": len(self.chunks),
100
+ "page_count": self.page_count,
101
+ "ocr_region_count": len(self.ocr_regions),
102
+ "indexed": self.indexed,
103
+ "indexed_chunks": self.indexed_chunks,
104
+ "processing_time": self.processing_time,
105
+ "created_at": self.created_at.isoformat(),
106
+ }
107
+
108
+
109
+ @dataclass
110
+ class ProcessingStatus:
111
+ """Tracks processing status for a document."""
112
+ doc_id: str
113
+ stage: str # loading, ocr, chunking, embedding, indexing, complete, error
114
+ progress: float # 0.0 - 1.0
115
+ message: str
116
+ started_at: datetime = field(default_factory=datetime.now)
117
+ completed_at: Optional[datetime] = None
118
+ error: Optional[str] = None
119
+
120
+
121
+ class UnifiedStateManager:
122
+ """
123
+ Central state manager for SPARKNET demo.
124
+
125
+ Enhanced with Phase 1B features:
126
+ - Document processing state tracking
127
+ - Indexed documents registry
128
+ - Cross-module event system (pub/sub)
129
+ - Real-time status updates
130
+ - Evidence highlighting sync
131
+ - Query/response sharing
132
+ """
133
+
134
+ def __init__(self):
135
+ self._ensure_session_state()
136
+ self._event_handlers: Dict[EventType, List[Callable]] = {}
137
+
138
+ def _ensure_session_state(self):
139
+ """Initialize session state if not exists."""
140
+ if "unified_state" not in st.session_state:
141
+ st.session_state.unified_state = {
142
+ "documents": {}, # doc_id -> ProcessedDocument
143
+ "processing_status": {}, # doc_id -> ProcessingStatus
144
+ "indexed_doc_ids": set(),
145
+ "active_doc_id": None,
146
+ "active_page": 0,
147
+ "active_chunk_id": None,
148
+ "notifications": [],
149
+ "rag_ready": False,
150
+ "total_indexed_chunks": 0,
151
+ "last_update": datetime.now().isoformat(),
152
+ # Phase 1B: Cross-module sync
153
+ "event_queue": [], # List of Event objects
154
+ "evidence_highlights": [], # List of EvidenceHighlight
155
+ "last_rag_query": None,
156
+ "last_rag_response": None,
157
+ "selected_sources": [], # Source chunks from RAG
158
+ "module_states": {}, # Per-module custom state
159
+ "sync_version": 0, # Increment on any state change
160
+ }
161
+
162
+ @property
163
+ def state(self) -> Dict:
164
+ """Get the unified state dict."""
165
+ self._ensure_session_state()
166
+ return st.session_state.unified_state
167
+
168
+ # ==================== Document Management ====================
169
+
170
+ def add_document(self, doc: ProcessedDocument) -> str:
171
+ """Add a processed document to the state."""
172
+ self.state["documents"][doc.doc_id] = doc
173
+ self._notify(f"Document '{doc.filename}' added", "info")
174
+ self._update_timestamp()
175
+ return doc.doc_id
176
+
177
+ def get_document(self, doc_id: str) -> Optional[ProcessedDocument]:
178
+ """Get a document by ID."""
179
+ return self.state["documents"].get(doc_id)
180
+
181
+ def get_all_documents(self) -> List[ProcessedDocument]:
182
+ """Get all documents."""
183
+ return list(self.state["documents"].values())
184
+
185
+ def get_indexed_documents(self) -> List[ProcessedDocument]:
186
+ """Get only indexed documents."""
187
+ return [d for d in self.state["documents"].values() if d.indexed]
188
+
189
+ def remove_document(self, doc_id: str):
190
+ """Remove a document from state."""
191
+ if doc_id in self.state["documents"]:
192
+ doc = self.state["documents"].pop(doc_id)
193
+ self.state["indexed_doc_ids"].discard(doc_id)
194
+ self._notify(f"Document '{doc.filename}' removed", "warning")
195
+ self._update_timestamp()
196
+
197
+ def set_active_document(self, doc_id: Optional[str]):
198
+ """Set the currently active document."""
199
+ self.state["active_doc_id"] = doc_id
200
+ self._update_timestamp()
201
+
202
+ def get_active_document(self) -> Optional[ProcessedDocument]:
203
+ """Get the currently active document."""
204
+ if self.state["active_doc_id"]:
205
+ return self.get_document(self.state["active_doc_id"])
206
+ return None
207
+
208
+ # ==================== Processing Status ====================
209
+
210
+ def start_processing(self, doc_id: str, filename: str):
211
+ """Start processing a document."""
212
+ status = ProcessingStatus(
213
+ doc_id=doc_id,
214
+ stage="loading",
215
+ progress=0.0,
216
+ message=f"Loading {filename}..."
217
+ )
218
+ self.state["processing_status"][doc_id] = status
219
+ self._update_timestamp()
220
+
221
+ def update_processing(self, doc_id: str, stage: str, progress: float, message: str):
222
+ """Update processing status."""
223
+ if doc_id in self.state["processing_status"]:
224
+ status = self.state["processing_status"][doc_id]
225
+ status.stage = stage
226
+ status.progress = progress
227
+ status.message = message
228
+ self._update_timestamp()
229
+
230
+ def complete_processing(self, doc_id: str, success: bool = True, error: Optional[str] = None):
231
+ """Mark processing as complete."""
232
+ if doc_id in self.state["processing_status"]:
233
+ status = self.state["processing_status"][doc_id]
234
+ status.stage = "complete" if success else "error"
235
+ status.progress = 1.0 if success else status.progress
236
+ status.completed_at = datetime.now()
237
+ status.error = error
238
+ status.message = "Processing complete!" if success else f"Error: {error}"
239
+
240
+ if success:
241
+ self._notify("Document processed successfully!", "success")
242
+ else:
243
+ self._notify(f"Processing failed: {error}", "error")
244
+
245
+ self._update_timestamp()
246
+
247
+ def get_processing_status(self, doc_id: str) -> Optional[ProcessingStatus]:
248
+ """Get processing status for a document."""
249
+ return self.state["processing_status"].get(doc_id)
250
+
251
+ def is_processing(self, doc_id: str) -> bool:
252
+ """Check if document is being processed."""
253
+ status = self.get_processing_status(doc_id)
254
+ return status is not None and status.stage not in ["complete", "error"]
255
+
256
+ # ==================== Indexing ====================
257
+
258
+ def mark_indexed(self, doc_id: str, chunk_count: int):
259
+ """Mark a document as indexed to RAG."""
260
+ if doc_id in self.state["documents"]:
261
+ doc = self.state["documents"][doc_id]
262
+ doc.indexed = True
263
+ doc.indexed_chunks = chunk_count
264
+ self.state["indexed_doc_ids"].add(doc_id)
265
+ self.state["total_indexed_chunks"] += chunk_count
266
+ self._notify(f"Indexed {chunk_count} chunks from '{doc.filename}'", "success")
267
+ self._update_timestamp()
268
+
269
+ def is_indexed(self, doc_id: str) -> bool:
270
+ """Check if document is indexed."""
271
+ return doc_id in self.state["indexed_doc_ids"]
272
+
273
+ def get_total_indexed_chunks(self) -> int:
274
+ """Get total number of indexed chunks."""
275
+ return self.state["total_indexed_chunks"]
276
+
277
+ # ==================== Notifications ====================
278
+
279
+ def _notify(self, message: str, level: str = "info"):
280
+ """Add a notification."""
281
+ self.state["notifications"].append({
282
+ "message": message,
283
+ "level": level,
284
+ "timestamp": datetime.now().isoformat(),
285
+ })
286
+ # Keep only last 50 notifications
287
+ if len(self.state["notifications"]) > 50:
288
+ self.state["notifications"] = self.state["notifications"][-50:]
289
+
290
+ def get_notifications(self, limit: int = 10) -> List[Dict]:
291
+ """Get recent notifications."""
292
+ return self.state["notifications"][-limit:]
293
+
294
+ def clear_notifications(self):
295
+ """Clear all notifications."""
296
+ self.state["notifications"] = []
297
+
298
+ # ==================== RAG Status ====================
299
+
300
+ def set_rag_ready(self, ready: bool):
301
+ """Set RAG system ready status."""
302
+ self.state["rag_ready"] = ready
303
+ self._update_timestamp()
304
+
305
+ def is_rag_ready(self) -> bool:
306
+ """Check if RAG is ready."""
307
+ return self.state["rag_ready"]
308
+
309
+ # ==================== Utilities ====================
310
+
311
+ def _update_timestamp(self):
312
+ """Update the last update timestamp."""
313
+ self.state["last_update"] = datetime.now().isoformat()
314
+ self.state["sync_version"] += 1
315
+
316
+ def get_summary(self) -> Dict[str, Any]:
317
+ """Get a summary of current state."""
318
+ return {
319
+ "total_documents": len(self.state["documents"]),
320
+ "indexed_documents": len(self.state["indexed_doc_ids"]),
321
+ "total_indexed_chunks": self.state["total_indexed_chunks"],
322
+ "active_doc_id": self.state["active_doc_id"],
323
+ "active_page": self.state.get("active_page", 0),
324
+ "rag_ready": self.state["rag_ready"],
325
+ "last_update": self.state["last_update"],
326
+ "sync_version": self.state.get("sync_version", 0),
327
+ "processing_count": sum(
328
+ 1 for s in self.state["processing_status"].values()
329
+ if s.stage not in ["complete", "error"]
330
+ ),
331
+ "evidence_count": len(self.state.get("evidence_highlights", [])),
332
+ }
333
+
334
+ def reset(self):
335
+ """Reset all state."""
336
+ st.session_state.unified_state = {
337
+ "documents": {},
338
+ "processing_status": {},
339
+ "indexed_doc_ids": set(),
340
+ "active_doc_id": None,
341
+ "active_page": 0,
342
+ "active_chunk_id": None,
343
+ "notifications": [],
344
+ "rag_ready": False,
345
+ "total_indexed_chunks": 0,
346
+ "last_update": datetime.now().isoformat(),
347
+ "event_queue": [],
348
+ "evidence_highlights": [],
349
+ "last_rag_query": None,
350
+ "last_rag_response": None,
351
+ "selected_sources": [],
352
+ "module_states": {},
353
+ "sync_version": 0,
354
+ }
355
+
356
+ # ==================== Event System (Phase 1B) ====================
357
+
358
+ def publish_event(
359
+ self,
360
+ event_type: EventType,
361
+ source_module: str,
362
+ payload: Dict[str, Any]
363
+ ) -> Event:
364
+ """
365
+ Publish an event for cross-module synchronization.
366
+
367
+ Args:
368
+ event_type: Type of event
369
+ source_module: Name of module publishing the event
370
+ payload: Event data
371
+
372
+ Returns:
373
+ The created Event object
374
+ """
375
+ event = Event(
376
+ event_type=event_type,
377
+ source_module=source_module,
378
+ payload=payload
379
+ )
380
+
381
+ # Add to event queue
382
+ self.state["event_queue"].append(event)
383
+
384
+ # Keep only last 100 events
385
+ if len(self.state["event_queue"]) > 100:
386
+ self.state["event_queue"] = self.state["event_queue"][-100:]
387
+
388
+ # Call registered handlers
389
+ if event_type in self._event_handlers:
390
+ for handler in self._event_handlers[event_type]:
391
+ try:
392
+ handler(event)
393
+ except Exception as e:
394
+ self._notify(f"Event handler error: {e}", "error")
395
+
396
+ self._update_timestamp()
397
+ return event
398
+
399
+ def subscribe(self, event_type: EventType, handler: Callable[[Event], None]):
400
+ """
401
+ Subscribe to an event type.
402
+
403
+ Args:
404
+ event_type: Type of event to subscribe to
405
+ handler: Callback function to handle the event
406
+ """
407
+ if event_type not in self._event_handlers:
408
+ self._event_handlers[event_type] = []
409
+ self._event_handlers[event_type].append(handler)
410
+
411
+ def unsubscribe(self, event_type: EventType, handler: Callable[[Event], None]):
412
+ """Unsubscribe from an event type."""
413
+ if event_type in self._event_handlers:
414
+ self._event_handlers[event_type] = [
415
+ h for h in self._event_handlers[event_type] if h != handler
416
+ ]
417
+
418
+ def get_recent_events(
419
+ self,
420
+ event_type: Optional[EventType] = None,
421
+ limit: int = 10
422
+ ) -> List[Event]:
423
+ """Get recent events, optionally filtered by type."""
424
+ events = self.state.get("event_queue", [])
425
+
426
+ if event_type:
427
+ events = [e for e in events if e.event_type == event_type]
428
+
429
+ return events[-limit:]
430
+
431
+ # ==================== Evidence Highlighting (Phase 1B) ====================
432
+
433
+ def add_evidence_highlight(self, highlight: EvidenceHighlight):
434
+ """
435
+ Add an evidence highlight for cross-module visualization.
436
+
437
+ Used when RAG finds relevant evidence that should be displayed
438
+ in the Document Viewer or Evidence Viewer.
439
+ """
440
+ self.state["evidence_highlights"].append(highlight)
441
+
442
+ # Publish event for other modules
443
+ self.publish_event(
444
+ EventType.EVIDENCE_HIGHLIGHT,
445
+ source_module="rag",
446
+ payload={
447
+ "doc_id": highlight.doc_id,
448
+ "chunk_id": highlight.chunk_id,
449
+ "page": highlight.page,
450
+ "bbox": highlight.bbox,
451
+ "text_snippet": highlight.text_snippet[:100],
452
+ }
453
+ )
454
+
455
+ self._update_timestamp()
456
+
457
+ def clear_evidence_highlights(self, doc_id: Optional[str] = None):
458
+ """Clear evidence highlights, optionally for a specific document."""
459
+ if doc_id:
460
+ self.state["evidence_highlights"] = [
461
+ h for h in self.state["evidence_highlights"]
462
+ if h.doc_id != doc_id
463
+ ]
464
+ else:
465
+ self.state["evidence_highlights"] = []
466
+
467
+ self._update_timestamp()
468
+
469
+ def get_evidence_highlights(
470
+ self,
471
+ doc_id: Optional[str] = None,
472
+ page: Optional[int] = None
473
+ ) -> List[EvidenceHighlight]:
474
+ """Get evidence highlights, optionally filtered by doc_id and page."""
475
+ highlights = self.state.get("evidence_highlights", [])
476
+
477
+ if doc_id:
478
+ highlights = [h for h in highlights if h.doc_id == doc_id]
479
+
480
+ if page is not None:
481
+ highlights = [h for h in highlights if h.page == page]
482
+
483
+ return highlights
484
+
485
+ # ==================== Page/Chunk Selection (Phase 1B) ====================
486
+
487
+ def select_page(self, page: int, source_module: str = "unknown"):
488
+ """
489
+ Set the active page and notify other modules.
490
+
491
+ Used for synchronized scrolling between Document Viewer and Evidence Viewer.
492
+ """
493
+ old_page = self.state.get("active_page", 0)
494
+ self.state["active_page"] = page
495
+
496
+ if old_page != page:
497
+ self.publish_event(
498
+ EventType.PAGE_CHANGED,
499
+ source_module=source_module,
500
+ payload={"page": page, "previous_page": old_page}
501
+ )
502
+
503
+ def get_active_page(self) -> int:
504
+ """Get the currently active page."""
505
+ return self.state.get("active_page", 0)
506
+
507
+ def select_chunk(
508
+ self,
509
+ chunk_id: str,
510
+ doc_id: str,
511
+ source_module: str = "unknown"
512
+ ):
513
+ """
514
+ Select a chunk and navigate to its location.
515
+
516
+ Publishes event to trigger synchronized navigation.
517
+ """
518
+ self.state["active_chunk_id"] = chunk_id
519
+
520
+ # Get chunk details to navigate
521
+ doc = self.get_document(doc_id)
522
+ if doc:
523
+ for chunk in doc.chunks:
524
+ if chunk.get("chunk_id") == chunk_id:
525
+ page = chunk.get("page", 0)
526
+ self.select_page(page, source_module)
527
+
528
+ self.publish_event(
529
+ EventType.CHUNK_SELECTED,
530
+ source_module=source_module,
531
+ payload={
532
+ "chunk_id": chunk_id,
533
+ "doc_id": doc_id,
534
+ "page": page,
535
+ "bbox": chunk.get("bbox"),
536
+ }
537
+ )
538
+ break
539
+
540
+ def get_active_chunk_id(self) -> Optional[str]:
541
+ """Get the currently selected chunk ID."""
542
+ return self.state.get("active_chunk_id")
543
+
544
+ # ==================== RAG Query Sync (Phase 1B) ====================
545
+
546
+ def store_rag_query(
547
+ self,
548
+ query: str,
549
+ response: Dict[str, Any],
550
+ sources: List[Dict[str, Any]]
551
+ ):
552
+ """
553
+ Store the last RAG query and response for cross-module access.
554
+
555
+ Allows Evidence Viewer to display sources from Interactive RAG.
556
+ """
557
+ self.state["last_rag_query"] = query
558
+ self.state["last_rag_response"] = response
559
+ self.state["selected_sources"] = sources
560
+
561
+ # Clear old highlights and add new ones from sources
562
+ self.clear_evidence_highlights()
563
+
564
+ for source in sources:
565
+ if all(k in source for k in ["doc_id", "chunk_id", "page"]):
566
+ bbox = source.get("bbox", (0, 0, 1, 1))
567
+ if isinstance(bbox, dict):
568
+ bbox = (bbox.get("x_min", 0), bbox.get("y_min", 0),
569
+ bbox.get("x_max", 1), bbox.get("y_max", 1))
570
+
571
+ highlight = EvidenceHighlight(
572
+ doc_id=source["doc_id"],
573
+ chunk_id=source["chunk_id"],
574
+ page=source["page"],
575
+ bbox=bbox,
576
+ text_snippet=source.get("text", "")[:200],
577
+ confidence=source.get("score", 0.0),
578
+ source_query=query,
579
+ )
580
+ self.add_evidence_highlight(highlight)
581
+
582
+ self.publish_event(
583
+ EventType.RAG_QUERY_COMPLETED,
584
+ source_module="rag",
585
+ payload={
586
+ "query": query,
587
+ "source_count": len(sources),
588
+ "response_length": len(str(response)),
589
+ }
590
+ )
591
+
592
+ self._update_timestamp()
593
+
594
+ def get_last_rag_query(self) -> Optional[str]:
595
+ """Get the last RAG query."""
596
+ return self.state.get("last_rag_query")
597
+
598
+ def get_last_rag_response(self) -> Optional[Dict[str, Any]]:
599
+ """Get the last RAG response."""
600
+ return self.state.get("last_rag_response")
601
+
602
+ def get_selected_sources(self) -> List[Dict[str, Any]]:
603
+ """Get the sources from the last RAG query."""
604
+ return self.state.get("selected_sources", [])
605
+
606
+ # ==================== Module State (Phase 1B) ====================
607
+
608
+ def set_module_state(self, module_name: str, state: Dict[str, Any]):
609
+ """
610
+ Store custom state for a specific module.
611
+
612
+ Allows modules to persist their own state across reruns.
613
+ """
614
+ self.state["module_states"][module_name] = {
615
+ **state,
616
+ "updated_at": datetime.now().isoformat()
617
+ }
618
+
619
+ def get_module_state(self, module_name: str) -> Dict[str, Any]:
620
+ """Get custom state for a specific module."""
621
+ return self.state.get("module_states", {}).get(module_name, {})
622
+
623
+ def get_sync_version(self) -> int:
624
+ """
625
+ Get the current sync version.
626
+
627
+ Modules can use this to detect if state has changed since last check.
628
+ """
629
+ return self.state.get("sync_version", 0)
630
+
631
+
632
+ def generate_doc_id(filename: str, content_hash: str = None) -> str:
633
+ """Generate a unique document ID."""
634
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
635
+ base = f"{filename}_{timestamp}"
636
+ if content_hash:
637
+ base = f"{base}_{content_hash[:8]}"
638
+ return hashlib.md5(base.encode()).hexdigest()[:12]
639
+
640
+
641
+ def get_state_manager() -> UnifiedStateManager:
642
+ """Get or create the unified state manager."""
643
+ if "state_manager_instance" not in st.session_state:
644
+ st.session_state.state_manager_instance = UnifiedStateManager()
645
+ return st.session_state.state_manager_instance
646
+
647
+
648
+ # ==================== Global Status Bar Component ====================
649
+
650
+ def render_global_status_bar():
651
+ """Render a global status bar showing system state."""
652
+ manager = get_state_manager()
653
+ summary = manager.get_summary()
654
+
655
+ # Import RAG config for additional status
656
+ try:
657
+ from rag_config import get_unified_rag_system, check_ollama
658
+ rag_system = get_unified_rag_system()
659
+ ollama_ok, models = check_ollama()
660
+ rag_status = rag_system["status"]
661
+ llm_model = rag_system.get("llm_model", "N/A")
662
+ except Exception:  # avoid bare except, which would also swallow SystemExit/KeyboardInterrupt
663
+ ollama_ok = False
664
+ rag_status = "error"
665
+ llm_model = "N/A"
666
+ models = []
667
+
668
+ # Status bar
669
+ cols = st.columns(6)
670
+
671
+ with cols[0]:
672
+ if ollama_ok:
673
+ st.success(f"Ollama ({len(models)})")
674
+ else:
675
+ st.error("Ollama Offline")
676
+
677
+ with cols[1]:
678
+ if rag_status == "ready":
679
+ st.success("RAG Ready")
680
+ else:
681
+ st.error("RAG Error")
682
+
683
+ with cols[2]:
684
+ st.info(llm_model.split(':')[0])
685
+
686
+ with cols[3]:
687
+ st.info(f"{summary['total_documents']} Docs")
688
+
689
+ with cols[4]:
690
+ if summary['indexed_documents'] > 0:
691
+ st.success(f"{summary['total_indexed_chunks']} Chunks")
692
+ else:
693
+ st.warning("0 Chunks")
694
+
695
+ with cols[5]:
696
+ if summary['processing_count'] > 0:
697
+ st.warning("Processing...")
698
+ else:
699
+ st.info("Idle")
700
+
701
+
702
+ def render_notifications():
703
+ """Render recent notifications."""
704
+ manager = get_state_manager()
705
+ notifications = manager.get_notifications(5)
706
+
707
+ if notifications:
708
+ for notif in reversed(notifications):
709
+ level = notif["level"]
710
+ msg = notif["message"]
711
+ if level == "success":
712
+ st.success(msg)
713
+ elif level == "error":
714
+ st.error(msg)
715
+ elif level == "warning":
716
+ st.warning(msg)
717
+ else:
718
+ st.info(msg)
719
+
720
+
721
+ # ==================== Helper Components (Phase 1B) ====================
722
+
723
+ def render_evidence_panel():
724
+ """
725
+ Render a panel showing current evidence highlights.
726
+
727
+ Can be used in any module to show sources from RAG queries.
728
+ """
729
+ manager = get_state_manager()
730
+ highlights = manager.get_evidence_highlights()
731
+
732
+ if not highlights:
733
+ st.info("No evidence highlights. Run a RAG query to see sources.")
734
+ return
735
+
736
+ st.subheader(f"Evidence Sources ({len(highlights)})")
737
+
738
+ for i, h in enumerate(highlights):
739
+ with st.expander(f"Source {i+1}: Page {h.page + 1} ({h.confidence:.0%})"):
740
+ st.markdown(f"**Document:** {h.doc_id}")
741
+ st.markdown(f"**Text:** {h.text_snippet}")
742
+
743
+ if h.source_query:
744
+ st.markdown(f"**Query:** _{h.source_query}_")
745
+
746
+ # Button to navigate to source
747
+ if st.button("View in Document", key=f"view_source_{i}"):
748
+ manager.set_active_document(h.doc_id)
749
+ manager.select_page(h.page, "evidence_panel")
750
+ manager.select_chunk(h.chunk_id, h.doc_id, "evidence_panel")
751
+ st.rerun()
752
+
753
+
754
+ def render_sync_status():
755
+ """Render sync status indicator for debugging."""
756
+ manager = get_state_manager()
757
+ summary = manager.get_summary()
758
+
759
+ with st.expander("Sync Status", expanded=False):
760
+ st.json({
761
+ "sync_version": summary["sync_version"],
762
+ "active_doc": summary["active_doc_id"],
763
+ "active_page": summary["active_page"],
764
+ "evidence_count": summary["evidence_count"],
765
+ "last_update": summary["last_update"],
766
+ })
767
+
768
+ # Recent events
769
+ events = manager.get_recent_events(limit=5)
770
+ if events:
771
+ st.subheader("Recent Events")
772
+ for event in reversed(events):
773
+ st.text(f"{event.event_type.value}: {event.source_module}")
774
+
775
+
776
+ def render_document_selector():
777
+ """
778
+ Render a document selector that syncs with state manager.
779
+
780
+ Returns the selected document ID.
781
+ """
782
+ manager = get_state_manager()
783
+ documents = manager.get_all_documents()
784
+
785
+ if not documents:
786
+ st.info("No documents uploaded. Upload a document to get started.")
787
+ return None
788
+
789
+ # Get current selection
790
+ active_doc_id = manager.state.get("active_doc_id")
791
+
792
+ # Create options
793
+ options = {doc.doc_id: f"{doc.filename} ({doc.indexed_chunks} chunks)" for doc in documents}
794
+ option_list = list(options.keys())
795
+
796
+ # Find current index
797
+ current_index = option_list.index(active_doc_id) if active_doc_id in option_list else 0
798
+
799
+ # Render selectbox
800
+ selected_id = st.selectbox(
801
+ "Select Document",
802
+ options=option_list,
803
+ format_func=lambda x: options[x],
804
+ index=current_index,
805
+ key="global_doc_selector"
806
+ )
807
+
808
+ # Update state if changed
809
+ if selected_id != active_doc_id:
810
+ manager.set_active_document(selected_id)
811
+ manager.publish_event(
812
+ EventType.DOCUMENT_SELECTED,
813
+ source_module="selector",
814
+ payload={"doc_id": selected_id}
815
+ )
816
+
817
+ return selected_id
818
+
819
+
820
+ def create_sync_callback(module_name: str) -> Callable:
821
+ """
822
+ Create a rerun callback for a module.
823
+
824
+ Returns a function that can be used as an event handler
825
+ to trigger Streamlit rerun when relevant events occur.
826
+ """
827
+ def callback(event: Event):
828
+ # Only rerun if event is from a different module
829
+ if event.source_module != module_name:
830
+ # Store that we need to rerun
831
+ st.session_state[f"_{module_name}_needs_rerun"] = True
832
+
833
+ return callback
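+
+
+ # --- Usage sketch (kept as a comment so importing this module stays
+ # side-effect free). It walks the pub/sub flow defined above; the module
+ # name "document_viewer" and the page numbers are illustrative.
+ #
+ # manager = get_state_manager()
+ #
+ # def on_page_changed(event: Event) -> None:
+ #     # React to navigation that originated in another module.
+ #     print(f"Viewer should jump to page {event.payload['page']}")
+ #
+ # manager.subscribe(EventType.PAGE_CHANGED, on_page_changed)
+ # manager.publish_event(
+ #     EventType.PAGE_CHANGED,
+ #     source_module="document_viewer",
+ #     payload={"page": 3, "previous_page": 2},
+ # )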
docker-compose.dev.yml ADDED
@@ -0,0 +1,66 @@
1
+ version: '3.8'
2
+
3
+ # SPARKNET Development Docker Compose
4
+ # Lighter configuration for local development
5
+
6
+ services:
7
+ sparknet-api:
8
+ build:
9
+ context: .
10
+ dockerfile: Dockerfile
11
+ target: development
12
+ container_name: sparknet-api-dev
13
+ ports:
14
+ - "8000:8000"
15
+ volumes:
16
+ - .:/app
17
+ - ./data:/app/data
18
+ - ./uploads:/app/uploads
19
+ - ./outputs:/app/outputs
20
+ environment:
21
+ - PYTHONPATH=/app
22
+ - OLLAMA_HOST=http://host.docker.internal:11434
23
+ - LOG_LEVEL=DEBUG
24
+ - SPARKNET_SECRET_KEY=dev-secret-key
25
+ extra_hosts:
26
+ - "host.docker.internal:host-gateway"
27
+ networks:
28
+ - sparknet-dev-network
29
+ restart: unless-stopped
30
+
31
+ sparknet-demo:
32
+ build:
33
+ context: .
34
+ dockerfile: Dockerfile
35
+ target: development
36
+ container_name: sparknet-demo-dev
37
+ command: ["streamlit", "run", "demo/app.py", "--server.address", "0.0.0.0", "--server.port", "4000", "--server.runOnSave", "true"]
38
+ ports:
39
+ - "4000:4000"
40
+ volumes:
41
+ - .:/app
42
+ - ./data:/app/data
43
+ - ./uploads:/app/uploads
44
+ environment:
45
+ - PYTHONPATH=/app
46
+ - OLLAMA_HOST=http://host.docker.internal:11434
47
+ - API_URL=http://sparknet-api:8000
48
+ extra_hosts:
49
+ - "host.docker.internal:host-gateway"
50
+ depends_on:
51
+ - sparknet-api
52
+ networks:
53
+ - sparknet-dev-network
54
+ restart: unless-stopped
55
+
56
+ redis:
57
+ image: redis:7-alpine
58
+ container_name: sparknet-redis-dev
59
+ ports:
60
+ - "6379:6379"
61
+ networks:
62
+ - sparknet-dev-network
63
+
64
+ networks:
65
+ sparknet-dev-network:
66
+ driver: bridge
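+
+ # Usage sketch (assumes the Docker Compose v2 CLI; adjust for docker-compose v1):
+ #   docker compose -f docker-compose.dev.yml up --build
+ #   docker compose -f docker-compose.dev.yml logs -f sparknet-api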
docker-compose.yml ADDED
@@ -0,0 +1,163 @@
1
+ version: '3.8'
2
+
3
+ # SPARKNET Docker Compose Configuration
4
+ # Full stack deployment with all services
5
+
6
+ services:
7
+ # ============== Main Application ==============
8
+ sparknet-api:
9
+ build:
10
+ context: .
11
+ dockerfile: Dockerfile
12
+ target: production
13
+ container_name: sparknet-api
14
+ ports:
15
+ - "8000:8000"
16
+ volumes:
17
+ - ./data:/app/data
18
+ - ./uploads:/app/uploads
19
+ - ./outputs:/app/outputs
20
+ - ./logs:/app/logs
21
+ environment:
22
+ - PYTHONPATH=/app
23
+ - OLLAMA_HOST=http://ollama:11434
24
+ - CHROMA_HOST=chromadb
25
+ - CHROMA_PORT=8000
26
+ - REDIS_URL=redis://redis:6379
27
+ - SPARKNET_SECRET_KEY=${SPARKNET_SECRET_KEY:-sparknet-docker-secret-key}
28
+ - LOG_LEVEL=INFO
29
+ depends_on:
30
+ ollama:
31
+ condition: service_healthy
32
+ chromadb:
33
+ condition: service_started
34
+ redis:
35
+ condition: service_healthy
36
+ networks:
37
+ - sparknet-network
38
+ restart: unless-stopped
39
+ healthcheck:
40
+ test: ["CMD", "curl", "-f", "http://localhost:8000/api/health"]
41
+ interval: 30s
42
+ timeout: 10s
43
+ retries: 3
44
+ start_period: 60s
45
+
46
+ sparknet-demo:
47
+ build:
48
+ context: .
49
+ dockerfile: Dockerfile
50
+ target: production
51
+ container_name: sparknet-demo
52
+ command: ["streamlit", "run", "demo/app.py", "--server.address", "0.0.0.0", "--server.port", "4000"]
53
+ ports:
54
+ - "4000:4000"
55
+ volumes:
56
+ - ./data:/app/data
57
+ - ./uploads:/app/uploads
58
+ - ./outputs:/app/outputs
59
+ environment:
60
+ - PYTHONPATH=/app
61
+ - OLLAMA_HOST=http://ollama:11434
62
+ - CHROMA_HOST=chromadb
63
+ - CHROMA_PORT=8000
64
+ - API_URL=http://sparknet-api:8000
65
+ depends_on:
66
+ - sparknet-api
67
+ networks:
68
+ - sparknet-network
69
+ restart: unless-stopped
70
+
71
+ # ============== Ollama LLM Service ==============
72
+ ollama:
73
+ image: ollama/ollama:latest
74
+ container_name: sparknet-ollama
75
+ ports:
76
+ - "11434:11434"
77
+ volumes:
78
+ - ollama_data:/root/.ollama
79
+ environment:
80
+ - OLLAMA_KEEP_ALIVE=24h
81
+ deploy:
82
+ resources:
83
+ reservations:
84
+ devices:
85
+ - driver: nvidia
86
+ count: all
87
+ capabilities: [gpu]
88
+ networks:
89
+ - sparknet-network
90
+ restart: unless-stopped
91
+ healthcheck:
92
+ test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
93
+ interval: 30s
94
+ timeout: 10s
95
+ retries: 5
96
+ start_period: 120s
97
+
98
+ # ============== ChromaDB Vector Store ==============
99
+ chromadb:
100
+ image: chromadb/chroma:latest
101
+ container_name: sparknet-chromadb
102
+ ports:
103
+ - "8001:8000"
104
+ volumes:
105
+ - chroma_data:/chroma/chroma
106
+ environment:
107
+ - IS_PERSISTENT=TRUE
108
+ - PERSIST_DIRECTORY=/chroma/chroma
109
+ - ANONYMIZED_TELEMETRY=FALSE
110
+ networks:
111
+ - sparknet-network
112
+ restart: unless-stopped
113
+
114
+ # ============== Redis Cache ==============
115
+ redis:
116
+ image: redis:7-alpine
117
+ container_name: sparknet-redis
118
+ ports:
119
+ - "6379:6379"
120
+ volumes:
121
+ - redis_data:/data
122
+ command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
123
+ networks:
124
+ - sparknet-network
125
+ restart: unless-stopped
126
+ healthcheck:
127
+ test: ["CMD", "redis-cli", "ping"]
128
+ interval: 10s
129
+ timeout: 5s
130
+ retries: 5
131
+
132
+ # ============== Nginx Reverse Proxy (Optional) ==============
133
+ nginx:
134
+ image: nginx:alpine
135
+ container_name: sparknet-nginx
136
+ ports:
137
+ - "80:80"
138
+ - "443:443"
139
+ volumes:
140
+ - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
141
+ - ./nginx/ssl:/etc/nginx/ssl:ro
142
+ depends_on:
143
+ - sparknet-api
144
+ - sparknet-demo
145
+ networks:
146
+ - sparknet-network
147
+ restart: unless-stopped
148
+ profiles:
149
+ - production
150
+
151
+ # ============== Volumes ==============
152
+ volumes:
153
+ ollama_data:
154
+ driver: local
155
+ chroma_data:
156
+ driver: local
157
+ redis_data:
158
+ driver: local
159
+
160
+ # ============== Networks ==============
161
+ networks:
162
+ sparknet-network:
163
+ driver: bridge
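+
+ # Usage sketch (assumes the Docker Compose v2 CLI):
+ #   docker compose up -d                        # core stack, nginx excluded
+ #   docker compose --profile production up -d   # also start the nginx proxy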
docs/CLOUD_ARCHITECTURE.md ADDED
@@ -0,0 +1,392 @@
1
+ # SPARKNET Cloud Architecture
2
+
3
+ This document outlines the cloud-ready architecture for deploying SPARKNET on AWS.
4
+
5
+ ## Overview
6
+
7
+ SPARKNET is designed with a modular architecture that supports both local development and cloud deployment. The system can scale from a single developer machine to enterprise-grade cloud infrastructure.
8
+
9
+ ## Local Development Stack
10
+
11
+ ```
12
+ ┌─────────────────────────────────────────────────────┐
13
+ │ Local Machine │
14
+ ├─────────────────────────────────────────────────────┤
15
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
16
+ │ │ Ollama │ │ ChromaDB │ │ File I/O │ │
17
+ │ │ (LLM) │ │ (Vector) │ │ (Storage) │ │
18
+ │ └─────────────┘ └─────────────┘ └─────────────┘ │
19
+ │ │ │ │ │
20
+ │ └───────────────┼───────────────┘ │
21
+ │ │ │
22
+ │ ┌────────┴────────┐ │
23
+ │ │ SPARKNET │ │
24
+ │ │ Application │ │
25
+ │ └─────────────────┘ │
26
+ └─────────────────────────────────────────────────────┘
27
+ ```
28
+
29
+ ## AWS Cloud Architecture
30
+
31
+ ### Target Architecture
32
+
33
+ ```
34
+ ┌────────────────────────────────────────────────────────────────────┐
35
+ │ AWS Cloud │
36
+ ├────────────────────────────────────────────────────────────────────┤
37
+ │ │
38
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
39
+ │ │ API GW │──────│ Lambda │──────│ Step Functions │ │
40
+ │ │ (REST) │ │ (Compute) │ │ (Orchestration) │ │
41
+ │ └─────────────┘ └─────────────┘ └─────────────────────┘ │
42
+ │ │ │ │ │
43
+ │ │ │ │ │
44
+ │ ▼ ▼ ▼ │
45
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
46
+ │ │ S3 │ │ Bedrock │ │ OpenSearch │ │
47
+ │ │ (Storage) │ │ (LLM) │ │ (Vector Store) │ │
48
+ │ └─────────────┘ └─────────────┘ └─────────────────────┘ │
49
+ │ │
50
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
51
+ │ │ Textract │ │ Titan │ │ DynamoDB │ │
52
+ │ │ (OCR) │ │ (Embeddings)│ │ (Metadata) │ │
53
+ │ └─────────────┘ └─────────────┘ └─────────────────────┘ │
54
+ │ │
55
+ └────────────────────────────────────────────────────────────────────┘
56
+ ```
57
+
58
+ ### Component Mapping
59
+
60
+ | Local Component | AWS Service | Purpose |
61
+ |----------------|-------------|---------|
62
+ | File I/O | S3 | Document storage |
63
+ | PaddleOCR/Tesseract | Textract | OCR extraction |
64
+ | Ollama LLM | Bedrock (Claude/Titan) | Text generation |
65
+ | Ollama Embeddings | Titan Embeddings | Vector embeddings |
66
+ | ChromaDB | OpenSearch Serverless | Vector search |
67
+ | SQLite (optional) | DynamoDB | Metadata storage |
68
+ | Python Process | Lambda | Compute |
69
+ | CLI | API Gateway | HTTP interface |
70
+
71
+ ## Migration Strategy
72
+
73
+ ### Phase 1: Storage Migration
74
+
75
+ ```python
76
+ from pathlib import Path
+
+ import boto3
+
+ # Abstract storage interface
77
+ class StorageAdapter:
78
+ def put(self, key: str, data: bytes) -> str: ...
79
+ def get(self, key: str) -> bytes: ...
80
+ def delete(self, key: str) -> bool: ...
81
+
82
+ # Local implementation
83
+ class LocalStorageAdapter(StorageAdapter):
84
+ def __init__(self, base_path: str):
85
+ self.base_path = Path(base_path)
86
+
87
+ # S3 implementation
88
+ class S3StorageAdapter(StorageAdapter):
89
+ def __init__(self, bucket: str):
90
+ self.client = boto3.client('s3')
91
+ self.bucket = bucket
92
+ ```
93
+
94
+ ### Phase 2: OCR Migration
95
+
96
+ ```python
97
+ import boto3
+ import numpy as np
+
+ # Abstract OCR interface
98
+ class OCREngine:
99
+ def recognize(self, image: np.ndarray) -> OCRResult: ...
100
+
101
+ # Local: PaddleOCR
102
+ class PaddleOCREngine(OCREngine): ...
103
+
104
+ # Cloud: Textract
105
+ class TextractEngine(OCREngine):
106
+ def __init__(self):
107
+ self.client = boto3.client('textract')
108
+
109
+ def recognize(self, image: np.ndarray) -> OCRResult:
+ # Textract expects encoded image bytes (PNG/JPEG), not a raw array;
+ # encode_png is a hypothetical helper standing in for that step.
+ image_bytes = encode_png(image)
+ response = self.client.detect_document_text(
+ Document={'Bytes': image_bytes}
112
+ )
113
+ return self._convert_response(response)
114
+ ```
115
+
116
+ ### Phase 3: LLM Migration
117
+
118
+ ```python
119
+ import json
+
+ import boto3
+
+ # Abstract LLM interface
120
+ class LLMAdapter:
121
+ def generate(self, prompt: str) -> str: ...
122
+
123
+ # Local: Ollama
124
+ class OllamaAdapter(LLMAdapter): ...
125
+
126
+ # Cloud: Bedrock
127
+ class BedrockAdapter(LLMAdapter):
128
+ def __init__(self, model_id: str = "anthropic.claude-3-sonnet"):  # illustrative; real Bedrock IDs carry a version suffix
129
+ self.client = boto3.client('bedrock-runtime')
130
+ self.model_id = model_id
131
+
132
+ def generate(self, prompt: str) -> str:
133
+ response = self.client.invoke_model(
134
+ modelId=self.model_id,
135
+ body=json.dumps({"prompt": prompt})
136
+ )
137
+ return response['body'].read().decode()  # 'body' is a StreamingBody; parse the per-model JSON downstream
138
+ ```
139
+
140
+ ### Phase 4: Vector Store Migration
141
+
142
+ ```python
143
+ from opensearchpy import OpenSearch
+
+ # Abstract vector store interface (already implemented)
144
+ class VectorStore:
145
+ def add_chunks(self, chunks, embeddings): ...
146
+ def search(self, query_embedding, top_k): ...
147
+
148
+ # Local: ChromaDB (already implemented)
149
+ class ChromaVectorStore(VectorStore): ...
150
+
151
+ # Cloud: OpenSearch
152
+ class OpenSearchVectorStore(VectorStore):
153
+ def __init__(self, endpoint: str, index: str):
154
+ self.client = OpenSearch(hosts=[endpoint])
155
+ self.index = index
156
+
157
+ def search(self, query_embedding, top_k):
158
+ response = self.client.search(
159
+ index=self.index,
160
+ body={
+ "size": top_k,
+ "query": {
+ "knn": {
+ "embedding": {
+ "vector": query_embedding,
+ "k": top_k
+ }
+ }
+ }
+ }
168
+ )
169
+ return self._convert_results(response)
170
+ ```
171
+
172
+ ## AWS Services Deep Dive
173
+
174
+ ### Amazon S3
175
+
176
+ - **Purpose**: Document storage and processed results
177
+ - **Structure**:
178
+ ```
179
+ s3://sparknet-documents/
180
+ ├── raw/ # Original documents
181
+ │ └── {doc_id}/
182
+ │ └── document.pdf
183
+ ├── processed/ # Processed results
184
+ │ └── {doc_id}/
185
+ │ ├── metadata.json
186
+ │ ├── chunks.json
187
+ │ └── pages/
188
+ │ ├── page_0.png
189
+ │ └── page_1.png
190
+ └── cache/ # Processing cache
191
+ ```
192
+
193
+ ### Amazon Textract
194
+
195
+ - **Purpose**: OCR extraction with layout analysis
196
+ - **Features**:
197
+ - Document text detection
198
+ - Table extraction
199
+ - Form extraction
200
+ - Handwriting recognition
201
+
202
+ ### Amazon Bedrock
203
+
204
+ - **Purpose**: LLM inference
205
+ - **Models**:
206
+ - Claude 3.5 Sonnet (primary)
207
+ - Titan Text (cost-effective)
208
+ - Titan Embeddings (vectors)
209
+
210
+ ### Amazon OpenSearch Serverless
211
+
212
+ - **Purpose**: Vector search and retrieval
213
+ - **Configuration**:
214
+ ```json
215
+ {
216
+ "index": "sparknet-vectors",
217
+ "settings": {
218
+ "index.knn": true,
219
+ "index.knn.space_type": "cosinesimil"
220
+ },
221
+ "mappings": {
222
+ "properties": {
223
+ "embedding": {
224
+ "type": "knn_vector",
225
+ "dimension": 1024
226
+ }
227
+ }
228
+ }
229
+ }
230
+ ```
231
+
232
+ ### AWS Lambda
233
+
234
+ - **Purpose**: Serverless compute
235
+ - **Functions**:
236
+ - `process-document`: Document processing pipeline (sketched after this list)
237
+ - `extract-fields`: Field extraction
238
+ - `rag-query`: RAG query handling
239
+ - `index-document`: Vector indexing
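+
+ A minimal sketch of the `process-document` entry point, matching the `handler.process` reference in the Terraform section below. The event shape (an S3 key passed in by Step Functions) is an assumption:
+
+ ```python
+ def process(event: dict, context) -> dict:
+     """Lambda entry point for the document processing pipeline."""
+     # Assumed payload from Step Functions; the key name is illustrative.
+     doc_key = event.get("document_key", "raw/unknown/document.pdf")
+     # The real pipeline would fetch from S3, call Textract, and write
+     # chunks back under processed/{doc_id}/ as shown in the S3 layout.
+     return {"status": "processed", "document_key": doc_key}
+ ```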
240
+
241
+ ### AWS Step Functions
242
+
243
+ - **Purpose**: Workflow orchestration
244
+ - **Workflow**:
245
+ ```json
246
+ {
247
+ "StartAt": "ProcessDocument",
248
+ "States": {
249
+ "ProcessDocument": {
250
+ "Type": "Task",
251
+ "Resource": "arn:aws:lambda:process-document",
252
+ "Next": "IndexChunks"
253
+ },
254
+ "IndexChunks": {
255
+ "Type": "Task",
256
+ "Resource": "arn:aws:lambda:index-document",
257
+ "End": true
258
+ }
259
+ }
260
+ }
261
+ ```
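+
+ A hedged sketch of starting this workflow from Python; the state machine ARN is a placeholder:
+
+ ```python
+ import json
+
+ import boto3
+
+ sfn = boto3.client("stepfunctions")
+ sfn.start_execution(
+     stateMachineArn="arn:aws:states:us-east-1:123456789012:stateMachine:sparknet",  # placeholder
+     input=json.dumps({"document_key": "raw/doc001/document.pdf"}),
+ )
+ ```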
262
+
263
+ ## Cost Optimization
264
+
265
+ ### Tiered Processing
266
+
267
+ | Tier | Use Case | Services | Cost |
268
+ |------|----------|----------|------|
269
+ | Basic | Simple OCR | Textract + Titan | $ |
270
+ | Standard | Full pipeline | + Claude Haiku | $$ |
271
+ | Premium | Complex analysis | + Claude Sonnet | $$$ |
272
+
273
+ ### Caching Strategy
274
+
275
+ 1. **Document Cache**: S3 with lifecycle policies (see the sketch after this list)
276
+ 2. **Embedding Cache**: ElastiCache (Redis)
277
+ 3. **Query Cache**: Lambda@Edge
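+
+ As a sketch of the document-cache tier, the `cache/` prefix from the S3 layout above can be expired automatically with a lifecycle rule (the 30-day window is an assumption):
+
+ ```python
+ import boto3
+
+ s3 = boto3.client("s3")
+ s3.put_bucket_lifecycle_configuration(
+     Bucket="sparknet-documents",
+     LifecycleConfiguration={
+         "Rules": [{
+             "ID": "expire-processing-cache",
+             "Filter": {"Prefix": "cache/"},
+             "Status": "Enabled",
+             "Expiration": {"Days": 30},  # assumed retention window
+         }]
+     },
+ )
+ ```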
278
+
279
+ ## Security
280
+
281
+ ### IAM Policies
282
+
283
+ ```json
284
+ {
285
+ "Version": "2012-10-17",
286
+ "Statement": [
287
+ {
288
+ "Effect": "Allow",
289
+ "Action": [
290
+ "s3:GetObject",
291
+ "s3:PutObject"
292
+ ],
293
+ "Resource": "arn:aws:s3:::sparknet-documents/*"
294
+ },
295
+ {
296
+ "Effect": "Allow",
297
+ "Action": [
298
+ "textract:DetectDocumentText",
299
+ "textract:AnalyzeDocument"
300
+ ],
301
+ "Resource": "*"
302
+ },
303
+ {
304
+ "Effect": "Allow",
305
+ "Action": [
306
+ "bedrock:InvokeModel"
307
+ ],
308
+ "Resource": "arn:aws:bedrock:*::foundation-model/*"
309
+ }
310
+ ]
311
+ }
312
+ ```
313
+
314
+ ### Data Encryption
315
+
316
+ - S3: Server-side encryption (SSE-S3 or SSE-KMS; see the sketch below)
317
+ - OpenSearch: Encryption at rest
318
+ - Lambda: Environment variable encryption
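+
+ A minimal sketch of enforcing default SSE-KMS encryption on the documents bucket; the key alias is a placeholder:
+
+ ```python
+ import boto3
+
+ s3 = boto3.client("s3")
+ s3.put_bucket_encryption(
+     Bucket="sparknet-documents",
+     ServerSideEncryptionConfiguration={
+         "Rules": [{
+             "ApplyServerSideEncryptionByDefault": {
+                 "SSEAlgorithm": "aws:kms",
+                 "KMSMasterKeyID": "alias/sparknet",  # placeholder key alias
+             }
+         }]
+     },
+ )
+ ```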
319
+
320
+ ## Deployment
321
+
322
+ ### Infrastructure as Code (Terraform)
323
+
324
+ ```hcl
325
+ # S3 Bucket
326
+ resource "aws_s3_bucket" "documents" {
327
+ bucket = "sparknet-documents"
328
+ }
329
+
330
+ # Lambda Function
331
+ resource "aws_lambda_function" "processor" {
332
+ function_name = "sparknet-processor"
333
+ runtime = "python3.11"
334
+ handler = "handler.process"
335
+ memory_size = 1024
336
+ timeout = 300
337
+ }
338
+
339
+ # OpenSearch Serverless
340
+ resource "aws_opensearchserverless_collection" "vectors" {
341
+ name = "sparknet-vectors"
342
+ type = "VECTORSEARCH"
343
+ }
344
+ ```
345
+
346
+ ### CI/CD Pipeline
347
+
348
+ ```yaml
349
+ # GitHub Actions
350
+ name: Deploy SPARKNET
351
+
352
+ on:
353
+ push:
354
+ branches: [main]
355
+
356
+ jobs:
357
+ deploy:
358
+ runs-on: ubuntu-latest
359
+ steps:
360
+ - uses: actions/checkout@v3
361
+ - name: Deploy Lambda
362
+ run: |
363
+ aws lambda update-function-code \
364
+ --function-name sparknet-processor \
365
+ --zip-file fileb://package.zip
366
+ ```
367
+
368
+ ## Monitoring
369
+
370
+ ### CloudWatch Metrics
371
+
372
+ - Lambda invocations and duration
373
+ - S3 request counts
374
+ - OpenSearch query latency
375
+ - Bedrock token usage
376
+
377
+ ### Dashboards
378
+
379
+ - Processing throughput
380
+ - Error rates
381
+ - Cost tracking
382
+ - Vector store statistics
383
+
384
+ ## Next Steps
385
+
386
+ 1. **Implement Storage Abstraction**: Create S3 adapter
387
+ 2. **Add Textract Engine**: Implement AWS OCR
388
+ 3. **Create Bedrock Adapter**: LLM migration
389
+ 4. **Deploy OpenSearch**: Vector store setup
390
+ 5. **Build Lambda Functions**: Serverless compute
391
+ 6. **Setup Step Functions**: Workflow orchestration
392
+ 7. **Configure CI/CD**: Automated deployment
docs/DOCUMENT_INTELLIGENCE.md ADDED
@@ -0,0 +1,470 @@
1
+ # SPARKNET Document Intelligence
2
+
3
+ A vision-first agentic document understanding platform that goes beyond OCR, supports complex layouts, and produces LLM-ready, visually grounded outputs suitable for RAG and field extraction at scale.
4
+
5
+ ## Overview
6
+
7
+ The Document Intelligence subsystem provides:
8
+
9
+ - **Vision-First Understanding**: Treats documents as visual objects, not just text
10
+ - **Semantic Chunking**: Classifies regions by type (text, table, figure, chart, form, etc.)
11
+ - **Visual Grounding**: Every extraction includes evidence (page, bbox, snippet, confidence)
12
+ - **Zero-Shot Capability**: Works across diverse document formats without training
13
+ - **Schema-Driven Extraction**: Define fields using JSON Schema or Pydantic models
14
+ - **Abstention Policy**: Never guesses - abstains when confidence is low
15
+ - **Local-First**: All processing happens locally for privacy
16
+
17
+ ## Quick Start
18
+
19
+ ### Basic Parsing
20
+
21
+ ```python
22
+ from src.document_intelligence import DocumentParser, ParserConfig
23
+
24
+ # Configure parser
25
+ config = ParserConfig(
26
+ render_dpi=200,
27
+ max_pages=10,
28
+ include_markdown=True,
29
+ )
30
+
31
+ parser = DocumentParser(config=config)
32
+ result = parser.parse("document.pdf")
33
+
34
+ print(f"Parsed {len(result.chunks)} chunks from {result.num_pages} pages")
35
+
36
+ # Access chunks
37
+ for chunk in result.chunks:
38
+ print(f"[Page {chunk.page}] {chunk.chunk_type.value}: {chunk.text[:100]}...")
39
+ ```
40
+
41
+ ### Field Extraction
42
+
43
+ ```python
44
+ from src.document_intelligence import (
45
+ FieldExtractor,
46
+ ExtractionSchema,
47
+ create_invoice_schema,
48
+ )
49
+
50
+ # Use preset schema
51
+ schema = create_invoice_schema()
52
+
53
+ # Or create custom schema
54
+ schema = ExtractionSchema(name="CustomSchema")
55
+ schema.add_string_field("company_name", "Name of the company", required=True)
56
+ schema.add_date_field("document_date", "Date on document")
57
+ schema.add_currency_field("total_amount", "Total amount")
58
+
59
+ # Extract fields
60
+ extractor = FieldExtractor()
61
+ extraction = extractor.extract(parse_result, schema)
62
+
63
+ print("Extracted Data:")
64
+ for key, value in extraction.data.items():
65
+ if key in extraction.abstained_fields:
66
+ print(f" {key}: [ABSTAINED]")
67
+ else:
68
+ print(f" {key}: {value}")
69
+
70
+ print(f"Confidence: {extraction.overall_confidence:.2f}")
71
+ ```
72
+
73
+ ### Visual Grounding
74
+
75
+ ```python
76
+ from src.document_intelligence import (
77
+ load_document,
78
+ RenderOptions,
79
+ )
80
+ from src.document_intelligence.grounding import (
81
+ crop_region,
82
+ create_annotated_image,
83
+ EvidenceBuilder,
84
+ )
85
+
86
+ # Load and render page
87
+ loader, renderer = load_document("document.pdf")
88
+ page_image = renderer.render_page(1, RenderOptions(dpi=200))
89
+
90
+ # Create annotated visualization
91
+ bboxes = [chunk.bbox for chunk in result.chunks if chunk.page == 1]
92
+ labels = [chunk.chunk_type.value for chunk in result.chunks if chunk.page == 1]
93
+ annotated = create_annotated_image(page_image, bboxes, labels)
94
+
95
+ # Crop specific region
96
+ crop = crop_region(page_image, chunk.bbox, padding_percent=0.02)
97
+ ```
98
+
99
+ ### Question Answering
100
+
101
+ ```python
102
+ from src.document_intelligence.tools import get_tool
103
+
104
+ qa_tool = get_tool("answer_question")
105
+ result = qa_tool.execute(
106
+ parse_result=parse_result,
107
+ question="What is the total amount due?",
108
+ )
109
+
110
+ if result.success:
111
+ print(f"Answer: {result.data['answer']}")
112
+ print(f"Confidence: {result.data['confidence']:.2f}")
113
+
114
+ for ev in result.evidence:
115
+ print(f" Evidence: Page {ev['page']}, {ev['snippet'][:50]}...")
116
+ ```
117
+
118
+ ## Architecture
119
+
120
+ ### Module Structure
121
+
122
+ ```
123
+ src/document_intelligence/
124
+ ├── __init__.py # Main exports
125
+ ├── chunks/ # Core data models
126
+ │ ├── models.py # BoundingBox, DocumentChunk, TableChunk, etc.
127
+ │ └── __init__.py
128
+ ├── io/ # Document loading
129
+ │ ├── base.py # Abstract interfaces
130
+ │ ├── pdf.py # PDF loading (PyMuPDF)
131
+ │ ├── image.py # Image loading (PIL)
132
+ │ ├── cache.py # Page caching
133
+ │ └── __init__.py
134
+ ├── models/ # Model interfaces
135
+ │ ├── base.py # BaseModel, BatchableModel
136
+ │ ├── ocr.py # OCRModel interface
137
+ │ ├── layout.py # LayoutModel interface
138
+ │ ├── table.py # TableModel interface
139
+ │ ├── chart.py # ChartModel interface
140
+ │ ├── vlm.py # VisionLanguageModel interface
141
+ │ └── __init__.py
142
+ ├── parsing/ # Document parsing
143
+ │ ├── parser.py # DocumentParser orchestrator
144
+ │ ├── chunking.py # Semantic chunking utilities
145
+ │ └── __init__.py
146
+ ├── grounding/ # Visual evidence
147
+ │ ├── evidence.py # EvidenceBuilder, EvidenceTracker
148
+ │ ├── crops.py # Image cropping utilities
149
+ │ └── __init__.py
150
+ ├── extraction/ # Field extraction
151
+ │ ├── schema.py # ExtractionSchema, FieldSpec
152
+ │ ├── extractor.py # FieldExtractor
153
+ │ ├── validator.py # ExtractionValidator
154
+ │ └── __init__.py
155
+ ├── tools/ # Agent tools
156
+ │ ├── document_tools.py # Tool implementations
157
+ │ └── __init__.py
158
+ ├── validation/ # Result validation
159
+ │ └── __init__.py
160
+ └── agent_adapter.py # Agent integration
161
+ ```
162
+
163
+ ### Data Models
164
+
165
+ #### BoundingBox
166
+
167
+ Represents a rectangular region in XYXY format:
168
+
169
+ ```python
170
+ from src.document_intelligence.chunks import BoundingBox
171
+
172
+ # Normalized coordinates (0-1)
173
+ bbox = BoundingBox(
174
+ x_min=0.1, y_min=0.2,
175
+ x_max=0.9, y_max=0.3,
176
+ normalized=True
177
+ )
178
+
179
+ # Convert to pixels
180
+ pixel_bbox = bbox.to_pixel(width=1000, height=800)
181
+
182
+ # Calculate IoU
183
+ overlap = bbox1.iou(bbox2)
184
+
185
+ # Check containment
186
+ is_inside = bbox.contains((0.5, 0.25))
187
+ ```
188
+
189
+ #### DocumentChunk
190
+
191
+ Base semantic chunk:
192
+
193
+ ```python
194
+ from src.document_intelligence.chunks import DocumentChunk, ChunkType
195
+
196
+ chunk = DocumentChunk(
197
+ chunk_id="abc123",
198
+ doc_id="doc001",
199
+ chunk_type=ChunkType.PARAGRAPH,
200
+ text="Content...",
201
+ page=1,
202
+ bbox=bbox,
203
+ confidence=0.95,
204
+ sequence_index=0,
205
+ )
206
+ ```
207
+
208
+ #### TableChunk
209
+
210
+ Table with cell structure:
211
+
212
+ ```python
213
+ from src.document_intelligence.chunks import TableChunk, TableCell
214
+
215
+ # Access cells
216
+ cell = table.get_cell(row=0, col=1)
217
+
218
+ # Export formats
219
+ csv_data = table.to_csv()
220
+ markdown = table.to_markdown()
221
+ json_data = table.to_structured_json()
222
+ ```
223
+
224
+ #### EvidenceRef
225
+
226
+ Links extractions to visual sources:
227
+
228
+ ```python
229
+ from src.document_intelligence.chunks import EvidenceRef
230
+
231
+ evidence = EvidenceRef(
232
+ chunk_id="chunk_001",
233
+ doc_id="doc_001",
234
+ page=1,
235
+ bbox=bbox,
236
+ source_type="text",
237
+ snippet="The total is $500",
238
+ confidence=0.9,
239
+ cell_id=None, # For table cells
240
+ crop_path=None, # Path to cropped image
241
+ )
242
+ ```
243
+
244
+ ## CLI Commands
245
+
246
+ ```bash
247
+ # Parse document
248
+ sparknet docint parse document.pdf -o result.json
249
+ sparknet docint parse document.pdf --format markdown
250
+
251
+ # Extract fields
252
+ sparknet docint extract invoice.pdf --preset invoice
253
+ sparknet docint extract doc.pdf -f vendor_name -f total_amount
254
+ sparknet docint extract doc.pdf --schema my_schema.json
255
+
256
+ # Ask questions
257
+ sparknet docint ask document.pdf "What is the contract value?"
258
+
259
+ # Classify document
260
+ sparknet docint classify document.pdf
261
+
262
+ # Search content
263
+ sparknet docint search document.pdf -q "payment terms"
264
+ sparknet docint search document.pdf --type table
265
+
266
+ # Visualize regions
267
+ sparknet docint visualize document.pdf --page 1 --annotate
268
+ ```
269
+
270
+ ## Configuration
271
+
272
+ ### Parser Configuration
273
+
274
+ ```python
275
+ from src.document_intelligence import ParserConfig
276
+
277
+ config = ParserConfig(
278
+ # Rendering
279
+ render_dpi=200, # DPI for page rasterization
280
+ max_pages=None, # Limit pages (None = all)
281
+
282
+ # OCR
283
+ ocr_enabled=True,
284
+ ocr_languages=["en"],
285
+ ocr_min_confidence=0.5,
286
+
287
+ # Layout
288
+ layout_enabled=True,
289
+ reading_order_enabled=True,
290
+
291
+ # Specialized extraction
292
+ table_extraction_enabled=True,
293
+ chart_extraction_enabled=True,
294
+
295
+ # Chunking
296
+ merge_adjacent_text=True,
297
+ min_chunk_chars=10,
298
+ max_chunk_chars=4000,
299
+
300
+ # Output
301
+ include_markdown=True,
302
+ cache_enabled=True,
303
+ )
304
+ ```
305
+
306
+ ### Extraction Configuration
307
+
308
+ ```python
309
+ from src.document_intelligence import ExtractionConfig
310
+
311
+ config = ExtractionConfig(
312
+ # Confidence
313
+ min_field_confidence=0.5,
314
+ min_overall_confidence=0.5,
315
+
316
+ # Abstention
317
+ abstain_on_low_confidence=True,
318
+ abstain_threshold=0.3,
319
+
320
+ # Search
321
+ search_all_chunks=True,
322
+ prefer_structured_sources=True,
323
+
324
+ # Validation
325
+ validate_extracted_values=True,
326
+ normalize_values=True,
327
+ )
328
+ ```
329
+
330
+ ## Preset Schemas
331
+
332
+ ### Invoice
333
+
334
+ ```python
335
+ from src.document_intelligence import create_invoice_schema
336
+
337
+ schema = create_invoice_schema()
338
+ # Fields: invoice_number, invoice_date, due_date, vendor_name, vendor_address,
339
+ # customer_name, customer_address, subtotal, tax_amount, total_amount,
340
+ # currency, payment_terms
341
+ ```
342
+
343
+ ### Receipt
344
+
345
+ ```python
346
+ from src.document_intelligence import create_receipt_schema
347
+
348
+ schema = create_receipt_schema()
349
+ # Fields: merchant_name, merchant_address, transaction_date, transaction_time,
350
+ # subtotal, tax_amount, total_amount, payment_method, last_four_digits
351
+ ```
352
+
353
+ ### Contract
354
+
355
+ ```python
356
+ from src.document_intelligence import create_contract_schema
357
+
358
+ schema = create_contract_schema()
359
+ # Fields: contract_title, effective_date, expiration_date, party_a_name,
360
+ # party_b_name, contract_value, governing_law, termination_clause
361
+ ```
362
+
363
+ ## Agent Integration
364
+
365
+ ```python
366
+ from src.document_intelligence.agent_adapter import (
367
+ DocumentIntelligenceAdapter,
368
+ EnhancedDocumentAgent,
369
+ AgentConfig,
370
+ )
371
+
372
+ # Create adapter
373
+ config = AgentConfig(
374
+ render_dpi=200,
375
+ min_confidence=0.5,
376
+ max_iterations=10,
377
+ )
378
+
379
+ # With existing LLM client
380
+ agent = EnhancedDocumentAgent(
381
+ llm_client=ollama_client,
382
+ config=config,
383
+ )
384
+
385
+ # Load document
386
+ await agent.load_document("document.pdf")
387
+
388
+ # Extract with schema
389
+ result = await agent.extract_fields(schema)
390
+
391
+ # Answer questions
392
+ answer, evidence = await agent.answer_question("What is the total?")
393
+
394
+ # Classify
395
+ classification = await agent.classify()
396
+ ```
397
+
398
+ ## Available Tools
399
+
400
+ | Tool | Description |
401
+ |------|-------------|
402
+ | `parse_document` | Parse document into semantic chunks |
403
+ | `extract_fields` | Schema-driven field extraction |
404
+ | `search_chunks` | Search document content |
405
+ | `get_chunk_details` | Get detailed chunk information |
406
+ | `get_table_data` | Extract structured table data |
407
+ | `answer_question` | Document Q&A |
408
+ | `crop_region` | Extract visual regions |
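+
+ All tools follow the `get_tool(...).execute(...)` pattern from the Q&A example above. A hedged sketch for `search_chunks`; keyword arguments other than `parse_result` are assumptions inferred from that pattern, not a confirmed signature:
+
+ ```python
+ from src.document_intelligence.tools import get_tool
+
+ search_tool = get_tool("search_chunks")
+ result = search_tool.execute(
+     parse_result=parse_result,  # from DocumentParser.parse(...)
+     query="payment terms",      # assumed argument name
+ )
+
+ if result.success:
+     print(result.data)
+ ```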
409
+
410
+ ## Best Practices
411
+
412
+ ### 1. Always Check Confidence
413
+
414
+ ```python
415
+ if extraction.overall_confidence < 0.7:
416
+ print("Low confidence - manual review recommended")
417
+
418
+ for field, value in extraction.data.items():
419
+ if field in extraction.abstained_fields:
420
+ print(f"{field}: Needs manual verification")
421
+ ```
422
+
423
+ ### 2. Use Evidence for Verification
424
+
425
+ ```python
426
+ for evidence in extraction.evidence:
427
+ print(f"Found on page {evidence.page}")
428
+ print(f"Location: {evidence.bbox.xyxy}")
429
+ print(f"Source text: {evidence.snippet}")
430
+ ```
431
+
432
+ ### 3. Handle Abstention Gracefully
433
+
434
+ ```python
435
+ result = extractor.extract(parse_result, schema)
436
+
437
+ for field in schema.get_required_fields():
438
+ if field.name in result.abstained_fields:
439
+ # Request human review
440
+ flag_for_review(field.name, parse_result.doc_id)
441
+ ```
442
+
443
+ ### 4. Validate Before Use
444
+
445
+ ```python
446
+ from src.document_intelligence import ExtractionValidator
447
+
448
+ validator = ExtractionValidator(min_confidence=0.7)
449
+ validation = validator.validate(result, schema)
450
+
451
+ if not validation.is_valid:
452
+ for issue in validation.issues:
453
+ print(f"[{issue.severity}] {issue.field_name}: {issue.message}")
454
+ ```
455
+
456
+ ## Dependencies
457
+
458
+ - `pymupdf` - PDF loading and rendering
459
+ - `pillow` - Image processing
460
+ - `numpy` - Array operations
461
+ - `pydantic` - Data validation
462
+
463
+ Optional:
464
+ - `paddleocr` - OCR engine
465
+ - `tesseract` - Alternative OCR
466
+ - `chromadb` - Vector storage for RAG
467
+
468
+ ## License
469
+
470
+ MIT License - see LICENSE file for details.
docs/SPARKNET_Progress_Report.py ADDED
@@ -0,0 +1,1432 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SPARKNET Progress Report & Future Work PDF Generator
4
+ Generates a comprehensive stakeholder presentation document.
5
+ """
6
+
7
+ from reportlab.lib import colors
8
+ from reportlab.lib.pagesizes import A4, landscape
9
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
10
+ from reportlab.lib.units import inch, cm
11
+ from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY, TA_RIGHT
12
+ from reportlab.platypus import (
13
+ SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
14
+ PageBreak, Image, ListFlowable, ListItem, KeepTogether,
15
+ Flowable, HRFlowable
16
+ )
17
+ from reportlab.graphics.shapes import Drawing, Rect, String, Line, Polygon
18
+ from reportlab.graphics.charts.barcharts import VerticalBarChart
19
+ from reportlab.graphics.charts.piecharts import Pie
20
+ from reportlab.graphics import renderPDF
21
+ from reportlab.pdfgen import canvas
22
+ from datetime import datetime
23
+ import os
24
+
25
+ # Color Scheme - Professional Blue Theme
26
+ PRIMARY_BLUE = colors.HexColor('#1e3a5f')
27
+ SECONDARY_BLUE = colors.HexColor('#2d5a87')
28
+ ACCENT_BLUE = colors.HexColor('#4a90d9')
29
+ LIGHT_BLUE = colors.HexColor('#e8f4fc')
30
+ SUCCESS_GREEN = colors.HexColor('#28a745')
31
+ WARNING_ORANGE = colors.HexColor('#fd7e14')
32
+ DANGER_RED = colors.HexColor('#dc3545')
33
+ GRAY_DARK = colors.HexColor('#343a40')
34
+ GRAY_LIGHT = colors.HexColor('#f8f9fa')
35
+ WHITE = colors.white
36
+
37
+
38
+ class DiagramFlowable(Flowable):
39
+ """Custom flowable for drawing architecture diagrams."""
40
+
41
+ def __init__(self, width, height, diagram_type='architecture'):
42
+ Flowable.__init__(self)
43
+ self.width = width
44
+ self.height = height
45
+ self.diagram_type = diagram_type
46
+
47
+ def draw(self):
48
+ if self.diagram_type == 'architecture':
49
+ self._draw_architecture()
50
+ elif self.diagram_type == 'rag_pipeline':
51
+ self._draw_rag_pipeline()
52
+ elif self.diagram_type == 'document_pipeline':
53
+ self._draw_document_pipeline()
54
+ elif self.diagram_type == 'agent_interaction':
55
+ self._draw_agent_interaction()
56
+ elif self.diagram_type == 'data_flow':
57
+ self._draw_data_flow()
58
+
59
+ def _draw_box(self, x, y, w, h, text, fill_color, text_color=WHITE, font_size=9):
60
+ """Draw a rounded box with text."""
61
+ self.canv.setFillColor(fill_color)
62
+ self.canv.roundRect(x, y, w, h, 5, fill=1, stroke=0)
63
+ self.canv.setFillColor(text_color)
64
+ self.canv.setFont('Helvetica-Bold', font_size)
65
+ # Center text
66
+ text_width = self.canv.stringWidth(text, 'Helvetica-Bold', font_size)
67
+ self.canv.drawString(x + (w - text_width) / 2, y + h/2 - 3, text)
68
+
69
+ def _draw_arrow(self, x1, y1, x2, y2, color=GRAY_DARK):
70
+ """Draw an arrow from (x1,y1) to (x2,y2)."""
71
+ self.canv.setStrokeColor(color)
72
+ self.canv.setLineWidth(2)
73
+ self.canv.line(x1, y1, x2, y2)
74
+ # Arrow head
75
+ import math
76
+ angle = math.atan2(y2-y1, x2-x1)
77
+ arrow_len = 8
78
+ self.canv.line(x2, y2, x2 - arrow_len * math.cos(angle - 0.4), y2 - arrow_len * math.sin(angle - 0.4))
79
+ self.canv.line(x2, y2, x2 - arrow_len * math.cos(angle + 0.4), y2 - arrow_len * math.sin(angle + 0.4))
80
+
81
+ def _draw_architecture(self):
82
+ """Draw the high-level SPARKNET architecture."""
83
+ # Title
84
+ self.canv.setFillColor(PRIMARY_BLUE)
85
+ self.canv.setFont('Helvetica-Bold', 12)
86
+ self.canv.drawCentredString(self.width/2, self.height - 20, 'SPARKNET Architecture Overview')
87
+
88
+ # User Layer
89
+ self._draw_box(self.width/2 - 60, self.height - 70, 120, 35, 'User Interface', ACCENT_BLUE)
90
+
91
+ # Demo Layer
92
+ self.canv.setFillColor(LIGHT_BLUE)
93
+ self.canv.roundRect(30, self.height - 160, self.width - 60, 70, 8, fill=1, stroke=0)
94
+ self.canv.setFillColor(PRIMARY_BLUE)
95
+ self.canv.setFont('Helvetica-Bold', 10)
96
+ self.canv.drawString(40, self.height - 100, 'Streamlit Demo Application')
97
+
98
+ # Demo pages
99
+ pages = ['Live\nProcessing', 'Interactive\nRAG', 'Doc\nComparison', 'Evidence\nViewer', 'Doc\nViewer']
100
+ page_width = (self.width - 100) / 5
101
+ for i, page in enumerate(pages):
102
+ x = 45 + i * page_width
103
+ self._draw_box(x, self.height - 150, page_width - 10, 35, page.replace('\n', ' '), SECONDARY_BLUE, font_size=7)
104
+
105
+ # Arrow from UI to Demo
106
+ self._draw_arrow(self.width/2, self.height - 70, self.width/2, self.height - 90, ACCENT_BLUE)
107
+
108
+ # Core Services Layer
109
+ self.canv.setFillColor(LIGHT_BLUE)
110
+ self.canv.roundRect(30, self.height - 280, self.width - 60, 100, 8, fill=1, stroke=0)
111
+ self.canv.setFillColor(PRIMARY_BLUE)
112
+ self.canv.setFont('Helvetica-Bold', 10)
113
+ self.canv.drawString(40, self.height - 190, 'Core Services')
114
+
115
+ # Core boxes
116
+ self._draw_box(50, self.height - 230, 100, 30, 'Document Intel', PRIMARY_BLUE, font_size=8)
117
+ self._draw_box(170, self.height - 230, 100, 30, 'Multi-Agent RAG', PRIMARY_BLUE, font_size=8)
118
+ self._draw_box(290, self.height - 230, 100, 30, 'Vector Store', PRIMARY_BLUE, font_size=8)
119
+ self._draw_box(410, self.height - 230, 80, 30, 'LLM Layer', PRIMARY_BLUE, font_size=8)
120
+
121
+ # Sub-components
122
+ self._draw_box(50, self.height - 270, 100, 30, 'OCR + Layout', SECONDARY_BLUE, font_size=7)
123
+ self._draw_box(170, self.height - 270, 100, 30, '5 Agents', SECONDARY_BLUE, font_size=7)
124
+ self._draw_box(290, self.height - 270, 100, 30, 'ChromaDB', SECONDARY_BLUE, font_size=7)
125
+ self._draw_box(410, self.height - 270, 80, 30, 'Ollama', SECONDARY_BLUE, font_size=7)
126
+
127
+ # Arrow from Demo to Core
128
+ self._draw_arrow(self.width/2, self.height - 160, self.width/2, self.height - 180, ACCENT_BLUE)
129
+
130
+ # Storage Layer
131
+ self.canv.setFillColor(GRAY_LIGHT)
132
+ self.canv.roundRect(30, self.height - 340, self.width - 60, 45, 8, fill=1, stroke=0)
133
+ self.canv.setFillColor(GRAY_DARK)
134
+ self.canv.setFont('Helvetica-Bold', 10)
135
+ self.canv.drawString(40, self.height - 310, 'Persistent Storage')
136
+
137
+ self._draw_box(150, self.height - 335, 80, 25, 'Embeddings', GRAY_DARK, font_size=7)
138
+ self._draw_box(250, self.height - 335, 80, 25, 'Documents', GRAY_DARK, font_size=7)
139
+ self._draw_box(350, self.height - 335, 80, 25, 'Cache', GRAY_DARK, font_size=7)
140
+
141
+ # Arrow
142
+ self._draw_arrow(self.width/2, self.height - 280, self.width/2, self.height - 295, GRAY_DARK)
143
+
144
+ def _draw_rag_pipeline(self):
145
+ """Draw the Multi-Agent RAG Pipeline."""
146
+ self.canv.setFillColor(PRIMARY_BLUE)
147
+ self.canv.setFont('Helvetica-Bold', 12)
148
+ self.canv.drawCentredString(self.width/2, self.height - 20, 'Multi-Agent RAG Pipeline')
149
+
150
+ # Query input
151
+ self._draw_box(20, self.height - 70, 80, 30, 'User Query', ACCENT_BLUE, font_size=8)
152
+
153
+ # Agents in sequence
154
+ agents = [
155
+ ('QueryPlanner', PRIMARY_BLUE, 'Intent Classification\nQuery Decomposition'),
156
+ ('Retriever', SECONDARY_BLUE, 'Hybrid Search\nDense + Sparse'),
157
+ ('Reranker', SECONDARY_BLUE, 'Cross-Encoder\nMMR Diversity'),
158
+ ('Synthesizer', PRIMARY_BLUE, 'Answer Generation\nCitation Tracking'),
159
+ ('Critic', WARNING_ORANGE, 'Hallucination Check\nValidation'),
160
+ ]
161
+
162
+ x_start = 120
163
+ box_width = 80
164
+ spacing = 10
165
+
166
+ for i, (name, color, desc) in enumerate(agents):
167
+ x = x_start + i * (box_width + spacing)
168
+ self._draw_box(x, self.height - 70, box_width, 30, name, color, font_size=7)
169
+ # Description below
170
+ self.canv.setFillColor(GRAY_DARK)
171
+ self.canv.setFont('Helvetica', 6)
172
+ lines = desc.split('\n')
173
+ for j, line in enumerate(lines):
174
+ self.canv.drawCentredString(x + box_width/2, self.height - 85 - j*8, line)
175
+
176
+ # Arrow to next
177
+ if i < len(agents) - 1:
178
+ self._draw_arrow(x + box_width, self.height - 55, x + box_width + spacing, self.height - 55, GRAY_DARK)
179
+
180
+ # Arrow from query to first agent
181
+ self._draw_arrow(100, self.height - 55, 120, self.height - 55, ACCENT_BLUE)
182
+
183
+ # Revision loop
184
+ self.canv.setStrokeColor(WARNING_ORANGE)
185
+ self.canv.setLineWidth(1.5)
186
+ self.canv.setDash(3, 3)
187
+ # Draw curved line for revision
188
+ critic_x = x_start + 4 * (box_width + spacing) + box_width
189
+ synth_x = x_start + 3 * (box_width + spacing)
190
+ self.canv.line(critic_x - 40, self.height - 100, synth_x + 40, self.height - 100)
191
+ self.canv.setDash()
192
+
193
+ self.canv.setFillColor(WARNING_ORANGE)
194
+ self.canv.setFont('Helvetica-Oblique', 7)
195
+ self.canv.drawCentredString((critic_x + synth_x)/2, self.height - 115, 'Revision Loop (if validation fails)')
196
+
197
+ # Final output
198
+ self._draw_box(critic_x + 20, self.height - 70, 80, 30, 'Response', SUCCESS_GREEN, font_size=8)
199
+ self._draw_arrow(critic_x, self.height - 55, critic_x + 20, self.height - 55, SUCCESS_GREEN)
200
+
201
+ # State tracking bar
202
+ self.canv.setFillColor(LIGHT_BLUE)
203
+ self.canv.roundRect(20, self.height - 160, self.width - 40, 35, 5, fill=1, stroke=0)
204
+ self.canv.setFillColor(PRIMARY_BLUE)
205
+ self.canv.setFont('Helvetica-Bold', 8)
206
+ self.canv.drawString(30, self.height - 145, 'RAGState: Query → Plan → Retrieved Chunks → Reranked → Answer → Validation → Citations')
207
+
208
+ def _draw_document_pipeline(self):
209
+ """Draw Document Processing Pipeline."""
210
+ self.canv.setFillColor(PRIMARY_BLUE)
211
+ self.canv.setFont('Helvetica-Bold', 12)
212
+ self.canv.drawCentredString(self.width/2, self.height - 20, 'Document Processing Pipeline')
213
+
214
+ stages = [
215
+ ('Input', 'PDF/Image\nUpload', ACCENT_BLUE),
216
+ ('OCR', 'PaddleOCR\nTesseract', PRIMARY_BLUE),
217
+ ('Layout', 'Region\nDetection', PRIMARY_BLUE),
218
+ ('Reading\nOrder', 'Sequence\nReconstruction', SECONDARY_BLUE),
219
+ ('Chunking', 'Semantic\nSplitting', SECONDARY_BLUE),
220
+ ('Indexing', 'ChromaDB\nEmbedding', SUCCESS_GREEN),
221
+ ]
222
+
223
+ box_width = 70
224
+ box_height = 45
225
+ spacing = 15
226
+ total_width = len(stages) * box_width + (len(stages) - 1) * spacing
227
+ x_start = (self.width - total_width) / 2
228
+ y_pos = self.height - 90
229
+
230
+ for i, (name, desc, color) in enumerate(stages):
231
+ x = x_start + i * (box_width + spacing)
232
+ # Main box
233
+ self._draw_box(x, y_pos, box_width, box_height, name.replace('\n', ' '), color, font_size=8)
234
+ # Description
235
+ self.canv.setFillColor(GRAY_DARK)
236
+ self.canv.setFont('Helvetica', 6)
237
+ lines = desc.split('\n')
238
+ for j, line in enumerate(lines):
239
+ self.canv.drawCentredString(x + box_width/2, y_pos - 15 - j*8, line)
240
+
241
+ # Arrow
242
+ if i < len(stages) - 1:
243
+ self._draw_arrow(x + box_width, y_pos + box_height/2, x + box_width + spacing, y_pos + box_height/2)
244
+
245
+ # Output description
246
+ self.canv.setFillColor(PRIMARY_BLUE)
247
+ self.canv.setFont('Helvetica-Bold', 9)
248
+ self.canv.drawCentredString(self.width/2, self.height - 160, 'Output: ProcessedDocument with chunks, OCR regions, layout data, bounding boxes')
249
+
250
+ def _draw_agent_interaction(self):
251
+ """Draw Agent Interaction Diagram."""
252
+ self.canv.setFillColor(PRIMARY_BLUE)
253
+ self.canv.setFont('Helvetica-Bold', 12)
254
+ self.canv.drawCentredString(self.width/2, self.height - 20, 'Agent Interaction & Data Flow')
255
+
256
+ # Central orchestrator
257
+ center_x, center_y = self.width/2, self.height/2 - 20
258
+ self._draw_box(center_x - 50, center_y - 20, 100, 40, 'Orchestrator', PRIMARY_BLUE, font_size=9)
259
+
260
+ # Surrounding agents
261
+         # Agent positions below use fixed offsets, so no trigonometry is needed.
262
+ agents = [
263
+ ('QueryPlanner', -120, 60),
264
+ ('Retriever', 0, 90),
265
+ ('Reranker', 120, 60),
266
+ ('Synthesizer', 120, -60),
267
+ ('Critic', 0, -90),
268
+ ]
269
+
270
+ for name, dx, dy in agents:
271
+ x = center_x + dx - 45
272
+ y = center_y + dy - 15
273
+ self._draw_box(x, y, 90, 30, name, SECONDARY_BLUE, font_size=8)
274
+ # Arrow to/from orchestrator
275
+ if dy > 0:
276
+ self._draw_arrow(center_x, center_y + 20, center_x + dx*0.3, center_y + dy - 15, ACCENT_BLUE)
277
+ else:
278
+ self._draw_arrow(center_x + dx*0.3, center_y + dy + 15, center_x, center_y - 20, ACCENT_BLUE)
279
+
280
+ # External connections
281
+ # Vector Store
282
+ self._draw_box(30, center_y - 15, 70, 30, 'ChromaDB', SUCCESS_GREEN, font_size=8)
283
+ self._draw_arrow(100, center_y, center_x - 50, center_y, SUCCESS_GREEN)
284
+
285
+ # LLM
286
+ self._draw_box(self.width - 100, center_y - 15, 70, 30, 'Ollama LLM', WARNING_ORANGE, font_size=8)
287
+ self._draw_arrow(self.width - 100, center_y, center_x + 50, center_y, WARNING_ORANGE)
288
+
289
+ def _draw_data_flow(self):
290
+ """Draw Data Flow Diagram."""
291
+ self.canv.setFillColor(PRIMARY_BLUE)
292
+ self.canv.setFont('Helvetica-Bold', 12)
293
+ self.canv.drawCentredString(self.width/2, self.height - 20, 'End-to-End Data Flow')
294
+
295
+ # Vertical flow
296
+ items = [
297
+ ('Document Upload', ACCENT_BLUE, 'PDF, Images, Text files'),
298
+ ('Document Processor', PRIMARY_BLUE, 'OCR → Layout → Chunking'),
299
+ ('State Manager', SECONDARY_BLUE, 'ProcessedDocument storage'),
300
+ ('Embedder', SECONDARY_BLUE, 'mxbai-embed-large (1024d)'),
301
+ ('ChromaDB', SUCCESS_GREEN, 'Vector indexing & storage'),
302
+ ('RAG Query', WARNING_ORANGE, 'User question processing'),
303
+ ('Multi-Agent Pipeline', PRIMARY_BLUE, '5-agent collaboration'),
304
+ ('Response', SUCCESS_GREEN, 'Answer with citations'),
305
+ ]
306
+
307
+ box_height = 28
308
+ spacing = 8
309
+ total_height = len(items) * box_height + (len(items) - 1) * spacing
310
+ y_start = self.height - 50
311
+ box_width = 160
312
+ x_center = self.width / 2 - box_width / 2
313
+
314
+ for i, (name, color, desc) in enumerate(items):
315
+ y = y_start - i * (box_height + spacing)
316
+ self._draw_box(x_center, y - box_height, box_width, box_height, name, color, font_size=8)
317
+ # Description on right
318
+ self.canv.setFillColor(GRAY_DARK)
319
+ self.canv.setFont('Helvetica', 7)
320
+ self.canv.drawString(x_center + box_width + 15, y - box_height/2 - 3, desc)
321
+
322
+ # Arrow
323
+ if i < len(items) - 1:
324
+ self._draw_arrow(x_center + box_width/2, y - box_height, x_center + box_width/2, y - box_height - spacing + 2)
325
+
326
+
327
+ def create_styles():
328
+ """Create custom paragraph styles."""
329
+ styles = getSampleStyleSheet()
330
+
331
+ # Title style
332
+ styles.add(ParagraphStyle(
333
+ name='MainTitle',
334
+ parent=styles['Title'],
335
+ fontSize=28,
336
+ textColor=PRIMARY_BLUE,
337
+ spaceAfter=30,
338
+ alignment=TA_CENTER,
339
+ fontName='Helvetica-Bold'
340
+ ))
341
+
342
+ # Subtitle
343
+ styles.add(ParagraphStyle(
344
+ name='Subtitle',
345
+ parent=styles['Normal'],
346
+ fontSize=16,
347
+ textColor=SECONDARY_BLUE,
348
+ spaceAfter=20,
349
+ alignment=TA_CENTER,
350
+ fontName='Helvetica'
351
+ ))
352
+
353
+ # Section Header
354
+ styles.add(ParagraphStyle(
355
+ name='SectionHeader',
356
+ parent=styles['Heading1'],
357
+ fontSize=18,
358
+ textColor=PRIMARY_BLUE,
359
+ spaceBefore=25,
360
+ spaceAfter=15,
361
+ fontName='Helvetica-Bold',
362
+ borderColor=ACCENT_BLUE,
363
+ borderWidth=2,
364
+ borderPadding=5,
365
+ ))
366
+
367
+ # Subsection Header
368
+ styles.add(ParagraphStyle(
369
+ name='SubsectionHeader',
370
+ parent=styles['Heading2'],
371
+ fontSize=14,
372
+ textColor=SECONDARY_BLUE,
373
+ spaceBefore=15,
374
+ spaceAfter=10,
375
+ fontName='Helvetica-Bold'
376
+ ))
377
+
378
+ # Body text
379
+ styles.add(ParagraphStyle(
380
+ name='CustomBody',
381
+ parent=styles['Normal'],
382
+ fontSize=10,
383
+ textColor=GRAY_DARK,
384
+ spaceAfter=8,
385
+ alignment=TA_JUSTIFY,
386
+ leading=14
387
+ ))
388
+
389
+ # Bullet style
390
+ styles.add(ParagraphStyle(
391
+ name='BulletText',
392
+ parent=styles['Normal'],
393
+ fontSize=10,
394
+ textColor=GRAY_DARK,
395
+ leftIndent=20,
396
+ spaceAfter=5,
397
+ leading=13
398
+ ))
399
+
400
+ # Caption
401
+ styles.add(ParagraphStyle(
402
+ name='Caption',
403
+ parent=styles['Normal'],
404
+ fontSize=9,
405
+ textColor=GRAY_DARK,
406
+ alignment=TA_CENTER,
407
+ spaceAfter=15,
408
+ fontName='Helvetica-Oblique'
409
+ ))
410
+
411
+ # Highlight box text
412
+ styles.add(ParagraphStyle(
413
+ name='HighlightText',
414
+ parent=styles['Normal'],
415
+ fontSize=10,
416
+ textColor=PRIMARY_BLUE,
417
+ spaceAfter=5,
418
+ fontName='Helvetica-Bold'
419
+ ))
420
+
421
+ return styles
422
+
423
+
424
+ def create_highlight_box(text, styles, color=LIGHT_BLUE):
425
+ """Create a highlighted text box."""
426
+ data = [[Paragraph(text, styles['HighlightText'])]]
427
+ table = Table(data, colWidths=[450])
428
+ table.setStyle(TableStyle([
429
+ ('BACKGROUND', (0, 0), (-1, -1), color),
430
+ ('BOX', (0, 0), (-1, -1), 1, ACCENT_BLUE),
431
+ ('PADDING', (0, 0), (-1, -1), 12),
432
+ ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
433
+ ]))
434
+ return table
435
+
436
+
437
+ def create_status_table(items, styles):
438
+ """Create a status table with colored indicators."""
439
+ data = [['Component', 'Status', 'Completion']]
440
+ for item, status, completion in items:
441
+ if status == 'Complete':
442
+ status_color = SUCCESS_GREEN
443
+ elif status == 'In Progress':
444
+ status_color = WARNING_ORANGE
445
+ else:
446
+ status_color = DANGER_RED
447
+ data.append([item, status, completion])
448
+
449
+ table = Table(data, colWidths=[250, 100, 100])
450
+ table.setStyle(TableStyle([
451
+ ('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
452
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
453
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
454
+ ('FONTSIZE', (0, 0), (-1, -1), 10),
455
+ ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
456
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
457
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
458
+ ('PADDING', (0, 0), (-1, -1), 8),
459
+ ]))
460
+ return table
461
+
462
+
463
+ def create_metrics_table(metrics, styles):
464
+ """Create a metrics display table."""
465
+ data = []
466
+ for metric, value, change in metrics:
467
+ data.append([metric, value, change])
468
+
469
+ table = Table(data, colWidths=[200, 150, 100])
470
+ table.setStyle(TableStyle([
471
+ ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
472
+ ('FONTSIZE', (0, 0), (-1, -1), 11),
473
+ ('TEXTCOLOR', (1, 0), (1, -1), PRIMARY_BLUE),
474
+ ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
475
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
476
+ ('PADDING', (0, 0), (-1, -1), 10),
477
+ ('ROWBACKGROUNDS', (0, 0), (-1, -1), [LIGHT_BLUE, WHITE]),
478
+ ]))
479
+ return table
480
+
481
+
482
+ def generate_report():
483
+ """Generate the complete SPARKNET progress report PDF."""
484
+
485
+     filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'SPARKNET_Progress_Report.pdf')
486
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
487
+
488
+ doc = SimpleDocTemplate(
489
+ filename,
490
+ pagesize=A4,
491
+ rightMargin=50,
492
+ leftMargin=50,
493
+ topMargin=60,
494
+ bottomMargin=60
495
+ )
496
+
497
+ styles = create_styles()
498
+ story = []
499
+
500
+ # ========== TITLE PAGE ==========
501
+ story.append(Spacer(1, 100))
502
+ story.append(Paragraph('SPARKNET', styles['MainTitle']))
503
+ story.append(Paragraph('Multi-Agentic Document Intelligence Framework', styles['Subtitle']))
504
+ story.append(Spacer(1, 30))
505
+ story.append(Paragraph('Progress Report & Future Roadmap', styles['Subtitle']))
506
+ story.append(Spacer(1, 50))
507
+
508
+ # Version info box
509
+ version_data = [
510
+ ['Version', '1.0.0-beta'],
511
+ ['Report Date', datetime.now().strftime('%B %d, %Y')],
512
+ ['Document Type', 'Stakeholder Progress Report'],
513
+ ['Classification', 'Internal / Confidential'],
514
+ ]
515
+ version_table = Table(version_data, colWidths=[150, 200])
516
+ version_table.setStyle(TableStyle([
517
+ ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
518
+ ('FONTSIZE', (0, 0), (-1, -1), 10),
519
+ ('TEXTCOLOR', (0, 0), (-1, -1), GRAY_DARK),
520
+ ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
521
+ ('GRID', (0, 0), (-1, -1), 0.5, ACCENT_BLUE),
522
+ ('PADDING', (0, 0), (-1, -1), 8),
523
+ ('BACKGROUND', (0, 0), (-1, -1), LIGHT_BLUE),
524
+ ]))
525
+ story.append(version_table)
526
+
527
+ story.append(PageBreak())
528
+
529
+ # ========== TABLE OF CONTENTS ==========
530
+ story.append(Paragraph('Table of Contents', styles['SectionHeader']))
531
+ story.append(Spacer(1, 20))
532
+
533
+ toc_items = [
534
+ ('1. Executive Summary', '3'),
535
+ ('2. Project Overview', '4'),
536
+ ('3. Technical Architecture', '5'),
537
+ ('4. Component Deep Dive', '8'),
538
+ ('5. Current Progress & Achievements', '12'),
539
+ ('6. Gap Analysis', '14'),
540
+ ('7. Future Work & Roadmap', '17'),
541
+ ('8. Risk Assessment', '20'),
542
+ ('9. Resource Requirements', '21'),
543
+ ('10. Conclusion & Recommendations', '22'),
544
+ ]
545
+
546
+ toc_data = [[Paragraph(f'<b>{item}</b>', styles['CustomBody']), page] for item, page in toc_items]
547
+ toc_table = Table(toc_data, colWidths=[400, 50])
548
+ toc_table.setStyle(TableStyle([
549
+ ('FONTSIZE', (0, 0), (-1, -1), 11),
550
+ ('ALIGN', (1, 0), (1, -1), 'RIGHT'),
551
+ ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
552
+ ('LINEBELOW', (0, 0), (-1, -2), 0.5, colors.lightgrey),
553
+ ]))
554
+ story.append(toc_table)
555
+
556
+ story.append(PageBreak())
557
+
558
+ # ========== 1. EXECUTIVE SUMMARY ==========
559
+ story.append(Paragraph('1. Executive Summary', styles['SectionHeader']))
560
+
561
+ story.append(Paragraph(
562
+ '''SPARKNET represents a next-generation document intelligence platform that combines
563
+ advanced OCR capabilities, sophisticated layout analysis, and a state-of-the-art
564
+ Multi-Agent Retrieval-Augmented Generation (RAG) system. This report provides a
565
+ comprehensive overview of the project's current state, technical achievements,
566
+ identified gaps, and the strategic roadmap for future development.''',
567
+ styles['CustomBody']
568
+ ))
569
+
570
+ story.append(Spacer(1, 15))
571
+ story.append(Paragraph('<b>Key Highlights</b>', styles['SubsectionHeader']))
572
+
573
+ highlights = [
574
+ '<b>Multi-Agent RAG Architecture:</b> Successfully implemented a 5-agent pipeline (QueryPlanner, Retriever, Reranker, Synthesizer, Critic) with self-correction capabilities.',
575
+ '<b>Document Processing Pipeline:</b> Complete end-to-end document processing with OCR, layout detection, and semantic chunking.',
576
+ '<b>Production-Ready Demo:</b> Fully functional Streamlit application with 5 interactive modules for document intelligence workflows.',
577
+ '<b>Hallucination Detection:</b> Built-in validation and criticism system to ensure factual accuracy of generated responses.',
578
+ '<b>Unified State Management:</b> Cross-module communication enabling seamless user experience across all application components.',
579
+ ]
580
+
581
+ for h in highlights:
582
+ story.append(Paragraph(f'• {h}', styles['BulletText']))
583
+
584
+ story.append(Spacer(1, 20))
585
+
586
+ # Key Metrics
587
+ story.append(Paragraph('<b>Current System Metrics</b>', styles['SubsectionHeader']))
588
+ metrics = [
589
+ ('RAG Pipeline Agents', '5 Specialized Agents', '✓ Complete'),
590
+ ('Document Formats Supported', 'PDF, Images', '2 formats'),
591
+ ('Vector Dimensions', '1024 (mxbai-embed-large)', 'Production'),
592
+ ('Demo Application Pages', '5 Interactive Modules', '✓ Complete'),
593
+ ('LLM Integration', 'Ollama (Local)', 'Self-hosted'),
594
+ ]
595
+ story.append(create_metrics_table(metrics, styles))
596
+
597
+ story.append(PageBreak())
598
+
599
+ # ========== 2. PROJECT OVERVIEW ==========
600
+ story.append(Paragraph('2. Project Overview', styles['SectionHeader']))
601
+
602
+ story.append(Paragraph('<b>2.1 Vision & Objectives</b>', styles['SubsectionHeader']))
603
+ story.append(Paragraph(
604
+ '''SPARKNET aims to revolutionize document intelligence by providing an integrated
605
+ platform that can understand, process, and intelligently query complex documents.
606
+ The system leverages cutting-edge AI techniques including multi-agent collaboration,
607
+ hybrid retrieval, and sophisticated answer synthesis with built-in validation.''',
608
+ styles['CustomBody']
609
+ ))
610
+
611
+ story.append(Spacer(1, 10))
612
+ story.append(Paragraph('<b>Core Objectives:</b>', styles['CustomBody']))
613
+
614
+ objectives = [
615
+ '<b>Intelligent Document Understanding:</b> Extract and structure information from diverse document formats with high accuracy.',
616
+ '<b>Conversational Intelligence:</b> Enable natural language querying over document collections with citation-backed responses.',
617
+ '<b>Reliability & Trust:</b> Implement hallucination detection and self-correction to ensure factual accuracy.',
618
+ '<b>Scalability:</b> Design for enterprise-scale document processing and retrieval workloads.',
619
+ '<b>Extensibility:</b> Modular architecture allowing easy integration of new capabilities and models.',
620
+ ]
621
+
622
+ for obj in objectives:
623
+ story.append(Paragraph(f'• {obj}', styles['BulletText']))
624
+
625
+ story.append(Spacer(1, 15))
626
+ story.append(Paragraph('<b>2.2 Target Use Cases</b>', styles['SubsectionHeader']))
627
+
628
+ use_cases = [
629
+ ['Use Case', 'Description', 'Status'],
630
+ ['Legal Document Analysis', 'Contract review, clause extraction, compliance checking', 'Supported'],
631
+ ['Research Paper Synthesis', 'Multi-paper querying, citation tracking, summary generation', 'Supported'],
632
+ ['Technical Documentation', 'API docs, manuals, knowledge base querying', 'Supported'],
633
+ ['Financial Reports', 'Annual reports, SEC filings, financial data extraction', 'Planned'],
634
+ ['Medical Records', 'Clinical notes, diagnostic reports (HIPAA compliance needed)', 'Future'],
635
+ ]
636
+
637
+ uc_table = Table(use_cases, colWidths=[130, 230, 90])
638
+ uc_table.setStyle(TableStyle([
639
+ ('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
640
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
641
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
642
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
643
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
644
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
645
+ ('PADDING', (0, 0), (-1, -1), 6),
646
+ ('ALIGN', (2, 0), (2, -1), 'CENTER'),
647
+ ]))
648
+ story.append(uc_table)
649
+
650
+ story.append(PageBreak())
651
+
652
+ # ========== 3. TECHNICAL ARCHITECTURE ==========
653
+ story.append(Paragraph('3. Technical Architecture', styles['SectionHeader']))
654
+
655
+ story.append(Paragraph('<b>3.1 High-Level Architecture</b>', styles['SubsectionHeader']))
656
+ story.append(Paragraph(
657
+ '''SPARKNET follows a layered microservices-inspired architecture with clear separation
658
+ of concerns. The system is organized into presentation, service, and persistence layers,
659
+ with a central orchestration mechanism coordinating multi-agent workflows.''',
660
+ styles['CustomBody']
661
+ ))
662
+
663
+ story.append(Spacer(1, 10))
664
+
665
+ # Architecture Diagram
666
+ arch_diagram = DiagramFlowable(500, 350, 'architecture')
667
+ story.append(arch_diagram)
668
+ story.append(Paragraph('Figure 1: SPARKNET High-Level Architecture', styles['Caption']))
669
+
670
+ story.append(Spacer(1, 15))
671
+ story.append(Paragraph('<b>3.2 Multi-Agent RAG Pipeline</b>', styles['SubsectionHeader']))
672
+ story.append(Paragraph(
673
+ '''The heart of SPARKNET is its Multi-Agent RAG system, which orchestrates five
674
+ specialized agents in a sophisticated pipeline with self-correction capabilities.''',
675
+ styles['CustomBody']
676
+ ))
677
+
678
+ story.append(Spacer(1, 10))
679
+
680
+ # RAG Pipeline Diagram
681
+ rag_diagram = DiagramFlowable(500, 180, 'rag_pipeline')
682
+ story.append(rag_diagram)
683
+ story.append(Paragraph('Figure 2: Multi-Agent RAG Pipeline with Revision Loop', styles['Caption']))
684
+
685
+ story.append(PageBreak())
686
+
687
+ story.append(Paragraph('<b>3.3 Document Processing Pipeline</b>', styles['SubsectionHeader']))
688
+ story.append(Paragraph(
689
+ '''Documents undergo a multi-stage processing pipeline that extracts text, identifies
690
+ layout structure, establishes reading order, and creates semantically coherent chunks
691
+ optimized for retrieval.''',
692
+ styles['CustomBody']
693
+ ))
694
+
695
+ story.append(Spacer(1, 10))
696
+
697
+ # Document Pipeline Diagram
698
+ doc_diagram = DiagramFlowable(500, 180, 'document_pipeline')
699
+ story.append(doc_diagram)
700
+ story.append(Paragraph('Figure 3: Document Processing Pipeline', styles['Caption']))
701
+
702
+ story.append(Spacer(1, 15))
703
+ story.append(Paragraph('<b>3.4 Agent Interaction Model</b>', styles['SubsectionHeader']))
704
+ story.append(Paragraph(
705
+ '''The orchestrator coordinates all agents, managing state transitions and ensuring
706
+ proper data flow between components. External services (Vector Store, LLM) are
707
+ accessed through well-defined interfaces.''',
708
+ styles['CustomBody']
709
+ ))
710
+
711
+ story.append(Spacer(1, 10))
712
+
713
+ # Agent Interaction Diagram
714
+ agent_diagram = DiagramFlowable(500, 250, 'agent_interaction')
715
+ story.append(agent_diagram)
716
+ story.append(Paragraph('Figure 4: Agent Interaction Model', styles['Caption']))
717
+
718
+ story.append(PageBreak())
719
+
720
+ story.append(Paragraph('<b>3.5 Data Flow Architecture</b>', styles['SubsectionHeader']))
721
+ story.append(Paragraph(
722
+ '''The end-to-end data flow illustrates how documents are processed from upload
723
+ through indexing, and how queries are handled through the multi-agent pipeline
724
+ to produce validated, citation-backed responses.''',
725
+ styles['CustomBody']
726
+ ))
727
+
728
+ story.append(Spacer(1, 10))
729
+
730
+ # Data Flow Diagram
731
+ flow_diagram = DiagramFlowable(500, 320, 'data_flow')
732
+ story.append(flow_diagram)
733
+ story.append(Paragraph('Figure 5: End-to-End Data Flow', styles['Caption']))
734
+
735
+ story.append(PageBreak())
736
+
737
+ # ========== 4. COMPONENT DEEP DIVE ==========
738
+ story.append(Paragraph('4. Component Deep Dive', styles['SectionHeader']))
739
+
740
+ story.append(Paragraph('<b>4.1 Query Planning Agent</b>', styles['SubsectionHeader']))
741
+ story.append(Paragraph(
742
+ '''The QueryPlannerAgent is responsible for understanding user intent, classifying
743
+ query types, and decomposing complex queries into manageable sub-queries.''',
744
+ styles['CustomBody']
745
+ ))
746
+
747
+ # Query types table
748
+ query_types = [
749
+ ['Intent Type', 'Description', 'Example'],
750
+ ['FACTOID', 'Simple fact lookup', '"What is the revenue for Q4?"'],
751
+ ['COMPARISON', 'Multi-entity comparison', '"Compare product A vs B features"'],
752
+ ['AGGREGATION', 'Cross-document summary', '"Summarize all quarterly reports"'],
753
+ ['CAUSAL', 'Why/how explanations', '"Why did revenue decline?"'],
754
+ ['PROCEDURAL', 'Step-by-step instructions', '"How to configure the system?"'],
755
+ ['MULTI_HOP', 'Multi-step reasoning', '"Which supplier has the lowest cost for product X?"'],
756
+ ]
757
+
758
+ qt_table = Table(query_types, colWidths=[90, 180, 180])
759
+ qt_table.setStyle(TableStyle([
760
+ ('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
761
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
762
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
763
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
764
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
765
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
766
+ ('PADDING', (0, 0), (-1, -1), 5),
767
+ ]))
768
+ story.append(qt_table)
769
+ story.append(Paragraph('Table 1: Supported Query Intent Types', styles['Caption']))
770
+
771
+ story.append(Spacer(1, 10))
772
+ story.append(Paragraph('<b>4.2 Hybrid Retrieval System</b>', styles['SubsectionHeader']))
773
+ story.append(Paragraph(
774
+ '''The RetrieverAgent implements a sophisticated hybrid search combining dense
775
+ semantic retrieval with sparse keyword matching, using Reciprocal Rank Fusion (RRF)
776
+ to merge results optimally.''',
777
+ styles['CustomBody']
778
+ ))
779
+
780
+ retrieval_features = [
781
+ '<b>Dense Retrieval:</b> Embedding-based semantic search using mxbai-embed-large (1024 dimensions)',
782
+ '<b>Sparse Retrieval:</b> BM25-style keyword matching for precise term matching',
783
+ '<b>RRF Fusion:</b> Combines rankings using formula: RRF = Σ(1 / (k + rank))',
784
+ '<b>Intent-Adaptive Weights:</b> Adjusts dense/sparse balance based on query type (e.g., 80/20 for definitions, 50/50 for comparisons)',
785
+ ]
786
+
787
+ for feat in retrieval_features:
788
+ story.append(Paragraph(f'• {feat}', styles['BulletText']))
789
+
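+     # Illustrative sketch of the RRF formula quoted in the bullet above
+     # (defined but never called by this report script; k=60 follows the
+     # original RRF paper and may differ from SPARKNET's retriever setting):
+     def _rrf_fuse_example(rankings, k=60):
+         scores = {}
+         for ranking in rankings:  # e.g. [dense_doc_ids, sparse_doc_ids]
+             for rank, doc_id in enumerate(ranking, start=1):
+                 scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
+         return sorted(scores, key=scores.get, reverse=True)
+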
790
+ story.append(Spacer(1, 10))
791
+ story.append(Paragraph('<b>4.3 Cross-Encoder Reranking</b>', styles['SubsectionHeader']))
792
+ story.append(Paragraph(
793
+ '''The RerankerAgent applies LLM-based cross-encoder scoring to refine retrieval
794
+ results, implementing deduplication and Maximal Marginal Relevance (MMR) for
795
+ diversity promotion.''',
796
+ styles['CustomBody']
797
+ ))
798
+
799
+ reranker_config = [
800
+ ['Parameter', 'Value', 'Purpose'],
801
+ ['top_k', '5', 'Final result count'],
802
+ ['min_relevance_score', '0.3', 'Quality threshold'],
803
+ ['dedup_threshold', '0.9', 'Similarity for duplicate detection'],
804
+ ['MMR lambda', '0.7', 'Relevance vs diversity balance'],
805
+ ]
806
+
807
+ rr_table = Table(reranker_config, colWidths=[140, 80, 230])
808
+ rr_table.setStyle(TableStyle([
809
+ ('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
810
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
811
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
812
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
813
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
814
+ ('PADDING', (0, 0), (-1, -1), 6),
815
+ ]))
816
+ story.append(rr_table)
817
+ story.append(Paragraph('Table 2: Reranker Configuration', styles['Caption']))
818
+
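+     # Minimal MMR sketch matching the lambda = 0.7 balance in Table 2
+     # (illustrative only: `sim` is an assumed similarity callback, and the
+     # real RerankerAgent also applies cross-encoder scores and deduplication):
+     def _mmr_example(query, candidates, sim, lam=0.7, top_k=5):
+         selected = []
+         while candidates and len(selected) < top_k:
+             best = max(candidates, key=lambda c: lam * sim(query, c)
+                        - (1 - lam) * max((sim(c, s) for s in selected), default=0.0))
+             selected.append(best)
+             candidates.remove(best)  # note: mutates the caller's list
+         return selected
+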
819
+ story.append(PageBreak())
820
+
821
+ story.append(Paragraph('<b>4.4 Answer Synthesis</b>', styles['SubsectionHeader']))
822
+ story.append(Paragraph(
823
+ '''The SynthesizerAgent generates comprehensive answers with automatic citation
824
+ tracking, supporting multiple output formats and implementing intelligent abstention
825
+ when evidence is insufficient.''',
826
+ styles['CustomBody']
827
+ ))
828
+
829
+ story.append(Paragraph('<b>Supported Answer Formats:</b>', styles['CustomBody']))
830
+ formats = ['PROSE - Flowing paragraph narrative', 'BULLET_POINTS - Enumerated key points',
831
+ 'TABLE - Comparative tabular format', 'STEP_BY_STEP - Procedural instructions']
832
+ for fmt in formats:
833
+ story.append(Paragraph(f'• {fmt}', styles['BulletText']))
834
+
835
+ story.append(Paragraph('<b>Confidence Calculation:</b>', styles['CustomBody']))
836
+ story.append(Paragraph('confidence = 0.5 × source_relevance + 0.3 × source_count_factor + 0.2 × consistency', styles['BulletText']))
837
+
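+     # Hedged sketch of the weighted blend quoted above; how the three factors
+     # are computed lives inside the SynthesizerAgent and is assumed here:
+     def _confidence_example(source_relevance, source_count_factor, consistency):
+         return 0.5 * source_relevance + 0.3 * source_count_factor + 0.2 * consistency
+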
838
+ story.append(Spacer(1, 10))
839
+ story.append(Paragraph('<b>4.5 Validation & Hallucination Detection</b>', styles['SubsectionHeader']))
840
+ story.append(Paragraph(
841
+ '''The CriticAgent performs comprehensive validation including hallucination detection,
842
+ citation verification, and factual consistency checking. It can trigger revision
843
+ cycles when issues are detected.''',
844
+ styles['CustomBody']
845
+ ))
846
+
847
+ issue_types = [
848
+ ['Issue Type', 'Description', 'Severity'],
849
+ ['HALLUCINATION', 'Information not supported by sources', 'Critical'],
850
+ ['UNSUPPORTED_CLAIM', 'Statement without citation', 'High'],
851
+ ['INCORRECT_CITATION', 'Citation references wrong source', 'High'],
852
+ ['CONTRADICTION', 'Internal inconsistency in answer', 'Medium'],
853
+ ['INCOMPLETE', 'Missing important information', 'Medium'],
854
+ ['FACTUAL_ERROR', 'Verifiable factual mistake', 'Critical'],
855
+ ]
856
+
857
+ it_table = Table(issue_types, colWidths=[130, 230, 90])
858
+ it_table.setStyle(TableStyle([
859
+ ('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
860
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
861
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
862
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
863
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
864
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
865
+ ('PADDING', (0, 0), (-1, -1), 5),
866
+ ]))
867
+ story.append(it_table)
868
+ story.append(Paragraph('Table 3: Validation Issue Types', styles['Caption']))
869
+
870
+ story.append(PageBreak())
871
+
872
+ story.append(Paragraph('<b>4.6 Document Processing Components</b>', styles['SubsectionHeader']))
873
+
874
+ story.append(Paragraph('<b>OCR Engines:</b>', styles['CustomBody']))
875
+ ocr_comparison = [
876
+ ['Feature', 'PaddleOCR', 'Tesseract'],
877
+ ['GPU Acceleration', '✓ Yes', '✗ No'],
878
+ ['Multi-language', '✓ 80+ languages', '✓ 100+ languages'],
879
+ ['Accuracy (Clean)', '~95%', '~90%'],
880
+ ['Accuracy (Complex)', '~85%', '~75%'],
881
+ ['Speed', 'Fast', 'Moderate'],
882
+ ['Confidence Scores', '✓ Per-region', '✓ Per-word'],
883
+ ]
884
+
885
+ ocr_table = Table(ocr_comparison, colWidths=[130, 160, 160])
886
+ ocr_table.setStyle(TableStyle([
887
+ ('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
888
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
889
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
890
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
891
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
892
+ ('PADDING', (0, 0), (-1, -1), 5),
893
+ ]))
894
+ story.append(ocr_table)
895
+ story.append(Paragraph('Table 4: OCR Engine Comparison', styles['Caption']))
896
+
897
+ story.append(Spacer(1, 10))
898
+ story.append(Paragraph('<b>Layout Detection:</b>', styles['CustomBody']))
899
+ layout_types = ['TEXT, TITLE, HEADING, PARAGRAPH - Text regions',
900
+ 'TABLE, FIGURE, CHART - Visual elements',
901
+ 'CAPTION, FOOTNOTE - Supplementary text',
902
+ 'HEADER, FOOTER - Page elements',
903
+ 'FORMULA - Mathematical expressions']
904
+ for lt in layout_types:
905
+ story.append(Paragraph(f'• {lt}', styles['BulletText']))
906
+
907
+ story.append(Spacer(1, 10))
908
+ story.append(Paragraph('<b>Chunking Configuration:</b>', styles['CustomBody']))
909
+ chunk_config = [
910
+ ['Parameter', 'Default', 'Description'],
911
+ ['max_chunk_chars', '1000', 'Maximum characters per chunk'],
912
+ ['min_chunk_chars', '50', 'Minimum viable chunk size'],
913
+ ['overlap_chars', '100', 'Overlap between consecutive chunks'],
914
+ ['Strategy', 'Semantic', 'Respects layout boundaries'],
915
+ ]
916
+
917
+ cc_table = Table(chunk_config, colWidths=[120, 80, 250])
918
+ cc_table.setStyle(TableStyle([
919
+ ('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
920
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
921
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
922
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
923
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
924
+ ('PADDING', (0, 0), (-1, -1), 5),
925
+ ]))
926
+ story.append(cc_table)
927
+ story.append(Paragraph('Table 5: Chunking Configuration', styles['Caption']))
928
+
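+     # Character-window chunking sketch using the Table 5 defaults (illustrative;
+     # SPARKNET's semantic chunker additionally respects layout boundaries):
+     def _chunk_example(text, max_chars=1000, min_chars=50, overlap=100):
+         chunks, start = [], 0
+         while start < len(text):
+             piece = text[start:start + max_chars]
+             if len(piece) >= min_chars:  # drop tail fragments below the minimum
+                 chunks.append(piece)
+             start += max_chars - overlap
+         return chunks
+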
929
+ story.append(PageBreak())
930
+
931
+ # ========== 5. CURRENT PROGRESS ==========
932
+ story.append(Paragraph('5. Current Progress & Achievements', styles['SectionHeader']))
933
+
934
+ story.append(Paragraph('<b>5.1 Development Milestones</b>', styles['SubsectionHeader']))
935
+
936
+ milestones = [
937
+ ['Milestone', 'Status', 'Completion'],
938
+ ['Core RAG Pipeline', 'Complete', '100%'],
939
+ ['5-Agent Architecture', 'Complete', '100%'],
940
+ ['Document Processing Pipeline', 'Complete', '100%'],
941
+ ['ChromaDB Integration', 'Complete', '100%'],
942
+ ['Ollama LLM Integration', 'Complete', '100%'],
943
+ ['Streamlit Demo Application', 'Complete', '100%'],
944
+ ['State Management System', 'Complete', '100%'],
945
+ ['Hallucination Detection', 'Complete', '100%'],
946
+ ['PDF Processing', 'Complete', '100%'],
947
+ ['Self-Correction Loop', 'Complete', '100%'],
948
+ ]
949
+
950
+ ms_table = Table(milestones, colWidths=[220, 120, 110])
951
+ ms_table.setStyle(TableStyle([
952
+ ('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
953
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
954
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
955
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
956
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
957
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
958
+ ('PADDING', (0, 0), (-1, -1), 6),
959
+ ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
960
+ ]))
961
+ story.append(ms_table)
962
+ story.append(Paragraph('Table 6: Development Milestones', styles['Caption']))
963
+
964
+ story.append(Spacer(1, 15))
965
+ story.append(Paragraph('<b>5.2 Demo Application Features</b>', styles['SubsectionHeader']))
966
+
967
+ demo_features = [
968
+ ['Page', 'Features', 'Status'],
969
+ ['Live Processing', 'Real-time document processing, progress tracking, auto-indexing', '✓ Complete'],
970
+ ['Interactive RAG', 'Query interface, document filtering, chunk preview, citations', '✓ Complete'],
971
+ ['Document Comparison', 'Semantic similarity, structure analysis, content diff', '✓ Complete'],
972
+ ['Evidence Viewer', 'Confidence coloring, bounding boxes, OCR regions, export', '✓ Complete'],
973
+ ['Document Viewer', 'Multi-tab view, chunk display, layout visualization', '✓ Complete'],
974
+ ]
975
+
976
+ df_table = Table(demo_features, colWidths=[110, 270, 70])
977
+ df_table.setStyle(TableStyle([
978
+ ('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
979
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
980
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
981
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
982
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
983
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
984
+ ('PADDING', (0, 0), (-1, -1), 5),
985
+ ('ALIGN', (2, 0), (2, -1), 'CENTER'),
986
+ ]))
987
+ story.append(df_table)
988
+ story.append(Paragraph('Table 7: Demo Application Features', styles['Caption']))
989
+
990
+ story.append(Spacer(1, 15))
991
+ story.append(Paragraph('<b>5.3 Technical Achievements</b>', styles['SubsectionHeader']))
992
+
993
+ achievements = [
994
+ '<b>Hybrid Retrieval:</b> Successfully combined dense and sparse retrieval with RRF fusion, achieving better recall than either method alone.',
995
+ '<b>Self-Correction:</b> Implemented revision loop allowing the system to automatically fix issues detected by the Critic agent.',
996
+ '<b>Citation Tracking:</b> Automatic citation generation with [N] notation linking answers to source documents.',
997
+ '<b>Confidence Scoring:</b> Multi-factor confidence calculation providing transparency into answer reliability.',
998
+ '<b>Streaming Support:</b> Real-time response streaming for improved user experience during long generations.',
999
+ '<b>Cross-Module Communication:</b> Unified state manager enabling seamless navigation between application modules.',
1000
+ ]
1001
+
1002
+ for ach in achievements:
1003
+ story.append(Paragraph(f'• {ach}', styles['BulletText']))
1004
+
1005
+ story.append(PageBreak())
1006
+
1007
+ # ========== 6. GAP ANALYSIS ==========
1008
+ story.append(Paragraph('6. Gap Analysis', styles['SectionHeader']))
1009
+
1010
+ story.append(Paragraph(
1011
+ '''This section identifies current limitations and gaps in the SPARKNET system
1012
+ that represent opportunities for improvement and future development.''',
1013
+ styles['CustomBody']
1014
+ ))
1015
+
1016
+ story.append(Spacer(1, 10))
1017
+ story.append(Paragraph('<b>6.1 Functional Gaps</b>', styles['SubsectionHeader']))
1018
+
1019
+ functional_gaps = [
1020
+ ['Gap ID', 'Category', 'Description', 'Impact', 'Priority'],
1021
+ ['FG-001', 'Document Support', 'Limited to PDF and images; no Word, Excel, PowerPoint support', 'High', 'P1'],
1022
+ ['FG-002', 'Table Extraction', 'Table structure not preserved during chunking', 'High', 'P1'],
1023
+ ['FG-003', 'Multi-modal', 'No image/chart understanding within documents', 'Medium', 'P2'],
1024
+ ['FG-004', 'Languages', 'Primarily English; limited multi-language support', 'Medium', 'P2'],
1025
+ ['FG-005', 'Batch Processing', 'No bulk document upload/processing capability', 'Medium', 'P2'],
1026
+ ['FG-006', 'Document Updates', 'No incremental update; full reprocessing required', 'Medium', 'P2'],
1027
+ ['FG-007', 'User Feedback', 'No mechanism to learn from user corrections', 'Low', 'P3'],
1028
+ ]
1029
+
1030
+ fg_table = Table(functional_gaps, colWidths=[50, 85, 200, 55, 55])
1031
+ fg_table.setStyle(TableStyle([
1032
+ ('BACKGROUND', (0, 0), (-1, 0), DANGER_RED),
1033
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1034
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1035
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1036
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1037
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1038
+ ('PADDING', (0, 0), (-1, -1), 4),
1039
+ ('ALIGN', (0, 0), (0, -1), 'CENTER'),
1040
+ ('ALIGN', (3, 0), (-1, -1), 'CENTER'),
1041
+ ]))
1042
+ story.append(fg_table)
1043
+ story.append(Paragraph('Table 8: Functional Gaps', styles['Caption']))
1044
+
1045
+ story.append(Spacer(1, 15))
1046
+ story.append(Paragraph('<b>6.2 Technical Gaps</b>', styles['SubsectionHeader']))
1047
+
1048
+ technical_gaps = [
1049
+ ['Gap ID', 'Category', 'Description', 'Impact', 'Priority'],
1050
+ ['TG-001', 'Scalability', 'Single-node architecture; no distributed processing', 'High', 'P1'],
1051
+ ['TG-002', 'Authentication', 'No user authentication or access control', 'High', 'P1'],
1052
+ ['TG-003', 'API', 'No REST API for external integration', 'High', 'P1'],
1053
+ ['TG-004', 'Caching', 'Limited query result caching; redundant LLM calls', 'Medium', 'P2'],
1054
+ ['TG-005', 'Monitoring', 'Basic logging only; no metrics/alerting system', 'Medium', 'P2'],
1055
+ ['TG-006', 'Testing', 'Limited test coverage; no integration tests', 'Medium', 'P2'],
1056
+ ['TG-007', 'Cloud Deploy', 'Not containerized; no Kubernetes manifests', 'Medium', 'P2'],
1057
+ ['TG-008', 'GPU Sharing', 'Single GPU utilization; no multi-GPU support', 'Low', 'P3'],
1058
+ ]
1059
+
1060
+ tg_table = Table(technical_gaps, colWidths=[50, 80, 205, 55, 55])
1061
+ tg_table.setStyle(TableStyle([
1062
+ ('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
1063
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1064
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1065
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1066
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1067
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1068
+ ('PADDING', (0, 0), (-1, -1), 4),
1069
+ ('ALIGN', (0, 0), (0, -1), 'CENTER'),
1070
+ ('ALIGN', (3, 0), (-1, -1), 'CENTER'),
1071
+ ]))
1072
+ story.append(tg_table)
1073
+ story.append(Paragraph('Table 9: Technical Gaps', styles['Caption']))
1074
+
1075
+ story.append(PageBreak())
1076
+
1077
+ story.append(Paragraph('<b>6.3 Performance Gaps</b>', styles['SubsectionHeader']))
1078
+
1079
+ perf_gaps = [
1080
+ ['Gap ID', 'Metric', 'Current', 'Target', 'Gap'],
1081
+ ['PG-001', 'Query Latency (simple)', '3-5 seconds', '<2 seconds', '~2x improvement needed'],
1082
+ ['PG-002', 'Query Latency (complex)', '10-20 seconds', '<5 seconds', '~3x improvement needed'],
1083
+ ['PG-003', 'Document Processing', '30-60 sec/page', '<10 sec/page', '~4x improvement needed'],
1084
+ ['PG-004', 'Concurrent Users', '1-5', '50+', 'Major scaling required'],
1085
+ ['PG-005', 'Index Size', '10K chunks', '1M+ chunks', 'Architecture redesign'],
1086
+ ['PG-006', 'Accuracy (hallucination)', '~85%', '>95%', '~10% improvement'],
1087
+ ]
1088
+
1089
+ pg_table = Table(perf_gaps, colWidths=[50, 120, 90, 90, 100])
1090
+ pg_table.setStyle(TableStyle([
1091
+ ('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
1092
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1093
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1094
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1095
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1096
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1097
+ ('PADDING', (0, 0), (-1, -1), 4),
1098
+ ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
1099
+ ]))
1100
+ story.append(pg_table)
1101
+ story.append(Paragraph('Table 10: Performance Gaps', styles['Caption']))
1102
+
1103
+ story.append(Spacer(1, 15))
1104
+ story.append(Paragraph('<b>6.4 Security & Compliance Gaps</b>', styles['SubsectionHeader']))
1105
+
1106
+ security_gaps = [
1107
+ '<b>No Authentication:</b> Currently no user login or session management',
1108
+ '<b>No Authorization:</b> Missing role-based access control (RBAC) for documents',
1109
+ '<b>Data Encryption:</b> Documents and embeddings stored unencrypted at rest',
1110
+ '<b>Audit Logging:</b> No comprehensive audit trail for compliance requirements',
1111
+ '<b>PII Detection:</b> No automatic detection/redaction of personally identifiable information',
1112
+ '<b>GDPR/HIPAA:</b> Not compliant with major data protection regulations',
1113
+ ]
1114
+
1115
+ for sg in security_gaps:
1116
+ story.append(Paragraph(f'• {sg}', styles['BulletText']))
1117
+
1118
+ story.append(PageBreak())
1119
+
1120
+ # ========== 7. FUTURE WORK & ROADMAP ==========
1121
+ story.append(Paragraph('7. Future Work & Roadmap', styles['SectionHeader']))
1122
+
1123
+ story.append(Paragraph('<b>7.1 Strategic Roadmap Overview</b>', styles['SubsectionHeader']))
1124
+ story.append(Paragraph(
1125
+ '''The SPARKNET roadmap is organized into three phases, each building upon the
1126
+ previous to transform the current prototype into a production-ready enterprise
1127
+ solution.''',
1128
+ styles['CustomBody']
1129
+ ))
1130
+
1131
+ story.append(Spacer(1, 10))
1132
+
1133
+ # Roadmap phases
1134
+ roadmap = [
1135
+ ['Phase', 'Timeline', 'Focus Areas', 'Key Deliverables'],
1136
+ ['Phase 1:\nFoundation', 'Q1-Q2 2026',
1137
+ 'Stability, Core Features,\nBasic Security',
1138
+ '• REST API\n• Authentication\n• Extended document formats\n• Basic containerization'],
1139
+ ['Phase 2:\nScale', 'Q3-Q4 2026',
1140
+ 'Performance, Scalability,\nEnterprise Features',
1141
+ '• Distributed processing\n• Advanced caching\n• Multi-tenancy\n• Monitoring & alerting'],
1142
+ ['Phase 3:\nInnovation', 'Q1-Q2 2027',
1143
+ 'Advanced AI, Compliance,\nEcosystem',
1144
+ '• Multi-modal understanding\n• Compliance frameworks\n• Plugin architecture\n• Advanced analytics'],
1145
+ ]
1146
+
1147
+ rm_table = Table(roadmap, colWidths=[70, 80, 130, 170])
1148
+ rm_table.setStyle(TableStyle([
1149
+ ('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
1150
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1151
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1152
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1153
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1154
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [LIGHT_BLUE, WHITE]),
1155
+ ('PADDING', (0, 0), (-1, -1), 6),
1156
+ ('VALIGN', (0, 0), (-1, -1), 'TOP'),
1157
+ ]))
1158
+ story.append(rm_table)
1159
+ story.append(Paragraph('Table 11: Strategic Roadmap', styles['Caption']))
1160
+
1161
+ story.append(Spacer(1, 15))
1162
+ story.append(Paragraph('<b>7.2 Phase 1: Foundation (Q1-Q2 2026)</b>', styles['SubsectionHeader']))
1163
+
1164
+ phase1_items = [
1165
+ ['Item', 'Description', 'Effort', 'Dependencies'],
1166
+ ['REST API Development', 'FastAPI-based API for all core functions', '4 weeks', 'None'],
1167
+ ['User Authentication', 'JWT-based auth with OAuth2 support', '3 weeks', 'API'],
1168
+ ['Document Format Extension', 'Add Word, Excel, PowerPoint support', '4 weeks', 'None'],
1169
+ ['Table Extraction', 'Preserve table structure in processing', '3 weeks', 'None'],
1170
+ ['Docker Containerization', 'Production-ready Docker images', '2 weeks', 'None'],
1171
+ ['Basic CI/CD Pipeline', 'Automated testing and deployment', '2 weeks', 'Docker'],
1172
+ ['Query Result Caching', 'Redis-based caching layer', '2 weeks', 'API'],
1173
+ ['Unit Test Coverage', 'Achieve 80% code coverage', '3 weeks', 'Ongoing'],
1174
+ ]
1175
+
1176
+ p1_table = Table(phase1_items, colWidths=[130, 180, 60, 80])
1177
+ p1_table.setStyle(TableStyle([
1178
+ ('BACKGROUND', (0, 0), (-1, 0), SUCCESS_GREEN),
1179
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1180
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1181
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1182
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1183
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1184
+ ('PADDING', (0, 0), (-1, -1), 4),
1185
+ ]))
1186
+ story.append(p1_table)
1187
+ story.append(Paragraph('Table 12: Phase 1 Deliverables', styles['Caption']))
1188
+
1189
+ story.append(PageBreak())
1190
+
1191
+ story.append(Paragraph('<b>7.3 Phase 2: Scale (Q3-Q4 2026)</b>', styles['SubsectionHeader']))
1192
+
1193
+ phase2_items = [
1194
+ ['Item', 'Description', 'Effort', 'Dependencies'],
1195
+ ['Distributed Processing', 'Celery/Ray for parallel document processing', '6 weeks', 'Phase 1'],
1196
+ ['Vector Store Scaling', 'Milvus/Pinecone for large-scale indices', '4 weeks', 'Phase 1'],
1197
+ ['Multi-tenancy', 'Organization-based data isolation', '4 weeks', 'Auth'],
1198
+ ['Kubernetes Deployment', 'Full K8s manifests and Helm charts', '3 weeks', 'Docker'],
1199
+ ['Monitoring Stack', 'Prometheus, Grafana, ELK integration', '3 weeks', 'K8s'],
1200
+ ['Batch Processing', 'Bulk document upload and processing', '3 weeks', 'Distributed'],
1201
+ ['Advanced Caching', 'Semantic caching for similar queries', '3 weeks', 'Cache'],
1202
+ ['Performance Optimization', 'Achieve <2s simple query latency', '4 weeks', 'Caching'],
1203
+ ]
1204
+
1205
+ p2_table = Table(phase2_items, colWidths=[130, 180, 60, 80])
1206
+ p2_table.setStyle(TableStyle([
1207
+ ('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
1208
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1209
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1210
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1211
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1212
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1213
+ ('PADDING', (0, 0), (-1, -1), 4),
1214
+ ]))
1215
+ story.append(p2_table)
1216
+ story.append(Paragraph('Table 13: Phase 2 Deliverables', styles['Caption']))
1217
+
1218
+ story.append(Spacer(1, 15))
1219
+ story.append(Paragraph('<b>7.4 Phase 3: Innovation (Q1-Q2 2027)</b>', styles['SubsectionHeader']))
1220
+
1221
+ phase3_items = [
1222
+ ['Item', 'Description', 'Effort', 'Dependencies'],
1223
+ ['Multi-modal Understanding', 'GPT-4V/Claude Vision for image analysis', '6 weeks', 'Phase 2'],
1224
+ ['Advanced Table QA', 'SQL-like queries over extracted tables', '4 weeks', 'Table Extract'],
1225
+ ['PII Detection/Redaction', 'Automatic sensitive data handling', '4 weeks', 'None'],
1226
+ ['Compliance Framework', 'GDPR, HIPAA, SOC2 compliance', '8 weeks', 'PII'],
1227
+ ['Plugin Architecture', 'Extensible agent and tool system', '4 weeks', 'Phase 2'],
1228
+ ['Analytics Dashboard', 'Usage analytics and insights', '3 weeks', 'Monitoring'],
1229
+ ['Multi-language Support', 'Full support for top 10 languages', '4 weeks', 'None'],
1230
+ ['Feedback Learning', 'Learn from user corrections', '4 weeks', 'Analytics'],
1231
+ ]
1232
+
1233
+ p3_table = Table(phase3_items, colWidths=[130, 180, 60, 80])
1234
+ p3_table.setStyle(TableStyle([
1235
+ ('BACKGROUND', (0, 0), (-1, 0), ACCENT_BLUE),
1236
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1237
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1238
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1239
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1240
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1241
+         ('LEFTPADDING', (0, 0), (-1, -1), 4), ('RIGHTPADDING', (0, 0), (-1, -1), 4), ('TOPPADDING', (0, 0), (-1, -1), 4), ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
1242
+ ]))
1243
+ story.append(p3_table)
1244
+ story.append(Paragraph('Table 14: Phase 3 Deliverables', styles['Caption']))
1245
+
1246
+ story.append(PageBreak())
1247
+
1248
+ # ========== 8. RISK ASSESSMENT ==========
1249
+ story.append(Paragraph('8. Risk Assessment', styles['SectionHeader']))
1250
+
1251
+ story.append(Paragraph('<b>8.1 Technical Risks</b>', styles['SubsectionHeader']))
1252
+
1253
+ tech_risks = [
1254
+ ['Risk', 'Probability', 'Impact', 'Mitigation'],
1255
+ ['LLM API Changes', 'Medium', 'High', 'Abstract LLM interface; support multiple providers'],
1256
+ ['Scaling Bottlenecks', 'High', 'High', 'Early load testing; phased rollout'],
1257
+ ['Model Accuracy Plateau', 'Medium', 'Medium', 'Ensemble approaches; fine-tuning capability'],
1258
+ ['Dependency Vulnerabilities', 'Medium', 'Medium', 'Regular dependency audits; Dependabot'],
1259
+ ['Data Loss', 'Low', 'Critical', 'Automated backups; disaster recovery plan'],
1260
+ ]
1261
+
1262
+ tr_table = Table(tech_risks, colWidths=[120, 70, 70, 190])
1263
+ tr_table.setStyle(TableStyle([
1264
+ ('BACKGROUND', (0, 0), (-1, 0), DANGER_RED),
1265
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1266
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1267
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1268
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1269
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1270
+         ('LEFTPADDING', (0, 0), (-1, -1), 5), ('RIGHTPADDING', (0, 0), (-1, -1), 5), ('TOPPADDING', (0, 0), (-1, -1), 5), ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
1271
+ ('ALIGN', (1, 0), (2, -1), 'CENTER'),
1272
+ ]))
1273
+ story.append(tr_table)
1274
+ story.append(Paragraph('Table 15: Technical Risks', styles['Caption']))
1275
+
1276
+ story.append(Spacer(1, 15))
1277
+ story.append(Paragraph('<b>8.2 Project Risks</b>', styles['SubsectionHeader']))
1278
+
1279
+ proj_risks = [
1280
+ ['Risk', 'Probability', 'Impact', 'Mitigation'],
1281
+ ['Scope Creep', 'High', 'Medium', 'Strict phase gates; change control process'],
1282
+ ['Resource Constraints', 'Medium', 'High', 'Prioritized backlog; MVP focus'],
1283
+ ['Timeline Slippage', 'Medium', 'Medium', 'Buffer time; parallel workstreams'],
1284
+ ['Knowledge Silos', 'Medium', 'Medium', 'Documentation; pair programming; code reviews'],
1285
+ ['Stakeholder Alignment', 'Low', 'High', 'Regular demos; feedback cycles'],
1286
+ ]
1287
+
1288
+ pr_table = Table(proj_risks, colWidths=[120, 70, 70, 190])
1289
+ pr_table.setStyle(TableStyle([
1290
+ ('BACKGROUND', (0, 0), (-1, 0), WARNING_ORANGE),
1291
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1292
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1293
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1294
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1295
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1296
+         ('LEFTPADDING', (0, 0), (-1, -1), 5), ('RIGHTPADDING', (0, 0), (-1, -1), 5), ('TOPPADDING', (0, 0), (-1, -1), 5), ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
1297
+ ('ALIGN', (1, 0), (2, -1), 'CENTER'),
1298
+ ]))
1299
+ story.append(pr_table)
1300
+ story.append(Paragraph('Table 16: Project Risks', styles['Caption']))
1301
+
1302
+ story.append(PageBreak())
1303
+
1304
+ # ========== 9. RESOURCE REQUIREMENTS ==========
1305
+ story.append(Paragraph('9. Resource Requirements', styles['SectionHeader']))
1306
+
1307
+ story.append(Paragraph('<b>9.1 Team Structure (Recommended)</b>', styles['SubsectionHeader']))
1308
+
1309
+ team = [
1310
+ ['Role', 'Count', 'Phase 1', 'Phase 2', 'Phase 3'],
1311
+ ['Senior ML Engineer', '2', '✓', '✓', '✓'],
1312
+ ['Backend Developer', '2', '✓', '✓', '✓'],
1313
+ ['Frontend Developer', '1', '✓', '✓', '✓'],
1314
+ ['DevOps Engineer', '1', '✓', '✓', '✓'],
1315
+ ['QA Engineer', '1', '—', '✓', '✓'],
1316
+ ['Technical Lead', '1', '✓', '✓', '✓'],
1317
+ ['Product Manager', '1', '✓', '✓', '✓'],
1318
+ ]
1319
+
1320
+ team_table = Table(team, colWidths=[130, 60, 70, 70, 70])
1321
+ team_table.setStyle(TableStyle([
1322
+ ('BACKGROUND', (0, 0), (-1, 0), PRIMARY_BLUE),
1323
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1324
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1325
+ ('FONTSIZE', (0, 0), (-1, -1), 9),
1326
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1327
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1328
+         ('LEFTPADDING', (0, 0), (-1, -1), 6), ('RIGHTPADDING', (0, 0), (-1, -1), 6), ('TOPPADDING', (0, 0), (-1, -1), 6), ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
1329
+ ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
1330
+ ]))
1331
+ story.append(team_table)
1332
+ story.append(Paragraph('Table 17: Team Structure', styles['Caption']))
1333
+
1334
+ story.append(Spacer(1, 15))
1335
+ story.append(Paragraph('<b>9.2 Infrastructure Requirements</b>', styles['SubsectionHeader']))
1336
+
1337
+ infra = [
1338
+ ['Component', 'Development', 'Staging', 'Production'],
1339
+ ['GPU Servers', '1x A100 40GB', '2x A100 40GB', '4x A100 80GB'],
1340
+ ['CPU Servers', '4 vCPU, 16GB', '8 vCPU, 32GB', '16 vCPU, 64GB x3'],
1341
+ ['Storage', '500GB SSD', '2TB SSD', '10TB SSD + S3'],
1342
+ ['Vector DB', 'ChromaDB local', 'Milvus single', 'Milvus cluster'],
1343
+ ['Cache', 'In-memory', 'Redis single', 'Redis cluster'],
1344
+ ['Load Balancer', 'None', 'Nginx', 'AWS ALB / GCP LB'],
1345
+ ]
1346
+
1347
+ infra_table = Table(infra, colWidths=[100, 120, 120, 110])
1348
+ infra_table.setStyle(TableStyle([
1349
+ ('BACKGROUND', (0, 0), (-1, 0), SECONDARY_BLUE),
1350
+ ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
1351
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
1352
+ ('FONTSIZE', (0, 0), (-1, -1), 8),
1353
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
1354
+ ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, GRAY_LIGHT]),
1355
+         ('LEFTPADDING', (0, 0), (-1, -1), 5), ('RIGHTPADDING', (0, 0), (-1, -1), 5), ('TOPPADDING', (0, 0), (-1, -1), 5), ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
1356
+ ]))
1357
+ story.append(infra_table)
1358
+ story.append(Paragraph('Table 18: Infrastructure Requirements', styles['Caption']))
1359
+
1360
+ story.append(PageBreak())
1361
+
1362
+ # ========== 10. CONCLUSION ==========
1363
+ story.append(Paragraph('10. Conclusion & Recommendations', styles['SectionHeader']))
1364
+
1365
+ story.append(Paragraph('<b>10.1 Summary</b>', styles['SubsectionHeader']))
1366
+ story.append(Paragraph(
1367
+ '''SPARKNET has achieved significant progress as a proof-of-concept for multi-agentic
1368
+ document intelligence. The core RAG pipeline is functional, demonstrating the viability
1369
+ of the 5-agent architecture with self-correction capabilities. The system successfully
1370
+ processes documents, performs hybrid retrieval, and generates citation-backed responses.''',
1371
+ styles['CustomBody']
1372
+ ))
1373
+
1374
+ story.append(Spacer(1, 10))
1375
+ story.append(Paragraph('<b>10.2 Key Recommendations</b>', styles['SubsectionHeader']))
1376
+
1377
+ recommendations = [
1378
+ '<b>Prioritize API Development:</b> Enable external integrations and unlock enterprise adoption.',
1379
+ '<b>Invest in Security:</b> Authentication and authorization are prerequisites for any production deployment.',
1380
+ '<b>Focus on Performance:</b> Current latency is acceptable for demos but needs significant improvement for production use.',
1381
+ '<b>Expand Document Support:</b> Office formats (Word, Excel, PowerPoint) are critical for enterprise adoption.',
1382
+ '<b>Implement Monitoring:</b> Observability is essential for maintaining and scaling the system.',
1383
+ '<b>Plan for Scale Early:</b> Architectural decisions made now will impact scalability; consider distributed architecture.',
1384
+ ]
1385
+
1386
+ for rec in recommendations:
1387
+ story.append(Paragraph(f'• {rec}', styles['BulletText']))
1388
+
1389
+ story.append(Spacer(1, 15))
1390
+ story.append(Paragraph('<b>10.3 Immediate Next Steps</b>', styles['SubsectionHeader']))
1391
+
1392
+ next_steps = [
1393
+ '1. Finalize Phase 1 scope and create detailed sprint plans',
1394
+ '2. Set up development infrastructure and CI/CD pipeline',
1395
+ '3. Begin REST API development (target: 4 weeks)',
1396
+ '4. Initiate security assessment and authentication design',
1397
+ '5. Start documentation and knowledge transfer activities',
1398
+ '6. Schedule bi-weekly stakeholder demos for continuous feedback',
1399
+ ]
1400
+
1401
+ for step in next_steps:
1402
+ story.append(Paragraph(step, styles['BulletText']))
1403
+
1404
+ story.append(Spacer(1, 30))
1405
+
1406
+ # Final signature block
1407
+ story.append(HRFlowable(width='100%', thickness=1, color=PRIMARY_BLUE))
1408
+ story.append(Spacer(1, 15))
1409
+
1410
+ story.append(Paragraph(
1411
+ f'''<b>Document prepared by:</b> SPARKNET Development Team<br/>
1412
+ <b>Report Date:</b> {datetime.now().strftime('%B %d, %Y')}<br/>
1413
+ <b>Version:</b> 1.0<br/>
1414
+ <b>Classification:</b> Internal / Confidential''',
1415
+ styles['CustomBody']
1416
+ ))
1417
+
1418
+ story.append(Spacer(1, 20))
1419
+ story.append(Paragraph(
1420
+ '<i>This document contains confidential information intended for stakeholder review. '
1421
+ 'Please do not distribute without authorization.</i>',
1422
+ styles['Caption']
1423
+ ))
1424
+
1425
+ # Build PDF
1426
+ doc.build(story)
1427
+ print(f"Report generated: {filename}")
1428
+ return filename
1429
+
1430
+
1431
+ if __name__ == '__main__':
1432
+ generate_report()
examples/document_agent.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Example: DocumentAgent with ReAct-style Processing
3
+
4
+ Demonstrates:
5
+ 1. Loading and processing documents
6
+ 2. Field extraction with evidence
7
+ 3. Document classification
8
+ 4. Question answering with grounding
9
+ """
10
+
11
+ import asyncio
12
+ from pathlib import Path
13
+ from loguru import logger
14
+
15
+ # Import DocumentAgent
16
+ from src.agents.document_agent import (
17
+ DocumentAgent,
18
+ AgentConfig,
19
+ )
20
+ from src.document.schemas.extraction import (
21
+ ExtractionSchema,
22
+ FieldDefinition,
23
+ )
24
+
25
+
26
+ async def example_basic_agent():
27
+ """Basic agent usage."""
28
+ print("=" * 50)
29
+ print("Basic DocumentAgent Usage")
30
+ print("=" * 50)
31
+
32
+ # Create agent with custom config
33
+ config = AgentConfig(
34
+ default_model="llama3.2:3b",
35
+ max_iterations=10,
36
+ temperature=0.1,
37
+ )
38
+ agent = DocumentAgent(config)
39
+
40
+ # Load document
41
+ sample_doc = Path("./data/sample.pdf")
42
+ if not sample_doc.exists():
43
+ print(f"Sample document not found: {sample_doc}")
44
+ print("Create a sample PDF at ./data/sample.pdf")
45
+ return
46
+
47
+ print(f"\nLoading document: {sample_doc}")
48
+ await agent.load_document(str(sample_doc))
49
+
50
+ print(f"Document loaded: {agent.document.metadata.filename}")
51
+ print(f"Pages: {agent.document.metadata.num_pages}")
52
+ print(f"Chunks: {len(agent.document.chunks)}")
53
+
54
+
55
+ async def example_field_extraction():
56
+ """Extract structured fields with evidence."""
57
+ print("\n" + "=" * 50)
58
+ print("Field Extraction with Evidence")
59
+ print("=" * 50)
60
+
61
+ agent = DocumentAgent()
62
+
63
+ sample_doc = Path("./data/sample.pdf")
64
+ if not sample_doc.exists():
65
+ print("Sample document not found")
66
+ return
67
+
68
+ await agent.load_document(str(sample_doc))
69
+
70
+ # Define extraction schema
71
+ schema = ExtractionSchema(
72
+ name="document_info",
73
+ description="Extract key document information",
74
+ fields=[
75
+ FieldDefinition(
76
+ name="title",
77
+ field_type="string",
78
+ description="Document title",
79
+ required=True,
80
+ ),
81
+ FieldDefinition(
82
+ name="author",
83
+ field_type="string",
84
+ description="Document author or organization",
85
+ required=False,
86
+ ),
87
+ FieldDefinition(
88
+ name="date",
89
+ field_type="string",
90
+ description="Document date",
91
+ required=False,
92
+ ),
93
+ FieldDefinition(
94
+ name="summary",
95
+ field_type="string",
96
+ description="Brief summary of document content",
97
+ required=True,
98
+ ),
99
+ ],
100
+ )
101
+
102
+ # Extract fields
103
+ print("\nExtracting fields...")
104
+ result = await agent.extract_fields(schema)
105
+
106
+ print(f"\nExtracted Fields:")
107
+ for field, value in result.fields.items():
108
+ print(f" {field}: {value}")
109
+
110
+ print(f"\nConfidence: {result.confidence:.2f}")
111
+
112
+ if result.evidence:
113
+ print(f"\nEvidence ({len(result.evidence)} sources):")
114
+ for ev in result.evidence[:3]:
115
+ print(f" - Page {ev.page + 1}: {ev.snippet[:80]}...")
116
+
117
+
118
+ async def example_classification():
119
+ """Classify document type."""
120
+ print("\n" + "=" * 50)
121
+ print("Document Classification")
122
+ print("=" * 50)
123
+
124
+ agent = DocumentAgent()
125
+
126
+ sample_doc = Path("./data/sample.pdf")
127
+ if not sample_doc.exists():
128
+ print("Sample document not found")
129
+ return
130
+
131
+ await agent.load_document(str(sample_doc))
132
+
133
+ # Classify
134
+ print("\nClassifying document...")
135
+ classification = await agent.classify()
136
+
137
+ print(f"\nDocument Type: {classification.document_type.value}")
138
+ print(f"Confidence: {classification.confidence:.2f}")
139
+ print(f"Reasoning: {classification.reasoning}")
140
+
141
+ if classification.metadata:
142
+ print(f"\nAdditional metadata:")
143
+ for key, value in classification.metadata.items():
144
+ print(f" {key}: {value}")
145
+
146
+
147
+ async def example_question_answering():
148
+ """Answer questions about document with evidence."""
149
+ print("\n" + "=" * 50)
150
+ print("Question Answering with Evidence")
151
+ print("=" * 50)
152
+
153
+ agent = DocumentAgent()
154
+
155
+ sample_doc = Path("./data/sample.pdf")
156
+ if not sample_doc.exists():
157
+ print("Sample document not found")
158
+ return
159
+
160
+ await agent.load_document(str(sample_doc))
161
+
162
+ # Questions to ask
163
+ questions = [
164
+ "What is this document about?",
165
+ "What are the main findings or conclusions?",
166
+ "Are there any tables or figures? What do they show?",
167
+ ]
168
+
169
+ for question in questions:
170
+ print(f"\nQ: {question}")
171
+ print("-" * 40)
172
+
173
+ answer, evidence = await agent.answer_question(question)
174
+
175
+ print(f"A: {answer}")
176
+
177
+ if evidence:
178
+ print(f"\nEvidence:")
179
+ for ev in evidence[:2]:
180
+ print(f" - Page {ev.page + 1} ({ev.source_type}): {ev.snippet[:60]}...")
181
+
182
+
183
+ async def example_react_task():
184
+ """Run a complex task with ReAct-style reasoning."""
185
+ print("\n" + "=" * 50)
186
+ print("ReAct-style Task Execution")
187
+ print("=" * 50)
188
+
189
+ agent = DocumentAgent()
190
+
191
+ sample_doc = Path("./data/sample.pdf")
192
+ if not sample_doc.exists():
193
+ print("Sample document not found")
194
+ return
195
+
196
+ await agent.load_document(str(sample_doc))
197
+
198
+ # Complex task
199
+ task = """
200
+ Analyze this document and provide:
201
+ 1. A brief summary of the content
202
+ 2. The document type and purpose
203
+ 3. Any key data points or figures mentioned
204
+ 4. Your confidence in the analysis
205
+ """
206
+
207
+ print(f"\nTask: {task}")
208
+ print("-" * 40)
209
+
210
+ # Run with trace
211
+ result, trace = await agent.run(task)
212
+
213
+ print(f"\nResult:\n{result}")
214
+
215
+ print(f"\n--- Agent Trace ---")
216
+ print(f"Steps: {len(trace.steps)}")
217
+ print(f"Tools used: {trace.tools_used}")
218
+ print(f"Total time: {trace.total_time:.2f}s")
219
+
220
+ # Show thinking process
221
+ print(f"\nReasoning trace:")
222
+ for i, step in enumerate(trace.steps[:5], 1):
223
+ print(f"\n[Step {i}] {step.action}")
224
+ if step.thought:
225
+ print(f" Thought: {step.thought[:100]}...")
226
+ if step.observation:
227
+ print(f" Observation: {step.observation[:100]}...")
228
+
229
+
230
+ async def main():
231
+ """Run all examples."""
232
+ await example_basic_agent()
233
+ await example_field_extraction()
234
+ await example_classification()
235
+ await example_question_answering()
236
+ await example_react_task()
237
+
238
+
239
+ if __name__ == "__main__":
240
+ asyncio.run(main())
examples/document_intelligence_demo.py ADDED
@@ -0,0 +1,314 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Document Intelligence Demo
4
+
5
+ Demonstrates the capabilities of the SPARKNET document_intelligence subsystem:
6
+ - Document parsing with OCR and layout detection
7
+ - Schema-driven field extraction
8
+ - Visual grounding with evidence
9
+ - Question answering
10
+ - Document classification
11
+ """
12
+
13
+ import asyncio
14
+ import json
15
+ from pathlib import Path
16
+
17
+ # Add project root to path
18
+ import sys
19
+ sys.path.insert(0, str(Path(__file__).parent.parent))
20
+
21
+
22
+ def demo_parse_document(doc_path: str):
23
+ """Demo: Parse a document into semantic chunks."""
24
+ print("\n" + "=" * 60)
25
+ print("1. DOCUMENT PARSING")
26
+ print("=" * 60)
27
+
28
+ from src.document_intelligence import (
29
+ DocumentParser,
30
+ ParserConfig,
31
+ )
32
+
33
+ # Configure parser
34
+ config = ParserConfig(
35
+ render_dpi=200,
36
+ max_pages=5, # Limit for demo
37
+ include_markdown=True,
38
+ )
39
+
40
+ parser = DocumentParser(config=config)
41
+
42
+ print(f"\nParsing: {doc_path}")
43
+ result = parser.parse(doc_path)
44
+
45
+ print(f"\nDocument ID: {result.doc_id}")
46
+ print(f"Filename: {result.filename}")
47
+ print(f"Pages: {result.num_pages}")
48
+ print(f"Chunks: {len(result.chunks)}")
49
+ print(f"Processing time: {result.processing_time_ms:.0f}ms")
50
+
51
+ # Show chunk summary by type
52
+ print("\nChunk types:")
53
+ by_type = {}
54
+ for chunk in result.chunks:
55
+ t = chunk.chunk_type.value
56
+ by_type[t] = by_type.get(t, 0) + 1
57
+
58
+ for t, count in sorted(by_type.items()):
59
+ print(f" - {t}: {count}")
60
+
61
+ # Show first few chunks
62
+ print("\nFirst 3 chunks:")
63
+ for i, chunk in enumerate(result.chunks[:3]):
64
+ print(f"\n [{i+1}] Type: {chunk.chunk_type.value}, Page: {chunk.page}")
65
+ print(f" ID: {chunk.chunk_id}")
66
+ print(f" Text: {chunk.text[:100]}...")
67
+ print(f" BBox: {chunk.bbox.xyxy}")
68
+ print(f" Confidence: {chunk.confidence:.2f}")
69
+
70
+ return result
71
+
72
+
73
+ def demo_extract_fields(parse_result):
74
+ """Demo: Extract fields using a schema."""
75
+ print("\n" + "=" * 60)
76
+ print("2. SCHEMA-DRIVEN EXTRACTION")
77
+ print("=" * 60)
78
+
79
+ from src.document_intelligence import (
80
+ FieldExtractor,
81
+ ExtractionSchema,
82
+ FieldType,
83
+ ExtractionValidator,
84
+ )
85
+
86
+ # Create a custom schema
87
+ schema = ExtractionSchema(
88
+ name="DocumentInfo",
89
+ description="Basic document information",
90
+ )
91
+
92
+ schema.add_string_field("title", "Document title or heading", required=True)
93
+ schema.add_string_field("date", "Document date", required=False)
94
+ schema.add_string_field("author", "Author or organization name", required=False)
95
+ schema.add_string_field("reference_number", "Reference or ID number", required=False)
96
+
97
+ print(f"\nExtraction schema: {schema.name}")
98
+ print("Fields:")
99
+ for field in schema.fields:
100
+ req = "required" if field.required else "optional"
101
+ print(f" - {field.name} ({field.field_type.value}, {req})")
102
+
103
+ # Extract fields
104
+ extractor = FieldExtractor()
105
+ result = extractor.extract(parse_result, schema)
106
+
107
+ print("\nExtracted data:")
108
+ for key, value in result.data.items():
109
+ status = " [ABSTAINED]" if key in result.abstained_fields else ""
110
+ print(f" {key}: {value}{status}")
111
+
112
+ print(f"\nOverall confidence: {result.overall_confidence:.2f}")
113
+
114
+ # Show evidence
115
+ if result.evidence:
116
+ print("\nEvidence:")
117
+ for ev in result.evidence[:3]:
118
+ print(f" - Page {ev.page}, Chunk {ev.chunk_id[:12]}...")
119
+ print(f" Snippet: {ev.snippet[:80]}...")
120
+
121
+ # Validate
122
+ validator = ExtractionValidator()
123
+ validation = validator.validate(result, schema)
124
+
125
+ print(f"\nValidation: {'PASSED' if validation.is_valid else 'FAILED'}")
126
+ if validation.issues:
127
+ print("Issues:")
128
+ for issue in validation.issues[:3]:
129
+ print(f" - [{issue.severity}] {issue.field_name}: {issue.message}")
130
+
131
+ return result
132
+
133
+
134
+ def demo_search_and_qa(parse_result):
135
+ """Demo: Search and question answering."""
136
+ print("\n" + "=" * 60)
137
+ print("3. SEARCH AND Q&A")
138
+ print("=" * 60)
139
+
140
+ from src.document_intelligence.tools import get_tool
141
+
142
+ # Search demo
143
+ print("\nSearching for 'document'...")
144
+ search_tool = get_tool("search_chunks")
145
+ search_result = search_tool.execute(
146
+ parse_result=parse_result,
147
+ query="document",
148
+ top_k=5,
149
+ )
150
+
151
+ if search_result.success:
152
+ matches = search_result.data.get("results", [])
153
+ print(f"Found {len(matches)} matches:")
154
+ for i, match in enumerate(matches[:3], 1):
155
+ print(f" {i}. Page {match['page']}, Type: {match['type']}")
156
+ print(f" Score: {match['score']:.2f}")
157
+ print(f" Text: {match['text'][:80]}...")
158
+
159
+ # Q&A demo
160
+ print("\nAsking: 'What is this document about?'")
161
+ qa_tool = get_tool("answer_question")
162
+ qa_result = qa_tool.execute(
163
+ parse_result=parse_result,
164
+ question="What is this document about?",
165
+ )
166
+
167
+ if qa_result.success:
168
+ print(f"Answer: {qa_result.data.get('answer', 'No answer')}")
169
+ print(f"Confidence: {qa_result.data.get('confidence', 0):.2f}")
170
+
171
+
172
+ def demo_grounding(parse_result, doc_path: str):
173
+ """Demo: Visual grounding with crops."""
174
+ print("\n" + "=" * 60)
175
+ print("4. VISUAL GROUNDING")
176
+ print("=" * 60)
177
+
178
+ from src.document_intelligence import (
179
+ load_document,
180
+ RenderOptions,
181
+ )
182
+ from src.document_intelligence.grounding import (
183
+ EvidenceBuilder,
184
+ crop_region,
185
+ create_annotated_image,
186
+ )
187
+
188
+ # Load page image
189
+ loader, renderer = load_document(doc_path)
190
+ page_image = renderer.render_page(1, RenderOptions(dpi=200))
191
+ loader.close()
192
+
193
+ print(f"\nPage 1 image size: {page_image.shape}")
194
+
195
+ # Get chunks from page 1
196
+ page_chunks = [c for c in parse_result.chunks if c.page == 1]
197
+ print(f"Page 1 chunks: {len(page_chunks)}")
198
+
199
+ # Create evidence for first chunk
200
+ if page_chunks:
201
+ chunk = page_chunks[0]
202
+ evidence_builder = EvidenceBuilder()
203
+
204
+ evidence = evidence_builder.create_evidence(
205
+ chunk=chunk,
206
+ value=chunk.text[:50],
207
+ field_name="example_field",
208
+ )
209
+
210
+ print(f"\nEvidence created:")
211
+ print(f" Chunk ID: {evidence.chunk_id}")
212
+ print(f" Page: {evidence.page}")
213
+ print(f" BBox: {evidence.bbox.xyxy}")
214
+ print(f" Snippet: {evidence.snippet[:80]}...")
215
+
216
+ # Crop region
217
+ crop = crop_region(page_image, chunk.bbox)
218
+ print(f" Crop size: {crop.shape}")
219
+
220
+ # Create annotated image (preview)
221
+ print("\nAnnotated image would include bounding boxes for all chunks.")
222
+ print("Use the CLI 'sparknet docint visualize' command to generate.")
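+     # A minimal in-code sketch using the helper imported above (its exact
+     # signature is assumed, so the calls are left commented out):
+     # annotated = create_annotated_image(page_image, page_chunks)
+     # from PIL import Image
+     # Image.fromarray(annotated).save("page1_annotated.png")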
223
+
224
+
225
+ def demo_classification(parse_result):
226
+ """Demo: Document classification."""
227
+ print("\n" + "=" * 60)
228
+ print("5. DOCUMENT CLASSIFICATION")
229
+ print("=" * 60)
230
+
231
+ from src.document_intelligence.chunks import DocumentType
232
+
233
+ # Simple keyword-based classification
234
+ first_page = [c for c in parse_result.chunks if c.page == 1][:5]
235
+ content = " ".join(c.text for c in first_page).lower()
236
+
237
+ type_keywords = {
238
+ "invoice": ["invoice", "bill", "payment due", "amount due"],
239
+ "contract": ["agreement", "contract", "party", "whereas"],
240
+ "receipt": ["receipt", "paid", "transaction"],
241
+ "patent": ["patent", "claims", "invention"],
242
+ "report": ["report", "findings", "summary"],
243
+ }
244
+
245
+ detected_type = "other"
246
+ confidence = 0.3
247
+
248
+ for doc_type, keywords in type_keywords.items():
249
+ matches = sum(1 for k in keywords if k in content)
250
+ if matches >= 2:
251
+ detected_type = doc_type
252
+ confidence = min(0.95, 0.5 + matches * 0.15)
253
+ break
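+     # Worked example: 2 keyword hits -> min(0.95, 0.5 + 2 * 0.15) = 0.80;
+     # 3 hits -> 0.95. The cap keeps this heuristic's confidence below 1.0.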
254
+
255
+ print(f"\nDetected type: {detected_type}")
256
+ print(f"Confidence: {confidence:.2f}")
257
+
258
+
259
+ def main():
260
+ """Run all demos."""
261
+ print("=" * 60)
262
+ print("SPARKNET Document Intelligence Demo")
263
+ print("=" * 60)
264
+
265
+ # Check for sample document
266
+ sample_paths = [
267
+ Path("Dataset/Patent_1.pdf"),
268
+ Path("data/sample.pdf"),
269
+ Path("tests/fixtures/sample.pdf"),
270
+ ]
271
+
272
+ doc_path = None
273
+ for path in sample_paths:
274
+ if path.exists():
275
+ doc_path = str(path)
276
+ break
277
+
278
+ if not doc_path:
279
+ print("\nNo sample document found.")
280
+ print("Please provide a PDF file path as argument.")
281
+ print("\nUsage: python document_intelligence_demo.py [path/to/document.pdf]")
282
+
283
+ if len(sys.argv) > 1:
284
+ doc_path = sys.argv[1]
285
+ else:
286
+ return
287
+
288
+ print(f"\nUsing document: {doc_path}")
289
+
290
+ try:
291
+ # Run demos
292
+ parse_result = demo_parse_document(doc_path)
293
+ demo_extract_fields(parse_result)
294
+ demo_search_and_qa(parse_result)
295
+ demo_grounding(parse_result, doc_path)
296
+ demo_classification(parse_result)
297
+
298
+ print("\n" + "=" * 60)
299
+ print("Demo complete!")
300
+ print("=" * 60)
301
+
302
+ except ImportError as e:
303
+ print(f"\nImport error: {e}")
304
+ print("Make sure all dependencies are installed:")
305
+ print(" pip install pymupdf pillow numpy pydantic")
306
+
307
+ except Exception as e:
308
+ print(f"\nError: {e}")
309
+ import traceback
310
+ traceback.print_exc()
311
+
312
+
313
+ if __name__ == "__main__":
314
+ main()
examples/document_processing.py ADDED
@@ -0,0 +1,133 @@
1
+ """
2
+ Example: Document Processing Pipeline
3
+
4
+ Demonstrates:
5
+ 1. Processing a PDF document
6
+ 2. Extracting text with OCR
7
+ 3. Layout detection
8
+ 4. Semantic chunking
9
+ """
10
+
11
+ import asyncio
12
+ from pathlib import Path
13
+ from loguru import logger
14
+
15
+ # Import document processing components
16
+ from src.document.pipeline import (
17
+ PipelineConfig,
18
+ DocumentProcessor,
19
+ process_document,
20
+ )
21
+ from src.document.ocr import OCRConfig
22
+
23
+
24
+ def example_basic_processing():
25
+ """Basic document processing example."""
26
+ print("=" * 50)
27
+ print("Basic Document Processing")
28
+ print("=" * 50)
29
+
30
+ # Configure pipeline
31
+ config = PipelineConfig(
32
+ ocr=OCRConfig(engine="paddleocr"),
33
+ render_dpi=300,
34
+ max_pages=5, # Limit for demo
35
+ )
36
+
37
+ # Create processor
38
+ processor = DocumentProcessor(config)
39
+
40
+ # Process a sample document
41
+ # NOTE: Replace with actual document path
42
+ sample_doc = Path("./data/sample.pdf")
43
+
44
+ if not sample_doc.exists():
45
+ print(f"Sample document not found: {sample_doc}")
46
+ print("Create a sample PDF at ./data/sample.pdf to run this example")
47
+ return
48
+
49
+ # Process
50
+ result = processor.process(sample_doc)
51
+
52
+ # Display results
53
+ print(f"\nDocument: {result.metadata.filename}")
54
+ print(f"Pages: {result.metadata.num_pages}")
55
+ print(f"Chunks: {result.metadata.total_chunks}")
56
+ print(f"Characters: {result.metadata.total_characters}")
57
+ print(f"OCR Confidence: {result.metadata.ocr_confidence_avg:.2%}")
58
+
59
+ print("\n--- Sample Chunks ---")
60
+ for i, chunk in enumerate(result.chunks[:3]):
61
+ print(f"\n[Chunk {i+1}] Type: {chunk.chunk_type.value}, Page: {chunk.page+1}")
62
+ print(f"Text: {chunk.text[:200]}...")
63
+ print(f"BBox: ({chunk.bbox.x_min:.0f}, {chunk.bbox.y_min:.0f}) -> ({chunk.bbox.x_max:.0f}, {chunk.bbox.y_max:.0f})")
64
+
65
+
66
+ def example_with_layout():
67
+ """Document processing with layout analysis."""
68
+ print("\n" + "=" * 50)
69
+ print("Document Processing with Layout Analysis")
70
+ print("=" * 50)
71
+
72
+ from src.document.layout import LayoutConfig, LayoutType
73
+
74
+ # Configure with layout detection
75
+ config = PipelineConfig(
76
+ ocr=OCRConfig(engine="paddleocr"),
77
+ layout=LayoutConfig(method="rule_based"),
78
+ include_layout_regions=True,
79
+ )
80
+
81
+ processor = DocumentProcessor(config)
82
+
83
+ sample_doc = Path("./data/sample.pdf")
84
+ if not sample_doc.exists():
85
+ print("Sample document not found")
86
+ return
87
+
88
+ result = processor.process(sample_doc)
89
+
90
+ # Count layout types
91
+ layout_counts = {}
92
+ for region in result.layout_regions:
93
+ layout_type = region.layout_type.value
94
+ layout_counts[layout_type] = layout_counts.get(layout_type, 0) + 1
95
+
96
+ print(f"\nLayout Analysis:")
97
+ for layout_type, count in sorted(layout_counts.items()):
98
+ print(f" {layout_type}: {count} regions")
99
+
100
+ # Show tables if found
101
+ tables = [r for r in result.layout_regions if r.layout_type == LayoutType.TABLE]
102
+ if tables:
103
+ print(f"\n--- Tables Found ({len(tables)}) ---")
104
+ for i, table in enumerate(tables[:2]):
105
+ print(f"\nTable {i+1}: Page {table.page+1}")
106
+ print(f" Position: ({table.bbox.x_min:.0f}, {table.bbox.y_min:.0f})")
107
+ print(f" Size: {table.bbox.width:.0f} x {table.bbox.height:.0f}")
108
+
109
+
110
+ def example_convenience_function():
111
+ """Using the convenience function."""
112
+ print("\n" + "=" * 50)
113
+ print("Using Convenience Function")
114
+ print("=" * 50)
115
+
116
+ sample_doc = Path("./data/sample.pdf")
117
+ if not sample_doc.exists():
118
+ print("Sample document not found")
119
+ return
120
+
121
+ # Simple one-liner
122
+ result = process_document(sample_doc)
123
+
124
+ print(f"Processed: {result.metadata.filename}")
125
+ print(f"Chunks: {len(result.chunks)}")
126
+ print(f"\nFull text preview:")
127
+ print(result.full_text[:500] + "..." if len(result.full_text) > 500 else result.full_text)
128
+
129
+
130
+ if __name__ == "__main__":
131
+ example_basic_processing()
132
+ example_with_layout()
133
+ example_convenience_function()
examples/document_rag_end_to_end.py ADDED
@@ -0,0 +1,359 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Document Intelligence RAG End-to-End Example
4
+
5
+ Demonstrates the complete RAG workflow:
6
+ 1. Parse documents into semantic chunks
7
+ 2. Index chunks into vector store
8
+ 3. Semantic retrieval with filters
9
+ 4. Grounded question answering with evidence
10
+ 5. Evidence visualization
11
+
12
+ Requirements:
13
+ - ChromaDB: pip install chromadb
14
+ - Ollama running with nomic-embed-text model: ollama pull nomic-embed-text
15
+ - PyMuPDF: pip install pymupdf
16
+ """
17
+
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ # Add project root to path
22
+ sys.path.insert(0, str(Path(__file__).parent.parent))
23
+
24
+
25
+ def check_dependencies():
26
+ """Check that required dependencies are available."""
27
+ missing = []
28
+
29
+ try:
30
+ import chromadb
31
+ except ImportError:
32
+ missing.append("chromadb")
33
+
34
+ try:
35
+ import fitz # PyMuPDF
36
+ except ImportError:
37
+ missing.append("pymupdf")
38
+
39
+ if missing:
40
+ print("Missing dependencies:")
41
+ for dep in missing:
42
+ print(f" - {dep}")
43
+ print("\nInstall with: pip install " + " ".join(missing))
44
+ return False
45
+
46
+ # Check Ollama
47
+ try:
48
+ import requests
49
+ response = requests.get("http://localhost:11434/api/tags", timeout=2)
50
+ if response.status_code != 200:
51
+ print("Warning: Ollama server not responding")
52
+ print("Start Ollama with: ollama serve")
53
+ print("Then pull the embedding model: ollama pull nomic-embed-text")
54
+     except Exception:
55
+ print("Warning: Could not connect to Ollama server")
56
+ print("The example will still work but with mock embeddings")
57
+
58
+ return True
59
+
60
+
61
+ def demo_parse_and_index(doc_paths: list):
62
+ """
63
+ Demo: Parse documents and index into vector store.
64
+
65
+ Args:
66
+ doc_paths: List of document file paths
67
+ """
68
+ print("\n" + "=" * 60)
69
+ print("STEP 1: PARSE AND INDEX DOCUMENTS")
70
+ print("=" * 60)
71
+
72
+ from src.document_intelligence import DocumentParser, ParserConfig
73
+ from src.document_intelligence.tools import get_rag_tool
74
+
75
+ # Get the index tool
76
+ index_tool = get_rag_tool("index_document")
77
+
78
+ results = []
79
+ for doc_path in doc_paths:
80
+ print(f"\nProcessing: {doc_path}")
81
+
82
+ # Parse document first (optional - tool can do this)
83
+ config = ParserConfig(render_dpi=200, max_pages=10)
84
+ parser = DocumentParser(config=config)
85
+
86
+ try:
87
+ parse_result = parser.parse(doc_path)
88
+ print(f" Parsed: {len(parse_result.chunks)} chunks, {parse_result.num_pages} pages")
89
+
90
+ # Index the parse result
91
+ result = index_tool.execute(parse_result=parse_result)
92
+
93
+ if result.success:
94
+ print(f" Indexed: {result.data['chunks_indexed']} chunks")
95
+ print(f" Document ID: {result.data['document_id']}")
96
+ results.append({
97
+ "path": doc_path,
98
+ "doc_id": result.data['document_id'],
99
+ "chunks": result.data['chunks_indexed'],
100
+ })
101
+ else:
102
+ print(f" Error: {result.error}")
103
+
104
+ except Exception as e:
105
+ print(f" Failed: {e}")
106
+
107
+ return results
108
+
109
+
110
+ def demo_semantic_retrieval(query: str, document_id: str = None):
111
+ """
112
+ Demo: Semantic retrieval from vector store.
113
+
114
+ Args:
115
+ query: Search query
116
+ document_id: Optional document filter
117
+ """
118
+ print("\n" + "=" * 60)
119
+ print("STEP 2: SEMANTIC RETRIEVAL")
120
+ print("=" * 60)
121
+
122
+ from src.document_intelligence.tools import get_rag_tool
123
+
124
+ retrieve_tool = get_rag_tool("retrieve_chunks")
125
+
126
+ print(f"\nQuery: \"{query}\"")
127
+ if document_id:
128
+ print(f"Document filter: {document_id}")
129
+
130
+ result = retrieve_tool.execute(
131
+ query=query,
132
+ top_k=5,
133
+ document_id=document_id,
134
+ include_evidence=True,
135
+ )
136
+
137
+ if result.success:
138
+ chunks = result.data.get("chunks", [])
139
+ print(f"\nFound {len(chunks)} relevant chunks:\n")
140
+
141
+ for i, chunk in enumerate(chunks, 1):
142
+ print(f"{i}. [similarity={chunk['similarity']:.3f}]")
143
+ print(f" Page {chunk.get('page', '?')}, Type: {chunk.get('chunk_type', 'unknown')}")
144
+ print(f" Text: {chunk['text'][:150]}...")
145
+ print()
146
+
147
+ # Show evidence
148
+ if result.evidence:
149
+ print("Evidence references:")
150
+ for ev in result.evidence[:3]:
151
+ print(f" - Chunk {ev['chunk_id'][:12]}... Page {ev.get('page', '?')}")
152
+
153
+ return chunks
154
+ else:
155
+ print(f"Error: {result.error}")
156
+ return []
157
+
158
+
159
+ def demo_grounded_qa(question: str, document_id: str = None):
160
+ """
161
+ Demo: Grounded question answering with evidence.
162
+
163
+ Args:
164
+ question: Question to answer
165
+ document_id: Optional document filter
166
+ """
167
+ print("\n" + "=" * 60)
168
+ print("STEP 3: GROUNDED QUESTION ANSWERING")
169
+ print("=" * 60)
170
+
171
+ from src.document_intelligence.tools import get_rag_tool
172
+
173
+ qa_tool = get_rag_tool("rag_answer")
174
+
175
+ print(f"\nQuestion: \"{question}\"")
176
+
177
+ result = qa_tool.execute(
178
+ question=question,
179
+ document_id=document_id,
180
+ top_k=5,
181
+ )
182
+
183
+ if result.success:
184
+ data = result.data
185
+ print(f"\nAnswer: {data.get('answer', 'No answer')}")
186
+ print(f"Confidence: {data.get('confidence', 0):.2f}")
187
+
188
+ if data.get('abstained'):
189
+ print("Note: System abstained due to low confidence")
190
+
191
+ # Show citations if any
192
+ citations = data.get('citations', [])
193
+ if citations:
194
+ print("\nCitations:")
195
+ for cit in citations:
196
+ print(f" [{cit['index']}] {cit.get('text', '')[:80]}...")
197
+
198
+ # Show evidence
199
+ if result.evidence:
200
+ print("\nEvidence locations:")
201
+ for ev in result.evidence:
202
+ print(f" - Page {ev.get('page', '?')}: {ev.get('snippet', '')[:60]}...")
203
+
204
+ return data
205
+ else:
206
+ print(f"Error: {result.error}")
207
+ return None
208
+
209
+
210
+ def demo_filtered_retrieval():
211
+ """
212
+ Demo: Retrieval with various filters.
213
+ """
214
+ print("\n" + "=" * 60)
215
+ print("STEP 4: FILTERED RETRIEVAL")
216
+ print("=" * 60)
217
+
218
+ from src.document_intelligence.tools import get_rag_tool
219
+
220
+ retrieve_tool = get_rag_tool("retrieve_chunks")
221
+
222
+ # Filter by chunk type
223
+ print("\n--- Retrieving only table chunks ---")
224
+ result = retrieve_tool.execute(
225
+ query="data values",
226
+ top_k=3,
227
+ chunk_types=["table"],
228
+ )
229
+
230
+ if result.success:
231
+ chunks = result.data.get("chunks", [])
232
+ print(f"Found {len(chunks)} table chunks")
233
+ for chunk in chunks:
234
+ print(f" - Page {chunk.get('page', '?')}: {chunk['text'][:80]}...")
235
+
236
+ # Filter by page range
237
+ print("\n--- Retrieving from pages 1-3 only ---")
238
+ result = retrieve_tool.execute(
239
+ query="content",
240
+ top_k=3,
241
+ page_range=(1, 3),
242
+ )
243
+
244
+ if result.success:
245
+ chunks = result.data.get("chunks", [])
246
+ print(f"Found {len(chunks)} chunks from pages 1-3")
247
+ for chunk in chunks:
248
+ print(f" - Page {chunk.get('page', '?')}: {chunk['text'][:80]}...")
249
+
250
+
251
+ def demo_index_stats():
252
+ """
253
+ Demo: Show index statistics.
254
+ """
255
+ print("\n" + "=" * 60)
256
+ print("INDEX STATISTICS")
257
+ print("=" * 60)
258
+
259
+ from src.document_intelligence.tools import get_rag_tool
260
+
261
+ stats_tool = get_rag_tool("get_index_stats")
262
+ result = stats_tool.execute()
263
+
264
+ if result.success:
265
+ data = result.data
266
+ print(f"\nTotal chunks indexed: {data.get('total_chunks', 0)}")
267
+ print(f"Embedding model: {data.get('embedding_model', 'unknown')}")
268
+ print(f"Embedding dimension: {data.get('embedding_dimension', 'unknown')}")
269
+ else:
270
+ print(f"Error: {result.error}")
271
+
272
+
273
+ def main():
274
+ """Run the complete RAG demo."""
275
+ print("=" * 60)
276
+ print("SPARKNET Document Intelligence RAG Demo")
277
+ print("=" * 60)
278
+
279
+ # Check dependencies
280
+ if not check_dependencies():
281
+ print("\nPlease install missing dependencies and try again.")
282
+ return
283
+
284
+ # Find sample documents
285
+ sample_paths = [
286
+ Path("Dataset/Patent_1.pdf"),
287
+ Path("data/sample.pdf"),
288
+ Path("tests/fixtures/sample.pdf"),
289
+ ]
290
+
291
+ doc_paths = []
292
+ for path in sample_paths:
293
+ if path.exists():
294
+ doc_paths.append(str(path))
295
+ break
296
+
297
+ if not doc_paths:
298
+ print("\nNo sample documents found.")
299
+         print("Please provide one or more PDF file paths as arguments.")
300
+         print("\nUsage: python document_rag_end_to_end.py [path/to/document.pdf ...]")
301
+
302
+ if len(sys.argv) > 1:
303
+ doc_paths = sys.argv[1:]
304
+ else:
305
+ return
306
+
307
+ print(f"\nUsing documents: {doc_paths}")
308
+
309
+ try:
310
+ # Step 1: Parse and index
311
+ indexed_docs = demo_parse_and_index(doc_paths)
312
+
313
+ if not indexed_docs:
314
+ print("\nNo documents were indexed. Exiting.")
315
+ return
316
+
317
+ # Get first document ID for filtering
318
+ first_doc_id = indexed_docs[0]["doc_id"]
319
+
320
+ # Step 2: Semantic retrieval
321
+ demo_semantic_retrieval(
322
+ query="main topic content",
323
+ document_id=first_doc_id,
324
+ )
325
+
326
+ # Step 3: Grounded Q&A
327
+ demo_grounded_qa(
328
+ question="What is this document about?",
329
+ document_id=first_doc_id,
330
+ )
331
+
332
+ # Step 4: Filtered retrieval
333
+ demo_filtered_retrieval()
334
+
335
+ # Show stats
336
+ demo_index_stats()
337
+
338
+ print("\n" + "=" * 60)
339
+ print("Demo complete!")
340
+ print("=" * 60)
341
+
342
+ print("\nNext steps:")
343
+ print(" 1. Try the CLI: sparknet docint index your_document.pdf")
344
+ print(" 2. Query the index: sparknet docint retrieve 'your query'")
345
+ print(" 3. Ask questions: sparknet docint ask doc.pdf 'question' --use-rag")
346
+
347
+ except ImportError as e:
348
+ print(f"\nImport error: {e}")
349
+ print("Make sure all dependencies are installed:")
350
+ print(" pip install pymupdf pillow numpy pydantic chromadb")
351
+
352
+ except Exception as e:
353
+ print(f"\nError: {e}")
354
+ import traceback
355
+ traceback.print_exc()
356
+
357
+
358
+ if __name__ == "__main__":
359
+ main()
examples/rag_pipeline.py ADDED
@@ -0,0 +1,192 @@
1
+ """
2
+ Example: RAG Pipeline
3
+
4
+ Demonstrates:
5
+ 1. Indexing documents into vector store
6
+ 2. Semantic search
7
+ 3. Question answering with citations
8
+ """
9
+
10
+ from pathlib import Path
11
+ from loguru import logger
12
+
13
+ # Import RAG components
14
+ from src.rag import (
15
+ VectorStoreConfig,
16
+ EmbeddingConfig,
17
+ RetrieverConfig,
18
+ GeneratorConfig,
19
+ get_document_indexer,
20
+ get_document_retriever,
21
+ get_grounded_generator,
22
+ )
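+ # The *Config classes above are imported to show what can be customized; the
+ # get_* factories below rely on their defaults. A tuned setup might look like
+ # this (factory signature assumed, not verified):
+ #   retriever = get_document_retriever(RetrieverConfig(top_k=5))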
23
+
24
+
25
+ def example_indexing():
26
+ """Index documents into vector store."""
27
+ print("=" * 50)
28
+ print("Document Indexing")
29
+ print("=" * 50)
30
+
31
+ # Get indexer
32
+ indexer = get_document_indexer()
33
+
34
+ # Index a document
35
+ sample_doc = Path("./data/sample.pdf")
36
+
37
+ if not sample_doc.exists():
38
+ print(f"Sample document not found: {sample_doc}")
39
+ print("Create a sample PDF at ./data/sample.pdf")
40
+ return False
41
+
42
+ # Index
43
+ result = indexer.index_document(sample_doc)
44
+
45
+ if result.success:
46
+ print(f"\nIndexed: {result.source_path}")
47
+ print(f" Document ID: {result.document_id}")
48
+ print(f" Chunks indexed: {result.num_chunks_indexed}")
49
+ print(f" Chunks skipped: {result.num_chunks_skipped}")
50
+ else:
51
+ print(f"Indexing failed: {result.error}")
52
+ return False
53
+
54
+ # Show stats
55
+ stats = indexer.get_index_stats()
56
+ print(f"\nIndex Stats:")
57
+ print(f" Total chunks: {stats['total_chunks']}")
58
+ print(f" Documents: {stats['num_documents']}")
59
+ print(f" Embedding model: {stats['embedding_model']}")
60
+
61
+ return True
62
+
63
+
64
+ def example_search():
65
+ """Search indexed documents."""
66
+ print("\n" + "=" * 50)
67
+ print("Semantic Search")
68
+ print("=" * 50)
69
+
70
+ # Get retriever
71
+ retriever = get_document_retriever()
72
+
73
+ # Search queries
74
+ queries = [
75
+ "What is the main topic?",
76
+ "key findings",
77
+ "conclusions and recommendations",
78
+ ]
79
+
80
+ for query in queries:
81
+ print(f"\nQuery: '{query}'")
82
+
83
+ chunks = retriever.retrieve(query, top_k=3)
84
+
85
+ if not chunks:
86
+ print(" No results found")
87
+ continue
88
+
89
+ for i, chunk in enumerate(chunks, 1):
90
+ print(f"\n [{i}] Similarity: {chunk.similarity:.3f}")
91
+ if chunk.page is not None:
92
+ print(f" Page: {chunk.page + 1}")
93
+ print(f" Text: {chunk.text[:150]}...")
94
+
95
+
96
+ def example_question_answering():
97
+ """Answer questions using RAG."""
98
+ print("\n" + "=" * 50)
99
+ print("Question Answering with Citations")
100
+ print("=" * 50)
101
+
102
+ # Get generator
103
+ generator = get_grounded_generator()
104
+
105
+ # Questions
106
+ questions = [
107
+ "What is the main purpose of this document?",
108
+ "What are the key findings?",
109
+ "What recommendations are made?",
110
+ ]
111
+
112
+ for question in questions:
113
+ print(f"\nQuestion: {question}")
114
+ print("-" * 40)
115
+
116
+ result = generator.answer_question(question, top_k=5)
117
+
118
+ print(f"\nAnswer: {result.answer}")
119
+ print(f"\nConfidence: {result.confidence:.2f}")
120
+
121
+ if result.abstained:
122
+ print(f"Note: {result.abstain_reason}")
123
+
124
+ if result.citations:
125
+ print(f"\nCitations ({len(result.citations)}):")
126
+ for citation in result.citations:
127
+ page = f"Page {citation.page + 1}" if citation.page is not None else ""
128
+ print(f" [{citation.index}] {page}: {citation.text_snippet[:60]}...")
129
+
130
+
131
+ def example_filtered_search():
132
+ """Search with metadata filters."""
133
+ print("\n" + "=" * 50)
134
+ print("Filtered Search")
135
+ print("=" * 50)
136
+
137
+ retriever = get_document_retriever()
138
+
139
+ # Search only in tables
140
+ print("\nSearching for tables only...")
141
+ table_chunks = retriever.retrieve_tables("data values", top_k=3)
142
+
143
+ if table_chunks:
144
+ print(f"Found {len(table_chunks)} table chunks:")
145
+ for chunk in table_chunks:
146
+ print(f" - Page {chunk.page + 1}: {chunk.text[:100]}...")
147
+ else:
148
+ print("No table chunks found")
149
+
150
+ # Search specific page range
151
+ print("\nSearching pages 1-3...")
152
+ page_chunks = retriever.retrieve_by_page(
153
+ "introduction",
154
+ page_range=(0, 2),
155
+ top_k=3,
156
+ )
157
+
158
+ if page_chunks:
159
+ print(f"Found {len(page_chunks)} chunks in pages 1-3:")
160
+ for chunk in page_chunks:
161
+ print(f" - Page {chunk.page + 1}: {chunk.text[:100]}...")
162
+ else:
163
+ print("No chunks found in specified pages")
164
+
165
+
166
+ def example_full_pipeline():
167
+ """Complete RAG pipeline demo."""
168
+ print("\n" + "=" * 50)
169
+ print("Full RAG Pipeline Demo")
170
+ print("=" * 50)
171
+
172
+ # Step 1: Index
173
+ print("\n[Step 1] Indexing documents...")
174
+ if not example_indexing():
175
+ return
176
+
177
+ # Step 2: Search
178
+ print("\n[Step 2] Testing search...")
179
+ example_search()
180
+
181
+ # Step 3: Q&A
182
+ print("\n[Step 3] Question answering...")
183
+ example_question_answering()
184
+
185
+ print("\n" + "=" * 50)
186
+ print("Pipeline demo complete!")
187
+ print("=" * 50)
188
+
189
+
190
+ if __name__ == "__main__":
191
+ # Run full pipeline
192
+ example_full_pipeline()
nginx/nginx.conf ADDED
@@ -0,0 +1,254 @@
1
+ # SPARKNET Production Nginx Configuration
2
+ # Reverse proxy for API and Demo services
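+ #
+ # Quick smoke test once the stack is up (assuming the compose file publishes
+ # this server's port 80 on the host):
+ #   curl -i http://localhost/api/health    # proxied to sparknet-api:8000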
3
+
4
+ user nginx;
5
+ worker_processes auto;
6
+ error_log /var/log/nginx/error.log warn;
7
+ pid /var/run/nginx.pid;
8
+
9
+ events {
10
+ worker_connections 1024;
11
+ use epoll;
12
+ multi_accept on;
13
+ }
14
+
15
+ http {
16
+ include /etc/nginx/mime.types;
17
+ default_type application/octet-stream;
18
+
19
+ # Logging format
20
+ log_format main '$remote_addr - $remote_user [$time_local] "$request" '
21
+ '$status $body_bytes_sent "$http_referer" '
22
+ '"$http_user_agent" "$http_x_forwarded_for" '
23
+ 'rt=$request_time uct="$upstream_connect_time" '
24
+ 'uht="$upstream_header_time" urt="$upstream_response_time"';
25
+
26
+ access_log /var/log/nginx/access.log main;
27
+
28
+ # Performance optimizations
29
+ sendfile on;
30
+ tcp_nopush on;
31
+ tcp_nodelay on;
32
+ keepalive_timeout 65;
33
+ types_hash_max_size 2048;
34
+
35
+ # Gzip compression
36
+ gzip on;
37
+ gzip_vary on;
38
+ gzip_proxied any;
39
+ gzip_comp_level 6;
40
+ gzip_types text/plain text/css text/xml application/json application/javascript
41
+ application/xml application/xml+rss text/javascript application/x-javascript;
42
+ gzip_min_length 1000;
43
+
44
+ # Rate limiting zones
45
+ limit_req_zone $binary_remote_addr zone=api_limit:10m rate=30r/s;
46
+ limit_req_zone $binary_remote_addr zone=upload_limit:10m rate=5r/s;
47
+ limit_conn_zone $binary_remote_addr zone=conn_limit:10m;
48
+
49
+ # Security headers map
50
+ map $sent_http_content_type $security_headers {
51
+ default "always";
52
+ }
53
+
54
+ # Upstream servers
55
+ upstream sparknet_api {
56
+ server sparknet-api:8000;
57
+ keepalive 32;
58
+ }
59
+
60
+ upstream sparknet_demo {
61
+ server sparknet-demo:4000;
62
+ keepalive 32;
63
+ }
64
+
65
+ # HTTP redirect to HTTPS (uncomment for production with SSL)
66
+ # server {
67
+ # listen 80;
68
+ # listen [::]:80;
69
+ # server_name _;
70
+ # return 301 https://$host$request_uri;
71
+ # }
72
+
73
+ # Main HTTP server (development/internal)
74
+ server {
75
+ listen 80;
76
+ listen [::]:80;
77
+ server_name _;
78
+
79
+ # Connection limits
80
+ limit_conn conn_limit 20;
81
+
82
+ # Security headers
83
+ add_header X-Frame-Options "SAMEORIGIN" always;
84
+ add_header X-Content-Type-Options "nosniff" always;
85
+ add_header X-XSS-Protection "1; mode=block" always;
86
+ add_header Referrer-Policy "strict-origin-when-cross-origin" always;
87
+
88
+ # Client body size for file uploads
89
+ client_max_body_size 100M;
90
+ client_body_buffer_size 128k;
91
+ client_body_timeout 300s;
92
+
93
+ # Proxy timeouts
94
+ proxy_connect_timeout 60s;
95
+ proxy_send_timeout 300s;
96
+ proxy_read_timeout 300s;
97
+
98
+ # Health check endpoint (no rate limiting)
99
+ location /api/health {
100
+ proxy_pass http://sparknet_api;
101
+ proxy_http_version 1.1;
102
+ proxy_set_header Host $host;
103
+ proxy_set_header X-Real-IP $remote_addr;
104
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
105
+ proxy_set_header X-Forwarded-Proto $scheme;
106
+ }
107
+
108
+ # API endpoints
109
+ location /api/ {
110
+ # Rate limiting
111
+ limit_req zone=api_limit burst=50 nodelay;
112
+
113
+ proxy_pass http://sparknet_api;
114
+ proxy_http_version 1.1;
115
+
116
+ # Headers
117
+ proxy_set_header Host $host;
118
+ proxy_set_header X-Real-IP $remote_addr;
119
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
120
+ proxy_set_header X-Forwarded-Proto $scheme;
121
+ proxy_set_header Connection "";
122
+
123
+ # CORS headers (if not handled by FastAPI)
124
+ # add_header Access-Control-Allow-Origin "*" always;
125
+ # add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS" always;
126
+ # add_header Access-Control-Allow-Headers "Authorization, Content-Type" always;
127
+
128
+ # Handle OPTIONS for CORS preflight
129
+ if ($request_method = 'OPTIONS') {
130
+ add_header Access-Control-Allow-Origin "*";
131
+ add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS";
132
+ add_header Access-Control-Allow-Headers "Authorization, Content-Type";
133
+ add_header Access-Control-Max-Age 3600;
134
+ add_header Content-Length 0;
135
+ add_header Content-Type text/plain;
136
+ return 204;
137
+ }
138
+ }
139
+
140
+ # Document upload endpoint (lower rate limit)
141
+ location /api/documents/upload {
142
+ limit_req zone=upload_limit burst=10 nodelay;
143
+
144
+ proxy_pass http://sparknet_api;
145
+ proxy_http_version 1.1;
146
+
147
+ proxy_set_header Host $host;
148
+ proxy_set_header X-Real-IP $remote_addr;
149
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
150
+ proxy_set_header X-Forwarded-Proto $scheme;
151
+
152
+ # Increased timeout for large uploads
153
+ proxy_connect_timeout 120s;
154
+ proxy_send_timeout 600s;
155
+ proxy_read_timeout 600s;
156
+ }
157
+
158
+ # RAG streaming endpoint (SSE support)
159
+ location /api/rag/query/stream {
160
+ proxy_pass http://sparknet_api;
161
+ proxy_http_version 1.1;
162
+
163
+ proxy_set_header Host $host;
164
+ proxy_set_header X-Real-IP $remote_addr;
165
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
166
+ proxy_set_header X-Forwarded-Proto $scheme;
167
+ proxy_set_header Connection "";
168
+
169
+ # SSE-specific settings
170
+ proxy_buffering off;
171
+ proxy_cache off;
172
+ chunked_transfer_encoding off;
173
+ proxy_read_timeout 3600s;
174
+ }
175
+
176
+ # Streamlit Demo (with WebSocket support)
177
+ location / {
178
+ proxy_pass http://sparknet_demo;
179
+ proxy_http_version 1.1;
180
+
181
+ proxy_set_header Host $host;
182
+ proxy_set_header X-Real-IP $remote_addr;
183
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
184
+ proxy_set_header X-Forwarded-Proto $scheme;
185
+
186
+ # WebSocket support for Streamlit
187
+ proxy_set_header Upgrade $http_upgrade;
188
+ proxy_set_header Connection "upgrade";
189
+
190
+ # Streamlit specific
191
+ proxy_read_timeout 86400;
192
+ }
193
+
194
+ # Streamlit WebSocket endpoint
195
+ location /_stcore/stream {
196
+ proxy_pass http://sparknet_demo;
197
+ proxy_http_version 1.1;
198
+
199
+ proxy_set_header Host $host;
200
+ proxy_set_header X-Real-IP $remote_addr;
201
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
202
+
203
+ # WebSocket
204
+ proxy_set_header Upgrade $http_upgrade;
205
+ proxy_set_header Connection "upgrade";
206
+
207
+ proxy_read_timeout 86400;
208
+ proxy_buffering off;
209
+ }
210
+
211
+ # Streamlit static files
212
+ location /static {
213
+ proxy_pass http://sparknet_demo;
214
+ proxy_http_version 1.1;
215
+ proxy_set_header Host $host;
216
+
217
+ # Cache static assets
218
+ expires 1d;
219
+ add_header Cache-Control "public, immutable";
220
+ }
221
+
222
+ # Error pages
223
+ error_page 502 503 504 /50x.html;
224
+ location = /50x.html {
225
+ root /usr/share/nginx/html;
226
+ internal;
227
+ }
228
+ }
229
+
230
+ # HTTPS server (uncomment and configure for production)
231
+ # server {
232
+ # listen 443 ssl http2;
233
+ # listen [::]:443 ssl http2;
234
+ # server_name sparknet.example.com;
235
+ #
236
+ # # SSL configuration
237
+ # ssl_certificate /etc/nginx/ssl/fullchain.pem;
238
+ # ssl_certificate_key /etc/nginx/ssl/privkey.pem;
239
+ # ssl_session_timeout 1d;
240
+ # ssl_session_cache shared:SSL:50m;
241
+ # ssl_session_tickets off;
242
+ #
243
+ # # Modern SSL configuration
244
+ # ssl_protocols TLSv1.2 TLSv1.3;
245
+ # ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384;
246
+ # ssl_prefer_server_ciphers off;
247
+ #
248
+ # # HSTS
249
+ # add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
250
+ #
251
+ # # Include same location blocks as HTTP server above
252
+ # # ...
253
+ # }
254
+ }
run_demo.py ADDED
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SPARKNET Demo Launcher
4
+
5
+ Cross-platform launcher for the Streamlit demo.
6
+
7
+ Usage:
8
+ python run_demo.py [--port PORT]
9
+ """
10
+
11
+ import subprocess
12
+ import sys
13
+ import os
14
+ from pathlib import Path
15
+
16
+ def check_dependencies():
17
+ """Check and install required dependencies."""
18
+ print("📦 Checking dependencies...")
19
+
20
+ try:
21
+ import streamlit
22
+ print(f" ✅ Streamlit {streamlit.__version__}")
23
+ except ImportError:
24
+ print(" 📥 Installing Streamlit...")
25
+ subprocess.run([sys.executable, "-m", "pip", "install", "streamlit"], check=True)
26
+
27
+ try:
28
+ import pandas
29
+ print(f" ✅ Pandas {pandas.__version__}")
30
+ except ImportError:
31
+ print(" 📥 Installing Pandas...")
32
+ subprocess.run([sys.executable, "-m", "pip", "install", "pandas"], check=True)
33
+
34
+ try:
35
+ import httpx
36
+ print(f" ✅ httpx {httpx.__version__}")
37
+ except ImportError:
38
+ print(" 📥 Installing httpx...")
39
+ subprocess.run([sys.executable, "-m", "pip", "install", "httpx"], check=True)
40
+
41
+
42
+ def check_ollama():
43
+ """Check if Ollama is running."""
44
+ print("\n🔍 Checking Ollama status...")
45
+
46
+ try:
47
+ import httpx
48
+ with httpx.Client(timeout=2.0) as client:
49
+ response = client.get("http://localhost:11434/api/tags")
50
+ if response.status_code == 200:
51
+ data = response.json()
52
+ models = len(data.get("models", []))
53
+ print(f" ✅ Ollama is running ({models} models)")
54
+ return True
55
+ except Exception:
56
+ pass
57
+
58
+ print(" ⚠️ Ollama not running (demo will use simulated responses)")
59
+ print(" Start with: ollama serve")
60
+ return False
61
+
62
+
63
+ def main():
64
+ """Main entry point."""
65
+ import argparse
66
+
67
+ parser = argparse.ArgumentParser(description="SPARKNET Demo Launcher")
68
+ parser.add_argument("--port", type=int, default=8501, help="Port to run on")
69
+ args = parser.parse_args()
70
+
71
+ print("=" * 50)
72
+ print("🔥 SPARKNET Demo Launcher")
73
+ print("=" * 50)
74
+ print()
75
+
76
+ # Get project root
77
+ project_root = Path(__file__).parent
78
+ demo_app = project_root / "demo" / "app.py"
79
+
80
+ if not demo_app.exists():
81
+ print(f"❌ Demo app not found: {demo_app}")
82
+ sys.exit(1)
83
+
84
+ # Check dependencies
85
+ check_dependencies()
86
+
87
+ # Check Ollama
88
+ check_ollama()
89
+
90
+ # Launch
91
+ print()
92
+ print(f"🚀 Launching SPARKNET Demo on port {args.port}...")
93
+ print(f" URL: http://localhost:{args.port}")
94
+ print()
95
+ print("Press Ctrl+C to stop")
96
+ print("=" * 50)
97
+ print()
98
+
99
+ # Run Streamlit
100
+ os.chdir(project_root)
101
+ subprocess.run([
102
+ sys.executable, "-m", "streamlit", "run",
103
+ str(demo_app),
104
+ "--server.port", str(args.port),
105
+ "--server.headless", "true",
106
+ ])
107
+
108
+
109
+ if __name__ == "__main__":
110
+ main()
run_demo.sh ADDED
@@ -0,0 +1,52 @@
1
+ #!/bin/bash
2
+ # SPARKNET Demo Launcher
3
+ # Usage: ./run_demo.sh [port]
4
+
5
+ set -e
6
+
7
+ PORT=${1:-8501}
8
+ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
9
+
10
+ echo "🔥 SPARKNET Demo Launcher"
11
+ echo "========================="
12
+ echo ""
13
+
14
+ # Check Python
15
+ if ! command -v python3 &> /dev/null; then
16
+ echo "❌ Python3 not found. Please install Python 3.10+"
17
+ exit 1
18
+ fi
19
+
20
+ # Check Streamlit
21
+ if ! python3 -c "import streamlit" &> /dev/null; then
22
+ echo "📦 Installing Streamlit..."
23
+ python3 -m pip install streamlit
24
+ fi
25
+
26
+ # Check demo dependencies
27
+ echo "📦 Checking dependencies..."
28
+ python3 -m pip install -q -r "$SCRIPT_DIR/demo/requirements.txt" 2>/dev/null || true
29
+
30
+ # Check Ollama status
31
+ echo ""
32
+ echo "🔍 Checking Ollama status..."
33
+ if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
34
+ echo "✅ Ollama is running"
35
+ MODELS=$(curl -s http://localhost:11434/api/tags | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d.get('models', [])))" 2>/dev/null || echo "?")
36
+ echo " Models available: $MODELS"
37
+ else
38
+ echo "⚠️ Ollama not running (demo will use simulated responses)"
39
+ echo " Start with: ollama serve"
40
+ fi
41
+
42
+ # Launch demo
43
+ echo ""
44
+ echo "🚀 Launching SPARKNET Demo on port $PORT..."
45
+ echo " URL: http://localhost:$PORT"
46
+ echo ""
47
+ echo "Press Ctrl+C to stop"
48
+ echo "========================="
49
+ echo ""
50
+
51
+ cd "$SCRIPT_DIR"
52
+ streamlit run demo/app.py --server.port "$PORT" --server.headless true
scripts to get ideas from/ides.txt ADDED
@@ -0,0 +1,151 @@
1
+ This introduces the fundamentals of document processing and how they connect to agentic AI workflows.
2
+
3
+ The core problem is that modern organizations are overwhelmed with digital documents such as PDFs, scans, receipts, contracts, and reports. These documents are designed for human reading, not machine processing, which makes searching, analysis, and automation extremely difficult. Valuable data is often trapped inside unstructured formats, requiring manual reading and re-entry, which does not scale.
4
+
5
+ The goal of document processing is to convert unstructured documents into structured, machine-readable data. Common output formats include JSON and Markdown or HTML. JSON is well suited for machines, APIs, databases, and analytics pipelines because it is hierarchical and easy to process programmatically. Markdown or HTML preserves layout elements such as headings, tables, and lists, making it ideal for human readers and large language models, especially in chat interfaces and retrieval-augmented generation systems.
6
+
7
+ When documents are scanned or photographed, the system only sees image pixels. Optical Character Recognition (OCR) is required to convert those pixels into text. OCR typically involves two main steps: image preprocessing, such as deskewing, denoising, and contrast adjustment, followed by text recognition, where visual patterns are matched to characters. The output is editable or searchable text.
8
+
9
+ However, OCR has important limitations. It does not understand document structure, meaning, or relationships between elements. It often produces a flat block of text and struggles with poor image quality, complex layouts, multi-column text, nested tables, handwriting, stamps, and stylized fonts. These weaknesses can lead to cascading errors during parsing and extraction. OCR provides perception, but not comprehension.
10
+
11
+ A key distinction introduced in this lesson is that processing is not the same as understanding. OCR can read characters but cannot determine what is a header, a value, a total amount, or a table entry. To move from raw text to meaningful structured data, an additional cognitive layer is required.
12
+
13
+ Agentic AI provides this missing layer. In document processing, an agent is an autonomous system that can perceive input, reason about goals, decide which tools to use, and act iteratively until the task is complete. In this context, OCR functions as the eyes, while the agent serves as the brain. Unlike rigid rule-based pipelines, agents can adapt to edge cases and unexpected document variations.
14
+
15
+ An agentic document system is typically composed of three components. The brain, implemented using a large language model, handles reasoning, planning, and decision-making. The eyes, implemented through OCR, convert visual content into text. The hands are the tools the agent can use, such as APIs, database queries, file operations, and function calls. Together, these components allow the system to answer high-level requests, such as identifying the total amount on an invoice, without hardcoding every step.
16
+
17
+ The lesson also introduces the ReAct framework, which describes how agents reason step by step. The agent alternates between thinking about what to do next, taking an action by calling a tool, observing the result, and then repeating the process. This loop enables adaptability, error correction, and transparency, since the agent’s reasoning and tool usage can be inspected.
18
+
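+ A minimal sketch of that loop in Python (hypothetical llm(prompt) -> str and tools dict; assumes the model replies with either 'tool: input' or 'FINISH: answer'):
+
+     def react_loop(task, llm, tools, max_steps=5):
+         history = f"Task: {task}"
+         for _ in range(max_steps):
+             # THINK + ACT: the model proposes the next step
+             step = llm(history + "\nReply with 'tool: input' or 'FINISH: answer'")
+             if step.startswith("FINISH:"):
+                 return step[len("FINISH:"):].strip()
+             tool_name, _, tool_input = step.partition(":")
+             # OBSERVE: run the tool and feed the result back into context
+             observation = tools[tool_name.strip()](tool_input.strip())
+             history += f"\n{step}\nObservation: {observation}"
+         return None  # give up after max_steps
+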
19
+ The lesson concludes with a practical lab. Learners build a simple document agent that combines OCR, parsing, and agentic reasoning to read documents and extract structured information. The lab follows a step-by-step approach, reinforcing the bottom-up journey from pixels, to text, to structure, and finally to reasoning.
20
+ =========================================================================================================================
21
+
22
+ This walkthrough demonstrates how OCR, rule-based methods, and LLM-based agentic reasoning work together in document processing, and where each approach succeeds or fails.
23
+
24
+ OCR is first applied to extract raw text from documents, which works well for clean, printed invoices but produces unstructured, noisy text with no understanding of meaning. Simple rule-based approaches such as regular expressions are then used to extract values like tax and total, but they fail easily due to small OCR variations, ambiguous wording, or layout differences. This highlights how brittle traditional pipelines are when faced with real-world data.
25
+
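+ A tiny illustration of that brittleness (invented receipt text):
+
+     import re
+
+     text = "Subtotal: $90.00\nTax: $10.00\nTotal: $100.00"  # pretend OCR output
+     m = re.search(r"total:\s*\$?([\d.]+)", text, re.IGNORECASE)
+     print(m.group(1))  # 90.00 -- the pattern matched inside "Subtotal" first
+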
26
+ An agentic approach is then introduced, combining OCR as perception, an LLM as the reasoning component, and tools within a ReAct-style loop. The agent decides when to call OCR, interprets the extracted text semantically, and outputs structured JSON without relying on hardcoded rules. This allows correct extraction of values such as totals even when multiple similar terms (e.g., subtotal vs total) appear.
27
+
28
+ More challenging examples show the limits of OCR and the strengths and weaknesses of LLM reasoning. Tables with complex layouts, handwriting, and low-quality receipts produce chaotic OCR outputs. The agent can often infer intent and recover partially correct information, but errors still occur when OCR inaccuracies distort the underlying data. In some cases, the LLM overcorrects or reasons from incorrect inputs, leading to plausible but wrong conclusions.
29
+
30
+ The key takeaway is that OCR provides reading but not understanding, regex provides rules without meaning, and LLM-based agents introduce semantic reasoning that significantly improves robustness. However, reliable real-world document understanding still requires multiple components working together, including OCR, layout analysis, vision-language models, agentic workflows, and validation mechanisms.
31
+ =========================================================================================================================
32
+
33
+ OCR has evolved from rule-based, procedural computer vision systems to modern deep learning–based approaches. Early OCR systems, represented by Tesseract, relied heavily on handcrafted pipelines such as line detection, character segmentation, and shape matching. These systems work well for clean, printed, black-and-white text with regular layouts and can run efficiently on CPUs, but they struggle with real-world variability such as complex layouts, curved text, images, or noise.
34
+
35
+ Around 2015, deep learning fundamentally changed OCR by introducing data-driven, end-to-end models. Modern OCR systems separate the problem into two modular stages: text detection (finding text regions) and text recognition (reading the text within those regions). PaddleOCR is a representative system from this era, using neural networks for both stages, specifically DBNet for detection and transformer-based models for recognition. This approach handles irregular layouts, curved or rotated text, and noisy real-world images far better than traditional methods, especially when accelerated with GPUs.
36
+
37
+ While both Tesseract and PaddleOCR are open source and support many languages, they are best suited to different use cases. Tesseract is ideal for simple document scanning such as books with clean layouts, whereas PaddleOCR performs better on complex, real-world documents like receipts, signage, and mixed-layout content. Overall, these tools illustrate how OCR has shifted from rigid, rule-based pipelines to flexible, learnable systems that can be integrated into larger document intelligence and agentic workflows.
38
+
39
+ =========================================================================================================================
40
+ A modern OCR pipeline is set up using PaddleOCR along with image and visualization tools. PaddleOCR runs an end-to-end process that includes preprocessing and two deep learning stages: text detection, which finds text regions and returns bounding boxes, and text recognition, which reads the text in each region and outputs the text with confidence scores. Compared to earlier OCR, this pipeline provides localization and improved accuracy on messy inputs such as receipts, which makes downstream reasoning tasks like verifying totals more reliable when combined with an LLM agent.
41
+
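+ A minimal sketch of that setup (assuming PaddleOCR 2.x; the result layout is its documented list of [box, (text, score)] pairs per page):
+
+     from paddleocr import PaddleOCR
+
+     ocr = PaddleOCR(use_angle_cls=True, lang="en")  # loads detection + recognition models
+     result = ocr.ocr("receipt.jpg", cls=True)       # one list per page
+     for box, (text, score) in result[0]:
+         print(f"{score:.2f} {text} @ {box}")        # polygon box + recognized text
+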
42
+ The same approach is tested on harder examples. On a complex table, PaddleOCR still makes errors such as misreading scientific notation, but an LLM agent can sometimes correct these issues using contextual reasoning and domain expectations. On handwriting, recognition improves over older OCR in some places, but key fields like names and several answers can still be misread, and the agent can only be as accurate as the OCR signal.
43
+
44
+ New document types expose major weaknesses related to layout and reading order. For report pages containing charts, the OCR may extract axis numbers without recognizing the full chart as a unit, losing context. For multi-column articles, the text can be read across columns incorrectly, producing garbled output. To address this, a layout detection model is added to segment the document into labeled regions such as title, abstract, text blocks, table, chart, footer, and numbers. This improves structure and preserves reading order by keeping text within coherent regions, although errors remain, such as splitting a table into multiple parts or failing to separate headers from table content in bank statements.
45
+
46
+ Overall, PaddleOCR significantly improves real-world OCR accuracy and adds bounding-box structure, and layout detection helps with region-level organization and reading order. However, these tools still fall short of full semantic document understanding, especially for complex layouts, tables, and small but important text.
47
+
48
+
49
+ =========================================================================================================================
50
+ Documents often have complex layouts, so extracting text and sending it directly to a language model can destroy structure and mix content such as columns, tables, captions, and figures. Layout detection addresses this by identifying and labeling page regions like paragraphs, tables, figures, headers, footers, and captions so downstream systems keep structure and target the right areas.
51
+
52
+ Reading order is a separate problem: it determines the sequence a human would read content, especially in multi-column pages and documents with floating elements. Older heuristic methods (top-to-bottom, left-to-right rules) fail on real layouts. LayoutReader replaces rules with a learned model trained on a large reading-order dataset, using OCR bounding boxes and visual-spatial features to reconstruct a human-like token sequence.
53
+
54
+ Even with correct reading order, OCR-only pipelines remain limited because OCR captures text but misses visual context such as charts, diagrams, and spatial relationships. Forms require linking labels to values and may need key-value models like LayoutLM, plus vision models for elements like checkboxes. Tables need structure-preserving models such as Table Transformer, TableFormer, or TABLET to recover rows and columns and output usable formats like CSV/JSON/HTML. Handwriting often requires specialized ICR models trained on handwritten data. Multilingual documents add challenges like script detection and different reading directions.
55
+
56
+ Vision-Language Models (VLMs) extend LLMs by adding a vision encoder and projector so they can reason over images plus text, enabling interpretation of visual elements. However, VLMs can still struggle with small text, nested layouts, multi-page structure, hallucinations, and weak grounding unless they are guided by layout structure.
57
+
58
+ A practical hybrid approach combines layout detection and reading-order models for deterministic structure with VLMs for visually rich regions. An agent can orchestrate this workflow by using OCR plus bounding boxes, reordering text with LayoutReader, detecting regions (tables, charts, text blocks), and selectively sending cropped regions to specialized VLM-based tools for table and chart understanding based on the user’s question.
59
+
60
+
61
+
62
+ =========================================================================================================================
63
+ An agentic document intelligence pipeline combines OCR, reading-order reconstruction, layout detection, and vision-language model analysis for visual regions.
64
+
65
+ Text extraction uses PaddleOCR to produce, for each detected text region, the recognized string, a confidence score, and polygon bounding boxes. Bounding boxes are visualized for verification and converted into a standardized XYXY format using structured data objects for cleaner downstream processing.
66
+
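+ The polygon-to-XYXY conversion is a simple min/max reduction (sketch):
+
+     def polygon_to_xyxy(points):
+         """[[x1, y1], ..., [x4, y4]] -> (xmin, ymin, xmax, ymax)."""
+         xs = [p[0] for p in points]
+         ys = [p[1] for p in points]
+         return (min(xs), min(ys), max(xs), max(ys))
+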
67
+ Reading order is reconstructed with a LayoutReader model built on LayoutLMv3. OCR bounding boxes are normalized to the 0–1000 coordinate range expected by LayoutLM-style models, then the model predicts an ordering index for each region. Regions are sorted by this index to create a correctly sequenced text representation that can answer many questions without visual models.
68
+
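+ The normalization itself is a plain rescale into LayoutLM's 0-1000 integer grid (sketch; the ordering model call at the end is assumed, not a specific API):
+
+     def normalize_bbox(xyxy, page_w, page_h):
+         x0, y0, x1, y1 = xyxy
+         return [int(1000 * x0 / page_w), int(1000 * y0 / page_h),
+                 int(1000 * x1 / page_w), int(1000 * y1 / page_h)]
+
+     # regions reordered by the model's predicted reading-order index:
+     # ordered = [regions[i] for i in layoutreader_model.predict(normalized_boxes)]
+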
69
+ Layout detection uses PaddleOCR’s layout detector to segment the page into labeled regions such as text blocks, titles, tables, charts, and figures. Each region is assigned a unique ID, stored in structured objects, and visualized with labeled boxes and confidence scores.
70
+
71
+ For tables and charts, regions are cropped from the original document and encoded in base64 to be sent to vision APIs. Cropping improves focus, reduces noise, and lowers cost, but localization can still be imperfect and requires careful prompt design.
72
+
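+ A sketch of the crop-and-encode step using Pillow (region coordinates assumed to be pixel XYXY):
+
+     import base64, io
+     from PIL import Image
+
+     def crop_to_base64(page_image_path, xyxy):
+         region = Image.open(page_image_path).crop(xyxy)  # focus the VLM on one region
+         buf = io.BytesIO()
+         region.save(buf, format="PNG")
+         return base64.b64encode(buf.getvalue()).decode("ascii")
+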
73
+ Two specialized tools are defined for vision-language model calls: one for chart interpretation and one for table extraction. Each tool uses a structured prompt with explicit fields and a JSON output template to produce machine-readable results. A shared multimodal-call utility packages the prompt plus the cropped image, and the tools are exposed to the agent via a tool interface.
74
+
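+ For instance, a table tool's prompt might pin the output shape down with an explicit JSON template (hypothetical wording):
+
+     TABLE_PROMPT = """You are given a cropped image of a table.
+     Extract its contents and reply ONLY with JSON matching:
+     {"columns": ["..."], "rows": [["..."]], "units": "...", "notes": "..."}"""
+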
75
+ A tool-calling agent is created with a system context containing the ordered OCR text plus layout region IDs and types. For a given user question, the agent decides whether text alone is sufficient; if not, it selects the appropriate tool, analyzes the target region, and merges the tool output with the textual context into a final answer.
76
+
77
+
78
+ =========================================================================================================================
79
+ Agentic Document Extraction (ADE) is a unified, vision-first document intelligence system exposed through a single API that converts documents, images, presentations, and spreadsheets into structured Markdown and JSON.
80
+
81
+ The system is designed around three core principles. Vision-first processing treats documents as visual objects where meaning comes from layout, structure, and spatial relationships rather than raw text tokens. A data-centric approach emphasizes training on highly curated, document-specific datasets, prioritizing data quality alongside model design. An agentic architecture enables planning, routing, execution, and verification steps to iteratively reach high-quality outputs.
82
+
83
+ ADE replaces traditional pipelines built from OCR, layout analysis, and vision-language models with document-native vision transformers called DPTs (DPT-1, DPT-2, and DPT-2-mini). These models natively perform reading order reconstruction, layout detection, text recognition, and figure captioning within a single framework.
84
+
85
+ The core architecture consists of document-native vision models at the foundation, intelligent parsing and routing agents that handle different content types such as text, tables, and figures through separate paths, and an application layer that delivers user-facing capabilities like key-value (field) extraction, document splitting, and content preparation for retrieval-augmented generation.
86
+
87
+ Primary use cases include precise field extraction with traceability back to source regions, and preparation of complex documents for RAG systems that must preserve tables, figures, and structural context. ADE achieves state-of-the-art accuracy, exceeding human performance on the DocVQA benchmark, and performs strongly on real scanned and handwritten documents.
88
+
89
+ The platform is accessible through a visual interface, REST APIs, and Python or TypeScript libraries, enabling flexible integration into document processing workflows at scale.
90
+
91
+
92
+
93
+ =========================================================================================================================
94
+ Agentic Document Extraction (ADE) is used through an API-driven workflow to parse complex documents and extract structured, verifiable information using document-native vision models.
95
+
96
+ The process begins by sending documents to a parsing API powered by Document Pretrained Transformers (DPT-2-latest or DPT-1-latest). The parser converts each document into structured JSON and Markdown, identifying semantically meaningful chunks such as text blocks, tables, figures, charts, logos, margins, and attestations. Each chunk and even individual table cells receive unique identifiers and bounding boxes, enabling precise visual grounding and traceability.
97
+
98
+ Parsed outputs include:
99
+
100
+ * Structured chunks with type, coordinates, and page references
101
+ * Markdown representations of text, tables, and figures
102
+ * Cell-level identifiers for tables, enabling fine-grained referencing
103
+ * Rich visual descriptions for figures, charts, flowcharts, and illustrations
104
+
105
+ A schema-based extraction step is then applied. A user-defined JSON schema specifies the required fields, including nested objects, numeric values, strings, and booleans. The extraction API combines the parsed document representation with this schema to return structured key-value pairs along with metadata linking each extracted value back to its exact source region or table cell.
106
+
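+ For example, a small illustrative schema (field names invented here) might look like:
+
+     {
+       "type": "object",
+       "properties": {
+         "invoice_number": {"type": "string"},
+         "total_amount": {"type": "number"},
+         "is_paid": {"type": "boolean"},
+         "vendor": {
+           "type": "object",
+           "properties": {"name": {"type": "string"}, "address": {"type": "string"}}
+         }
+       },
+       "required": ["invoice_number", "total_amount"]
+     }
+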
107
+ The system demonstrates robust performance across highly challenging document types:
108
+
109
+ * Utility bills with mixed text, tables, and usage charts
110
+ * Charts and flowcharts with implicit spatial relationships and arrows
111
+ * Sparse tables, merged cells, and very large “mega tables” with thousands of values
112
+ * Handwritten forms, checkboxes, circled answers, and medical annotations
113
+ * Mathematical handwriting with symbols, equations, and square roots
114
+ * Purely visual documents such as instruction manuals and infographics
115
+ * Official documents containing stamps, curved text, and handwritten signatures
116
+
117
+ ADE handles all of these cases through a single, consistent API without requiring custom OCR pipelines, layout rules, or manual model orchestration. The output supports downstream applications such as user interfaces, compliance workflows, analytics, and reliable field extraction with full visual traceability, even under extreme document variability and complexity.
118
+
119
+
120
+ =========================================================================================================================
121
+ A multi-document financial intake pipeline is built around LandingAI ADE to handle mixed uploads with unknown filenames and unknown document types.
122
+
123
+ 1. Batch parsing and page-level Markdown
124
+ Each uploaded file is sent to the Parse API using a DPT model. The response is requested as per-page Markdown so the first page can be used for fast identification while still keeping full parsed output available for extraction and grounding.
125
+
126
+ 2. Automatic document type classification
127
+ The Extract API is used to categorize each file by running a lightweight schema over the first-page Markdown. A Pydantic schema defines an enum of expected document types (for example investment statement, pay stub, bank statement, government ID, tax form) with rich descriptions to improve classification reliability. Pydantic is converted to JSON internally before extraction.
128
+
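+ A sketch of such a classification schema (Pydantic v2; the type names here are illustrative):
+
+     from enum import Enum
+     from pydantic import BaseModel, Field
+
+     class DocType(str, Enum):
+         investment_statement = "investment_statement"
+         pay_stub = "pay_stub"
+         bank_statement = "bank_statement"
+         government_id = "government_id"
+         tax_form = "tax_form"
+
+     class DocTypeResult(BaseModel):
+         doc_type: DocType = Field(description="Best-matching document category")
+
+     json_schema = DocTypeResult.model_json_schema()  # the JSON form sent for extraction
+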
129
+ 3. Type-specific field extraction with dedicated schemas
130
+ For each identified document type, a separate Pydantic extraction schema is applied (ID fields, tax form fields, pay stub fields, bank statement fields, investment fields). The pipeline selects the schema dynamically based on the classified type, then calls Extract to return structured key-value pairs plus extraction metadata that links each value to chunk IDs for visual grounding.
131
+
132
+ 4. Grounding-focused visualization for review
133
+ Parsed outputs are rendered with bounding boxes to show detected chunks (text, tables, figures) and cell-level structure for tables. A second visualization focuses only on the specific fields requested by each schema, highlighting exactly where each extracted value came from, enabling fast human review.
134
+
135
+ 5. Consolidation into a structured summary table
136
+ All extracted fields across documents are aggregated into a single tabular summary (for example a Pandas DataFrame) with columns such as applicant folder, document name, detected type, field name, and field value. This replaces manual opening, searching, and retyping.
137
+
138
+ 6. Validation and consistency checks
139
+ Custom validation logic is applied across the extracted results, such as (see the sketch after this list):
140
+
141
+ * Cross-document name matching to detect inconsistent applicants across uploaded files
142
+ * Recency checks by extracting years from dates and flagging outdated documents
143
+ * Asset aggregation by summing balances across bank and investment statements, scalable to many accounts
144
+
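+ A sketch of such checks over the summary table (column names assumed from the description above):
+
+     import pandas as pd
+
+     def run_checks(df: pd.DataFrame) -> list[str]:
+         flags = []
+         # Cross-document name matching
+         if df["applicant_name"].str.strip().str.lower().nunique() > 1:
+             flags.append("Inconsistent applicant names across documents")
+         # Recency check on extracted dates
+         years = pd.to_datetime(df["statement_date"], errors="coerce").dt.year
+         if (years < pd.Timestamp.now().year - 1).any():
+             flags.append("One or more documents look outdated")
+         # Asset aggregation across bank/investment statements
+         total = df.loc[df["doc_type"].isin(["bank_statement", "investment_statement"]),
+                        "balance"].astype(float).sum()
+         flags.append(f"Aggregated assets: ${total:,.2f}")
+         return flags
+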
145
+ The result is an end-to-end workflow that parses heterogeneous documents, identifies their types, extracts structured fields with traceable grounding, produces a reviewer-friendly summary, and runs automated checks to surface inconsistencies and missing requirements.
146
+
147
+
148
+ =========================================================================================================================
149
+
150
+
151
+
src/agents/document_agent.py ADDED
@@ -0,0 +1,661 @@
1
+ """
2
+ DocumentAgent for SPARKNET
3
+
4
+ A ReAct-style agent for document intelligence tasks:
5
+ - Document parsing and extraction
6
+ - Field extraction with grounding
7
+ - Table and chart analysis
8
+ - Document classification
9
+ - Question answering over documents
10
+ """
11
+
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ from dataclasses import dataclass
14
+ from enum import Enum
15
+ import json
16
+ import time
17
+ from loguru import logger
18
+
19
+ from .base_agent import BaseAgent, Task, Message
20
+ from ..llm.langchain_ollama_client import LangChainOllamaClient
21
+ from ..document.schemas.core import (
22
+ ProcessedDocument,
23
+ DocumentChunk,
24
+ EvidenceRef,
25
+ ExtractionResult,
26
+ )
27
+ from ..document.schemas.extraction import ExtractionSchema, ExtractedField
28
+ from ..document.schemas.classification import DocumentClassification, DocumentType
29
+
30
+
31
+ class AgentAction(str, Enum):
32
+ """Actions the DocumentAgent can take."""
33
+ THINK = "think"
34
+ USE_TOOL = "use_tool"
35
+ ANSWER = "answer"
36
+ ABSTAIN = "abstain"
37
+
38
+
39
+ @dataclass
40
+ class ThoughtAction:
41
+ """A thought-action pair in the ReAct loop."""
42
+ thought: str
43
+ action: AgentAction
44
+ tool_name: Optional[str] = None
45
+ tool_args: Optional[Dict[str, Any]] = None
46
+ observation: Optional[str] = None
47
+ evidence: Optional[List[EvidenceRef]] = None
48
+
49
+
50
+ @dataclass
51
+ class AgentTrace:
52
+ """Full trace of agent execution for inspection."""
53
+ task: str
54
+ steps: List[ThoughtAction]
55
+ final_answer: Optional[Any] = None
56
+ confidence: float = 0.0
57
+ total_time_ms: float = 0.0
58
+ success: bool = True
59
+ error: Optional[str] = None
60
+
61
+
62
+ class DocumentAgent:
63
+ """
64
+ ReAct-style agent for document intelligence tasks.
65
+
66
+ Implements the Think -> Tool -> Observe -> Refine loop
67
+ with inspectable traces and grounded outputs.
68
+ """
69
+
70
+ # System prompt for ReAct reasoning
71
+ SYSTEM_PROMPT = """You are a document intelligence agent that analyzes documents
72
+ and extracts information with evidence.
73
+
74
+ You operate in a Think-Act-Observe loop:
75
+ 1. THINK: Analyze what you need to do and what information you have
76
+ 2. ACT: Choose a tool to use or provide an answer
77
+ 3. OBSERVE: Review the tool output and update your understanding
78
+
79
+ Available tools:
80
+ {tool_descriptions}
81
+
82
+ CRITICAL RULES:
83
+ - Every extraction MUST include evidence (page, bbox, text snippet)
84
+ - If you cannot find evidence for a value, ABSTAIN rather than guess
85
+ - Always cite the source of information with page numbers
86
+ - For tables, analyze structure before extracting data
87
+ - For charts, describe what you see before extracting values
88
+
89
+ Output format for each step:
90
+ THOUGHT: <your reasoning>
91
+ ACTION: <tool_name or ANSWER or ABSTAIN>
92
+ ACTION_INPUT: <JSON arguments for tool, or final answer>
93
+ """
94
+
95
+ # Available tools
96
+ TOOLS = {
97
+ "extract_text": {
98
+ "description": "Extract text from specific pages or regions",
99
+ "args": ["page_numbers", "region_bbox"],
100
+ },
101
+ "analyze_table": {
102
+ "description": "Analyze and extract structured data from a table region",
103
+ "args": ["page", "bbox", "expected_columns"],
104
+ },
105
+ "analyze_chart": {
106
+ "description": "Analyze a chart/graph and extract insights",
107
+ "args": ["page", "bbox"],
108
+ },
109
+ "extract_fields": {
110
+ "description": "Extract specific fields using a schema",
111
+ "args": ["schema", "context_chunks"],
112
+ },
113
+ "classify_document": {
114
+ "description": "Classify the document type",
115
+ "args": ["first_page_chunks"],
116
+ },
117
+ "search_text": {
118
+ "description": "Search for text patterns in the document",
119
+ "args": ["query", "page_range"],
120
+ },
121
+ }
122
+
123
+ def __init__(
124
+ self,
125
+ llm_client: LangChainOllamaClient,
126
+ memory_agent: Optional[Any] = None,
127
+ max_iterations: int = 10,
128
+ temperature: float = 0.3,
129
+ ):
130
+ """
131
+ Initialize DocumentAgent.
132
+
133
+ Args:
134
+ llm_client: LangChain Ollama client
135
+ memory_agent: Optional memory agent for context retrieval
136
+ max_iterations: Maximum ReAct iterations
137
+ temperature: LLM temperature for reasoning
138
+ """
139
+ self.llm_client = llm_client
140
+ self.memory_agent = memory_agent
141
+ self.max_iterations = max_iterations
142
+ self.temperature = temperature
143
+
144
+ # Current document context
145
+ self._current_document: Optional[ProcessedDocument] = None
146
+ self._page_images: Dict[int, Any] = {}
147
+
148
+ logger.info(f"Initialized DocumentAgent (max_iterations={max_iterations})")
149
+
150
+ def set_document(
151
+ self,
152
+ document: ProcessedDocument,
153
+ page_images: Optional[Dict[int, Any]] = None,
154
+ ):
155
+ """
156
+ Set the current document context.
157
+
158
+ Args:
159
+ document: Processed document
160
+ page_images: Optional dict of page number -> image array
161
+ """
162
+ self._current_document = document
163
+ self._page_images = page_images or {}
164
+ logger.info(f"Set document context: {document.metadata.document_id}")
165
+
166
+ async def run(
167
+ self,
168
+ task_description: str,
169
+ extraction_schema: Optional[ExtractionSchema] = None,
170
+ ) -> Tuple[Any, AgentTrace]:
171
+ """
172
+ Run the agent on a task.
173
+
174
+ Args:
175
+ task_description: Natural language task description
176
+ extraction_schema: Optional schema for structured extraction
177
+
178
+ Returns:
179
+ Tuple of (result, trace)
180
+ """
181
+ start_time = time.time()
182
+
183
+ if not self._current_document:
184
+ raise ValueError("No document set. Call set_document() first.")
185
+
186
+ trace = AgentTrace(task=task_description, steps=[])
187
+
188
+ try:
189
+ # Build initial context
190
+ context = self._build_context(extraction_schema)
191
+
192
+ # ReAct loop
193
+ result = None
194
+ for iteration in range(self.max_iterations):
195
+ logger.debug(f"ReAct iteration {iteration + 1}")
196
+
197
+ # Generate thought and action
198
+ step = await self._generate_step(task_description, context, trace.steps)
199
+ trace.steps.append(step)
200
+
201
+ # Check for terminal actions
202
+ if step.action == AgentAction.ANSWER:
203
+ result = self._parse_answer(step.tool_args)
204
+ trace.final_answer = result
205
+ trace.confidence = self._calculate_confidence(trace.steps)
206
+ break
207
+
208
+ elif step.action == AgentAction.ABSTAIN:
209
+ trace.final_answer = {
210
+ "abstained": True,
211
+ "reason": step.thought,
212
+ }
213
+ trace.confidence = 0.0
214
+ break
215
+
216
+ elif step.action == AgentAction.USE_TOOL:
217
+ # Execute tool and get observation
218
+ observation, evidence = await self._execute_tool(
219
+ step.tool_name, step.tool_args
220
+ )
221
+ step.observation = observation
222
+ step.evidence = evidence
223
+
224
+ # Update context with observation
225
+ context += f"\n\nObservation from {step.tool_name}:\n{observation}"
226
+
227
+ trace.success = True
228
+
229
+ except Exception as e:
230
+ logger.error(f"Agent execution failed: {e}")
231
+ trace.success = False
232
+ trace.error = str(e)
233
+
234
+ trace.total_time_ms = (time.time() - start_time) * 1000
235
+ return trace.final_answer, trace
236
+
237
+ async def extract_fields(
238
+ self,
239
+ schema: ExtractionSchema,
240
+ ) -> ExtractionResult:
241
+ """
242
+ Extract fields from the document using a schema.
243
+
244
+ Args:
245
+ schema: Extraction schema defining fields
246
+
247
+ Returns:
248
+ ExtractionResult with extracted data and evidence
249
+ """
250
+ task = f"Extract the following fields from this document: {', '.join(f.name for f in schema.fields)}"
251
+ result, trace = await self.run(task, schema)
252
+
253
+ # Build extraction result
254
+ data = {}
255
+ evidence = []
256
+ warnings = []
257
+ abstained = []
258
+
259
+ if isinstance(result, dict):
260
+ data = result.get("data", result)
261
+
262
+ # Collect evidence from trace
263
+ for step in trace.steps:
264
+ if step.evidence:
265
+ evidence.extend(step.evidence)
266
+
267
+ # Check for abstained fields
268
+ for field in schema.fields:
269
+ if field.name not in data and field.required:
270
+ abstained.append(field.name)
271
+ warnings.append(
272
+ f"Required field '{field.name}' not found with sufficient confidence"
273
+ )
274
+
275
+ return ExtractionResult(
276
+ data=data,
277
+ evidence=evidence,
278
+ warnings=warnings,
279
+ confidence=trace.confidence,
280
+ abstained_fields=abstained,
281
+ )
282
+
283
+ async def classify(self) -> DocumentClassification:
284
+ """
285
+ Classify the document type.
286
+
287
+ Returns:
288
+ DocumentClassification with type and confidence
289
+ """
290
+ task = "Classify this document into one of the standard document types (contract, invoice, patent, research_paper, report, letter, form, etc.)"
291
+ result, trace = await self.run(task)
292
+
293
+ # Parse classification result
294
+ doc_type = DocumentType.UNKNOWN
295
+ confidence = 0.0
296
+
297
+ if isinstance(result, dict):
298
+ type_str = result.get("document_type", "unknown")
299
+ try:
300
+ doc_type = DocumentType(type_str.lower())
301
+ except ValueError:
302
+ doc_type = DocumentType.OTHER
303
+
304
+ confidence = result.get("confidence", trace.confidence)
305
+
306
+ return DocumentClassification(
307
+ document_id=self._current_document.metadata.document_id,
308
+ primary_type=doc_type,
309
+ primary_confidence=confidence,
310
+ evidence=[e for step in trace.steps if step.evidence for e in step.evidence],
311
+ method="llm",
312
+ is_confident=confidence >= 0.7,
313
+ )
314
+
315
+ async def answer_question(self, question: str) -> Tuple[str, List[EvidenceRef]]:
316
+ """
317
+ Answer a question about the document.
318
+
319
+ Args:
320
+ question: Natural language question
321
+
322
+ Returns:
323
+ Tuple of (answer, evidence)
324
+ """
325
+ task = f"Answer this question about the document: {question}"
326
+ result, trace = await self.run(task)
327
+
328
+ answer = ""
329
+ evidence = []
330
+
331
+ if isinstance(result, dict):
332
+ answer = result.get("answer", str(result))
333
+ elif isinstance(result, str):
334
+ answer = result
335
+
336
+ # Collect evidence
337
+ for step in trace.steps:
338
+ if step.evidence:
339
+ evidence.extend(step.evidence)
340
+
341
+ return answer, evidence
342
+
343
+ def _build_context(self, schema: Optional[ExtractionSchema] = None) -> str:
344
+ """Build initial context from document."""
345
+ doc = self._current_document
346
+ context_parts = [
347
+ f"Document: {doc.metadata.filename}",
348
+ f"Type: {doc.metadata.file_type}",
349
+ f"Pages: {doc.metadata.num_pages}",
350
+ f"Chunks: {len(doc.chunks)}",
351
+ "",
352
+ "Document content summary:",
353
+ ]
354
+
355
+ # Add first few chunks as context
356
+ for chunk in doc.chunks[:10]:
357
+ context_parts.append(
358
+ f"[Page {chunk.page + 1}, {chunk.chunk_type.value}]: {chunk.text[:200]}..."
359
+ )
360
+
361
+ if schema:
362
+ context_parts.append("")
363
+ context_parts.append("Extraction schema:")
364
+ for field in schema.fields:
365
+ req = "required" if field.required else "optional"
366
+ context_parts.append(f"- {field.name} ({field.type.value}, {req}): {field.description}")
367
+
368
+ return "\n".join(context_parts)
369
+
370
+ async def _generate_step(
371
+ self,
372
+ task: str,
373
+ context: str,
374
+ previous_steps: List[ThoughtAction],
375
+ ) -> ThoughtAction:
376
+ """Generate the next thought-action step."""
377
+ # Build prompt
378
+ tool_descriptions = "\n".join(
379
+ f"- {name}: {info['description']}"
380
+ for name, info in self.TOOLS.items()
381
+ )
382
+
383
+ system_prompt = self.SYSTEM_PROMPT.format(tool_descriptions=tool_descriptions)
384
+
385
+ messages = [{"role": "system", "content": system_prompt}]
386
+
387
+ # Add task and context
388
+ user_content = f"TASK: {task}\n\nCONTEXT:\n{context}"
389
+
390
+ # Add previous steps
391
+ if previous_steps:
392
+ user_content += "\n\nPREVIOUS STEPS:"
393
+ for i, step in enumerate(previous_steps, 1):
394
+ user_content += f"\n\nStep {i}:"
395
+ user_content += f"\nTHOUGHT: {step.thought}"
396
+ user_content += f"\nACTION: {step.action.value}"
397
+ if step.tool_name:
398
+ user_content += f"\nTOOL: {step.tool_name}"
399
+ if step.observation:
400
+ user_content += f"\nOBSERVATION: {step.observation[:500]}..."
401
+
402
+ user_content += "\n\nNow generate your next step:"
403
+ messages.append({"role": "user", "content": user_content})
404
+
405
+ # Generate response
406
+ llm = self.llm_client.get_llm(complexity="complex", temperature=self.temperature)
407
+
408
+ from langchain_core.messages import HumanMessage, SystemMessage
409
+ lc_messages = [
410
+ SystemMessage(content=system_prompt),
411
+ HumanMessage(content=user_content),
412
+ ]
413
+
414
+ response = await llm.ainvoke(lc_messages)
415
+ response_text = response.content
416
+
417
+ # Parse response
418
+ return self._parse_step(response_text)
419
+
420
+ def _parse_step(self, response: str) -> ThoughtAction:
421
+ """Parse LLM response into ThoughtAction."""
422
+ thought = ""
423
+ action = AgentAction.THINK
424
+ tool_name = None
425
+ tool_args = None
426
+
427
+ lines = response.strip().split("\n")
428
+ current_section = None
429
+
430
+ for line in lines:
431
+ line = line.strip()
432
+
433
+ if line.startswith("THOUGHT:"):
434
+ current_section = "thought"
435
+ thought = line[8:].strip()
436
+ elif line.startswith("ACTION:"):
437
+ current_section = "action"
438
+ action_str = line[7:].strip().lower()
439
+ if action_str == "answer":
440
+ action = AgentAction.ANSWER
441
+ elif action_str == "abstain":
442
+ action = AgentAction.ABSTAIN
443
+ elif action_str in self.TOOLS:
444
+ action = AgentAction.USE_TOOL
445
+ tool_name = action_str
446
+ else:
447
+ action = AgentAction.USE_TOOL
448
+ tool_name = action_str
449
+ elif line.startswith("ACTION_INPUT:"):
450
+ current_section = "input"
451
+ input_str = line[13:].strip()
452
+ try:
453
+ tool_args = json.loads(input_str)
454
+ except json.JSONDecodeError:
455
+ tool_args = {"raw": input_str}
456
+ elif current_section == "thought":
457
+ thought += " " + line
458
+ elif current_section == "input":
459
+ try:
460
+ tool_args = json.loads(line)
461
+ except:
462
+ pass
463
+
464
+ return ThoughtAction(
465
+ thought=thought,
466
+ action=action,
467
+ tool_name=tool_name,
468
+ tool_args=tool_args,
469
+ )
470
+
471
+ async def _execute_tool(
472
+ self,
473
+ tool_name: str,
474
+ tool_args: Optional[Dict[str, Any]],
475
+ ) -> Tuple[str, List[EvidenceRef]]:
476
+ """Execute a tool and return observation."""
477
+ if not tool_args:
478
+ tool_args = {}
479
+
480
+ doc = self._current_document
481
+ evidence = []
482
+
483
+ try:
484
+ if tool_name == "extract_text":
485
+ return self._tool_extract_text(tool_args)
486
+
487
+ elif tool_name == "analyze_table":
488
+ return await self._tool_analyze_table(tool_args)
489
+
490
+ elif tool_name == "analyze_chart":
491
+ return await self._tool_analyze_chart(tool_args)
492
+
493
+ elif tool_name == "extract_fields":
494
+ return await self._tool_extract_fields(tool_args)
495
+
496
+ elif tool_name == "classify_document":
497
+ return self._tool_classify_document(tool_args)
498
+
499
+ elif tool_name == "search_text":
500
+ return self._tool_search_text(tool_args)
501
+
502
+ else:
503
+ return f"Unknown tool: {tool_name}", []
504
+
505
+ except Exception as e:
506
+ logger.error(f"Tool {tool_name} failed: {e}")
507
+ return f"Error executing {tool_name}: {e}", []
508
+
509
+ def _tool_extract_text(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
510
+ """Extract text from pages or regions."""
511
+ doc = self._current_document
512
+ page_numbers = args.get("page_numbers", list(range(doc.metadata.num_pages)))
513
+
514
+ if isinstance(page_numbers, int):
515
+ page_numbers = [page_numbers]
516
+
517
+ texts = []
518
+ evidence = []
519
+
520
+ for page in page_numbers:
521
+ page_chunks = doc.get_page_chunks(page)
522
+ for chunk in page_chunks:
523
+ texts.append(f"[Page {page + 1}]: {chunk.text}")
524
+ evidence.append(EvidenceRef(
525
+ chunk_id=chunk.chunk_id,
526
+ page=chunk.page,
527
+ bbox=chunk.bbox,
528
+ source_type="text",
529
+ snippet=chunk.text[:100],
530
+ confidence=chunk.confidence,
531
+ ))
532
+
533
+ return "\n".join(texts[:20]), evidence[:10]
534
+
535
+ async def _tool_analyze_table(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
536
+ """Analyze a table region."""
537
+ page = args.get("page", 0)
538
+ doc = self._current_document
539
+
540
+ # Find table chunks
541
+ table_chunks = [c for c in doc.chunks if c.chunk_type.value == "table" and c.page == page]
542
+
543
+ if not table_chunks:
544
+ return "No table found on this page", []
545
+
546
+ # Use LLM to analyze table
547
+ table_text = table_chunks[0].text
548
+ llm = self.llm_client.get_llm(complexity="standard")
549
+
550
+ from langchain_core.messages import HumanMessage
551
+ prompt = f"Analyze this table and extract structured data as JSON:\n\n{table_text}"
552
+ response = await llm.ainvoke([HumanMessage(content=prompt)])
553
+
554
+ evidence = [EvidenceRef(
555
+ chunk_id=table_chunks[0].chunk_id,
556
+ page=page,
557
+ bbox=table_chunks[0].bbox,
558
+ source_type="table",
559
+ snippet=table_text[:200],
560
+ confidence=table_chunks[0].confidence,
561
+ )]
562
+
563
+ return response.content, evidence
564
+
565
+ async def _tool_analyze_chart(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
566
+ """Analyze a chart region."""
567
+ page = args.get("page", 0)
568
+ doc = self._current_document
569
+
570
+ # Find chart/figure chunks
571
+ chart_chunks = [
572
+ c for c in doc.chunks
573
+ if c.chunk_type.value in ("chart", "figure") and c.page == page
574
+ ]
575
+
576
+ if not chart_chunks:
577
+ return "No chart/figure found on this page", []
578
+
579
+ # If we have the image, use vision model
580
+ if page in self._page_images:
581
+ # TODO: Use vision model for chart analysis
582
+ pass
583
+
584
+ return f"Chart found on page {page + 1}: {chart_chunks[0].caption or 'No caption'}", []
585
+
586
+ async def _tool_extract_fields(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
587
+ """Extract specific fields."""
588
+ schema_dict = args.get("schema", {})
589
+ doc = self._current_document
590
+
591
+ # Build context from chunks
592
+ context = "\n".join(c.text for c in doc.chunks[:20])
593
+
594
+ # Use LLM to extract
595
+ llm = self.llm_client.get_llm(complexity="complex")
596
+
597
+ from langchain_core.messages import HumanMessage, SystemMessage
598
+ system = "Extract the requested fields from the document. Output JSON with field names as keys."
599
+ user = f"Fields to extract: {json.dumps(schema_dict)}\n\nDocument content:\n{context}"
600
+
601
+ response = await llm.ainvoke([
602
+ SystemMessage(content=system),
603
+ HumanMessage(content=user),
604
+ ])
605
+
606
+ return response.content, []
607
+
608
+ def _tool_classify_document(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
609
+ """Classify document type based on first page."""
610
+ doc = self._current_document
611
+ first_page_chunks = doc.get_page_chunks(0)
612
+ text = " ".join(c.text for c in first_page_chunks[:5])
613
+
614
+ return f"First page content for classification:\n{text[:500]}", []
615
+
616
+ def _tool_search_text(self, args: Dict[str, Any]) -> Tuple[str, List[EvidenceRef]]:
617
+ """Search for text in document."""
618
+ query = args.get("query", "").lower()
619
+ doc = self._current_document
620
+
621
+ matches = []
622
+ evidence = []
623
+
624
+ for chunk in doc.chunks:
625
+ if query in chunk.text.lower():
626
+ matches.append(f"[Page {chunk.page + 1}]: ...{chunk.text}...")
627
+ evidence.append(EvidenceRef(
628
+ chunk_id=chunk.chunk_id,
629
+ page=chunk.page,
630
+ bbox=chunk.bbox,
631
+ source_type="text",
632
+ snippet=chunk.text[:100],
633
+ confidence=chunk.confidence,
634
+ ))
635
+
636
+ if not matches:
637
+ return f"No matches found for '{query}'", []
638
+
639
+ return f"Found {len(matches)} matches:\n" + "\n".join(matches[:10]), evidence[:10]
640
+
641
+ def _parse_answer(self, answer_input: Optional[Dict[str, Any]]) -> Any:
642
+ """Parse the final answer from tool args."""
643
+ if not answer_input:
644
+ return None
645
+
646
+ if isinstance(answer_input, dict):
647
+ return answer_input
648
+
649
+ return {"answer": answer_input}
650
+
651
+ def _calculate_confidence(self, steps: List[ThoughtAction]) -> float:
652
+ """Calculate overall confidence from trace."""
653
+ if not steps:
654
+ return 0.0
655
+
656
+ # Average evidence confidence
657
+ all_evidence = [e for s in steps if s.evidence for e in s.evidence]
658
+ if all_evidence:
659
+ return sum(e.confidence for e in all_evidence) / len(all_evidence)
660
+
661
+ return 0.5 # Default moderate confidence
src/cli/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """
2
+ SPARKNET Command Line Interface
3
+
4
+ Provides CLI commands for document intelligence and RAG operations.
5
+ """
6
+
7
+ from .main import app, main
8
+
9
+ __all__ = ["app", "main"]
src/cli/docint.py ADDED
@@ -0,0 +1,681 @@
1
+ """
2
+ Document Intelligence CLI Commands
3
+
4
+ CLI interface for the document_intelligence subsystem.
5
+ """
6
+
7
+ import json
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import List, Optional
11
+
12
+ import click
13
+
14
+
15
+ @click.group(name="docint")
16
+ def docint_cli():
17
+ """Document Intelligence commands."""
18
+ pass
19
+
20
+
21
+ @docint_cli.command()
22
+ @click.argument("path", type=click.Path(exists=True))
23
+ @click.option("--output", "-o", type=click.Path(), help="Output JSON file")
24
+ @click.option("--max-pages", type=int, help="Maximum pages to process")
25
+ @click.option("--dpi", type=int, default=200, help="Render DPI (default: 200)")
26
+ @click.option("--format", "output_format", type=click.Choice(["json", "markdown", "text"]),
27
+ default="json", help="Output format")
28
+ def parse(path: str, output: Optional[str], max_pages: Optional[int],
29
+ dpi: int, output_format: str):
30
+ """
31
+ Parse a document into semantic chunks.
32
+
33
+ Example:
34
+ sparknet docint parse invoice.pdf -o result.json
35
+ sparknet docint parse document.pdf --format markdown
36
+ """
37
+ from src.document_intelligence import (
38
+ DocumentParser,
39
+ ParserConfig,
40
+ )
41
+
42
+ config = ParserConfig(
43
+ render_dpi=dpi,
44
+ max_pages=max_pages,
45
+ )
46
+
47
+ parser = DocumentParser(config=config)
48
+
49
+ click.echo(f"Parsing: {path}")
50
+
51
+ try:
52
+ result = parser.parse(path)
53
+
54
+ if output_format == "json":
55
+ output_data = {
56
+ "doc_id": result.doc_id,
57
+ "filename": result.filename,
58
+ "num_pages": result.num_pages,
59
+ "chunks": [
60
+ {
61
+ "chunk_id": c.chunk_id,
62
+ "type": c.chunk_type.value,
63
+ "text": c.text,
64
+ "page": c.page,
65
+ "bbox": c.bbox.xyxy,
66
+ "confidence": c.confidence,
67
+ }
68
+ for c in result.chunks
69
+ ],
70
+ "processing_time_ms": result.processing_time_ms,
71
+ }
72
+
73
+ if output:
74
+ with open(output, "w") as f:
75
+ json.dump(output_data, f, indent=2)
76
+ click.echo(f"Output written to: {output}")
77
+ else:
78
+ click.echo(json.dumps(output_data, indent=2))
79
+
80
+ elif output_format == "markdown":
81
+ if output:
82
+ with open(output, "w") as f:
83
+ f.write(result.markdown_full)
84
+ click.echo(f"Markdown written to: {output}")
85
+ else:
86
+ click.echo(result.markdown_full)
87
+
88
+ else: # text
89
+ for chunk in result.chunks:
90
+ click.echo(f"[Page {chunk.page}, {chunk.chunk_type.value}]")
91
+ click.echo(chunk.text)
92
+ click.echo()
93
+
94
+ click.echo(f"\nParsed {len(result.chunks)} chunks in {result.processing_time_ms:.0f}ms")
95
+
96
+ except Exception as e:
97
+ click.echo(f"Error: {e}", err=True)
98
+ sys.exit(1)
99
+
100
+
101
+ @docint_cli.command()
102
+ @click.argument("path", type=click.Path(exists=True))
103
+ @click.option("--field", "-f", multiple=True, help="Field to extract (can specify multiple)")
104
+ @click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema file")
105
+ @click.option("--preset", type=click.Choice(["invoice", "receipt", "contract"]),
106
+ help="Use preset schema")
107
+ @click.option("--output", "-o", type=click.Path(), help="Output JSON file")
108
+ def extract(path: str, field: tuple, schema: Optional[str], preset: Optional[str],
109
+ output: Optional[str]):
110
+ """
111
+ Extract fields from a document.
112
+
113
+ Example:
114
+ sparknet docint extract invoice.pdf --preset invoice
115
+ sparknet docint extract doc.pdf -f vendor_name -f total_amount
116
+ sparknet docint extract doc.pdf --schema my_schema.json
117
+ """
118
+ from src.document_intelligence import (
119
+ DocumentParser,
120
+ FieldExtractor,
121
+ ExtractionSchema,
122
+ FieldSpec,
123
+ FieldType,
124
+ create_invoice_schema,
125
+ create_receipt_schema,
126
+ create_contract_schema,
127
+ )
128
+
129
+ # Build schema
130
+ if preset:
131
+ if preset == "invoice":
132
+ extraction_schema = create_invoice_schema()
133
+ elif preset == "receipt":
134
+ extraction_schema = create_receipt_schema()
135
+ elif preset == "contract":
136
+ extraction_schema = create_contract_schema()
137
+ elif schema:
138
+ with open(schema) as f:
139
+ schema_dict = json.load(f)
140
+ extraction_schema = ExtractionSchema.from_json_schema(schema_dict)
141
+ elif field:
142
+ extraction_schema = ExtractionSchema(name="custom")
143
+ for f in field:
144
+ extraction_schema.add_string_field(f, required=True)
145
+ else:
146
+ click.echo("Error: Specify --field, --schema, or --preset", err=True)
147
+ sys.exit(1)
148
+
149
+ click.echo(f"Extracting from: {path}")
150
+ click.echo(f"Fields: {', '.join(f.name for f in extraction_schema.fields)}")
151
+
152
+ try:
153
+ # Parse document
154
+ parser = DocumentParser()
155
+ parse_result = parser.parse(path)
156
+
157
+ # Extract fields
158
+ extractor = FieldExtractor()
159
+ result = extractor.extract(parse_result, extraction_schema)
160
+
161
+ output_data = {
162
+ "doc_id": parse_result.doc_id,
163
+ "filename": parse_result.filename,
164
+ "extracted_data": result.data,
165
+ "confidence": result.overall_confidence,
166
+ "abstained_fields": result.abstained_fields,
167
+ "evidence": [
168
+ {
169
+ "chunk_id": e.chunk_id,
170
+ "page": e.page,
171
+ "bbox": e.bbox.xyxy,
172
+ "snippet": e.snippet,
173
+ }
174
+ for e in result.evidence
175
+ ],
176
+ }
177
+
178
+ if output:
179
+ with open(output, "w") as f:
180
+ json.dump(output_data, f, indent=2)
181
+ click.echo(f"Output written to: {output}")
182
+ else:
183
+ click.echo("\nExtracted Data:")
184
+ for key, value in result.data.items():
185
+ status = "" if key not in result.abstained_fields else " [ABSTAINED]"
186
+ click.echo(f" {key}: {value}{status}")
187
+
188
+ click.echo(f"\nConfidence: {result.overall_confidence:.2f}")
189
+
190
+ if result.abstained_fields:
191
+ click.echo(f"Abstained: {', '.join(result.abstained_fields)}")
192
+
193
+ except Exception as e:
194
+ click.echo(f"Error: {e}", err=True)
195
+ sys.exit(1)
196
+
197
+
198
+ @docint_cli.command()
199
+ @click.argument("path", type=click.Path(exists=True))
200
+ @click.argument("question")
201
+ @click.option("--verbose", "-v", is_flag=True, help="Show evidence details")
202
+ @click.option("--use-rag", is_flag=True, help="Use RAG for retrieval (requires indexed document)")
203
+ @click.option("--document-id", "-d", help="Document ID for RAG retrieval")
204
+ @click.option("--top-k", "-k", type=int, default=5, help="Number of chunks to consider")
205
+ @click.option("--chunk-type", "-t", multiple=True, help="Filter by chunk type (can specify multiple)")
206
+ @click.option("--page-start", type=int, help="Filter by page range start")
207
+ @click.option("--page-end", type=int, help="Filter by page range end")
208
+ def ask(path: str, question: str, verbose: bool, use_rag: bool,
209
+ document_id: Optional[str], top_k: int, chunk_type: tuple,
210
+ page_start: Optional[int], page_end: Optional[int]):
211
+ """
212
+ Ask a question about a document.
213
+
214
+ Example:
215
+ sparknet docint ask invoice.pdf "What is the total amount?"
216
+ sparknet docint ask doc.pdf "Find claims" --use-rag --top-k 10
217
+ sparknet docint ask doc.pdf "What tables show?" -t table --use-rag
218
+ """
219
+ from src.document_intelligence import DocumentParser
220
+
221
+ click.echo(f"Document: {path}")
222
+ click.echo(f"Question: {question}")
223
+
224
+ if use_rag:
225
+ click.echo("Mode: RAG (semantic retrieval)")
226
+ else:
227
+ click.echo("Mode: Keyword search")
228
+
229
+ click.echo()
230
+
231
+ try:
232
+ if use_rag:
233
+ # Use RAG-based answering
234
+ from src.document_intelligence.tools import get_rag_tool
235
+
236
+ tool = get_rag_tool("rag_answer")
237
+
238
+ # Build page range if specified
239
+ page_range = None
240
+ if page_start is not None and page_end is not None:
241
+ page_range = (page_start, page_end)
242
+
243
+ result = tool.execute(
244
+ question=question,
245
+ document_id=document_id,
246
+ top_k=top_k,
247
+ chunk_types=list(chunk_type) if chunk_type else None,
248
+ page_range=page_range,
249
+ )
250
+ else:
251
+ # Parse document and use keyword-based search
252
+ from src.document_intelligence.tools import get_tool
253
+
254
+ parser = DocumentParser()
255
+ parse_result = parser.parse(path)
256
+
257
+ tool = get_tool("answer_question")
258
+ result = tool.execute(
259
+ parse_result=parse_result,
260
+ question=question,
261
+ top_k=top_k,
262
+ )
263
+
264
+ if result.success:
265
+ data = result.data
266
+ click.echo(f"Answer: {data.get('answer', 'No answer found')}")
267
+ click.echo(f"Confidence: {data.get('confidence', 0):.2f}")
268
+
269
+ if data.get('abstained'):
270
+ click.echo("Note: The system abstained due to low confidence.")
271
+
272
+ if verbose and result.evidence:
273
+ click.echo("\nEvidence:")
274
+ for ev in result.evidence:
275
+ click.echo(f" - Page {ev.get('page', '?')}: {ev.get('snippet', '')[:100]}...")
276
+
277
+ if data.get('citations'):
278
+ click.echo("\nCitations:")
279
+ for cit in data['citations']:
280
+ click.echo(f" [{cit['index']}] {cit.get('text', '')[:80]}...")
281
+ else:
282
+ click.echo(f"Error: {result.error}", err=True)
283
+
284
+ except Exception as e:
285
+ click.echo(f"Error: {e}", err=True)
286
+ sys.exit(1)
287
+
288
+
289
+ @docint_cli.command()
290
+ @click.argument("path", type=click.Path(exists=True))
291
+ @click.option("--output", "-o", type=click.Path(), help="Output JSON file")
292
+ def classify(path: str, output: Optional[str]):
293
+ """
294
+ Classify a document's type.
295
+
296
+ Example:
297
+ sparknet docint classify document.pdf
298
+ """
299
+ from src.document_intelligence import DocumentParser
300
+ from src.document_intelligence.chunks import DocumentType
301
+
302
+ click.echo(f"Classifying: {path}")
303
+
304
+ try:
305
+ # Parse document
306
+ parser = DocumentParser()
307
+ parse_result = parser.parse(path)
308
+
309
+ # Simple classification based on keywords
310
+ first_page_chunks = [c for c in parse_result.chunks if c.page == 1][:5]
311
+ content = " ".join(c.text[:200] for c in first_page_chunks).lower()
312
+
313
+ doc_type = "other"
314
+ confidence = 0.5
315
+
316
+ type_keywords = {
317
+ "invoice": ["invoice", "bill", "payment due", "amount due", "invoice number"],
318
+ "contract": ["agreement", "contract", "party", "whereas", "terms and conditions"],
319
+ "receipt": ["receipt", "paid", "transaction", "thank you for your purchase"],
320
+ "form": ["form", "fill in", "checkbox", "signature line"],
321
+ "letter": ["dear", "sincerely", "regards", "to whom it may concern"],
322
+ "report": ["report", "findings", "conclusion", "summary", "analysis"],
323
+ "patent": ["patent", "claims", "invention", "embodiment", "disclosed"],
324
+ }
325
+
326
+ for dtype, keywords in type_keywords.items():
327
+ matches = sum(1 for k in keywords if k in content)
328
+ if matches >= 2:
329
+ doc_type = dtype
330
+ confidence = min(0.95, 0.5 + matches * 0.15)
331
+ break
332
+
333
+ output_data = {
334
+ "doc_id": parse_result.doc_id,
335
+ "filename": parse_result.filename,
336
+ "document_type": doc_type,
337
+ "confidence": confidence,
338
+ }
339
+
340
+ if output:
341
+ with open(output, "w") as f:
342
+ json.dump(output_data, f, indent=2)
343
+ click.echo(f"Output written to: {output}")
344
+ else:
345
+ click.echo(f"Type: {doc_type}")
346
+ click.echo(f"Confidence: {confidence:.2f}")
347
+
348
+ except Exception as e:
349
+ click.echo(f"Error: {e}", err=True)
350
+ sys.exit(1)
351
+
352
+
353
+ @docint_cli.command()
354
+ @click.argument("path", type=click.Path(exists=True))
355
+ @click.option("--query", "-q", help="Search query")
356
+ @click.option("--type", "chunk_type", help="Filter by chunk type")
357
+ @click.option("--top", "-k", type=int, default=10, help="Number of results")
358
+ def search(path: str, query: Optional[str], chunk_type: Optional[str], top: int):
359
+ """
360
+ Search document content.
361
+
362
+ Example:
363
+ sparknet docint search document.pdf -q "payment terms"
364
+ sparknet docint search document.pdf --type table
365
+ """
366
+ from src.document_intelligence import DocumentParser
367
+ from src.document_intelligence.tools import get_tool
368
+
369
+ click.echo(f"Searching: {path}")
370
+
371
+ try:
372
+ # Parse document
373
+ parser = DocumentParser()
374
+ parse_result = parser.parse(path)
375
+
376
+ if query:
377
+ # Search by query
378
+ tool = get_tool("search_chunks")
379
+ result = tool.execute(
380
+ parse_result=parse_result,
381
+ query=query,
382
+ chunk_types=[chunk_type] if chunk_type else None,
383
+ top_k=top,
384
+ )
385
+
386
+ if result.success:
387
+ results = result.data.get("results", [])
388
+ click.echo(f"Found {len(results)} results:\n")
389
+
390
+ for i, r in enumerate(results, 1):
391
+ click.echo(f"{i}. [Page {r['page']}, {r['type']}] (score: {r['score']:.2f})")
392
+ click.echo(f" {r['text'][:200]}...")
393
+ click.echo()
394
+ else:
395
+ click.echo(f"Error: {result.error}", err=True)
396
+
397
+ elif chunk_type:
398
+ # Filter by type
399
+ matching = [c for c in parse_result.chunks if c.chunk_type.value == chunk_type]
400
+ click.echo(f"Found {len(matching)} {chunk_type} chunks:\n")
401
+
402
+ for i, chunk in enumerate(matching[:top], 1):
403
+ click.echo(f"{i}. [Page {chunk.page}] {chunk.chunk_id}")
404
+ click.echo(f" {chunk.text[:200]}...")
405
+ click.echo()
406
+
407
+ else:
408
+ # List all chunks
409
+ click.echo(f"Total chunks: {len(parse_result.chunks)}\n")
410
+
411
+ # Group by type
412
+ by_type = {}
413
+ for chunk in parse_result.chunks:
414
+ t = chunk.chunk_type.value
415
+ by_type[t] = by_type.get(t, 0) + 1
416
+
417
+ click.echo("Chunk types:")
418
+ for t, count in sorted(by_type.items()):
419
+ click.echo(f" {t}: {count}")
420
+
421
+ except Exception as e:
422
+ click.echo(f"Error: {e}", err=True)
423
+ sys.exit(1)
424
+
425
+
426
+ @docint_cli.command()
427
+ @click.argument("path", type=click.Path(exists=True))
428
+ @click.option("--page", "-p", type=int, default=1, help="Page number")
429
+ @click.option("--output-dir", "-d", type=click.Path(), default="./crops",
430
+ help="Output directory for crops")
431
+ @click.option("--annotate", "-a", is_flag=True, help="Create annotated page image")
432
+ def visualize(path: str, page: int, output_dir: str, annotate: bool):
433
+ """
434
+ Visualize document regions.
435
+
436
+ Example:
437
+ sparknet docint visualize document.pdf --page 1 --annotate
438
+ """
439
+ from src.document_intelligence import (
440
+ DocumentParser,
441
+ load_document,
442
+ RenderOptions,
443
+ )
444
+ from src.document_intelligence.grounding import create_annotated_image, CropManager
445
+ from PIL import Image
446
+ import numpy as np
447
+
448
+ output_path = Path(output_dir)
449
+ output_path.mkdir(parents=True, exist_ok=True)
450
+
451
+ click.echo(f"Processing: {path}, page {page}")
452
+
453
+ try:
454
+ # Parse document
455
+ parser = DocumentParser()
456
+ parse_result = parser.parse(path)
457
+
458
+ # Load and render page
459
+ loader, renderer = load_document(path)
460
+ page_image = renderer.render_page(page, RenderOptions(dpi=200))
461
+ loader.close()
462
+
463
+ # Get page chunks
464
+ page_chunks = [c for c in parse_result.chunks if c.page == page]
465
+
466
+ if annotate:
467
+ # Create annotated image
468
+ bboxes = [c.bbox for c in page_chunks]
469
+ labels = [f"{c.chunk_type.value[:10]}" for c in page_chunks]
470
+
471
+ annotated = create_annotated_image(page_image, bboxes, labels)
472
+
473
+ output_file = output_path / f"annotated_page_{page}.png"
474
+ Image.fromarray(annotated).save(output_file)
475
+ click.echo(f"Saved annotated image: {output_file}")
476
+
477
+ else:
478
+ # Save individual crops
479
+ crop_manager = CropManager(output_path)
480
+
481
+ for chunk in page_chunks:
482
+ crop_path = crop_manager.save_crop(
483
+ page_image,
484
+ parse_result.doc_id,
485
+ page,
486
+ chunk.bbox,
487
+ )
488
+ click.echo(f"Saved crop: {crop_path}")
489
+
490
+ click.echo(f"\nProcessed {len(page_chunks)} chunks from page {page}")
491
+
492
+ except Exception as e:
493
+ click.echo(f"Error: {e}", err=True)
494
+ sys.exit(1)
495
+
496
+
497
+ @docint_cli.command()
498
+ @click.argument("paths", nargs=-1, type=click.Path(exists=True), required=True)
499
+ @click.option("--max-pages", type=int, help="Maximum pages to process per document")
500
+ @click.option("--batch-size", type=int, default=32, help="Embedding batch size")
501
+ @click.option("--min-length", type=int, default=10, help="Minimum chunk text length")
502
+ def index(paths: tuple, max_pages: Optional[int], batch_size: int, min_length: int):
503
+ """
504
+ Index documents into the vector store for RAG.
505
+
506
+ Example:
507
+ sparknet docint index document.pdf
508
+ sparknet docint index *.pdf --max-pages 50
509
+ sparknet docint index doc1.pdf doc2.pdf doc3.pdf
510
+ """
511
+ from src.document_intelligence.tools import get_rag_tool
512
+
513
+ click.echo(f"Indexing {len(paths)} document(s)...")
514
+ click.echo()
515
+
516
+ try:
517
+ tool = get_rag_tool("index_document")
518
+
519
+ total_indexed = 0
520
+ total_skipped = 0
521
+ errors = []
522
+
523
+ for path in paths:
524
+ click.echo(f"Processing: {path}")
525
+
526
+ result = tool.execute(
527
+ path=path,
528
+ max_pages=max_pages,
529
+ )
530
+
531
+ if result.success:
532
+ data = result.data
533
+ indexed = data.get("chunks_indexed", 0)
534
+ skipped = data.get("chunks_skipped", 0)
535
+ total_indexed += indexed
536
+ total_skipped += skipped
537
+ click.echo(f" Indexed: {indexed} chunks, Skipped: {skipped}")
538
+ click.echo(f" Document ID: {data.get('document_id', 'unknown')}")
539
+ else:
540
+ errors.append((path, result.error))
541
+ click.echo(f" Error: {result.error}", err=True)
542
+
543
+ click.echo()
544
+ click.echo("=" * 40)
545
+ click.echo(f"Total documents: {len(paths)}")
546
+ click.echo(f"Total chunks indexed: {total_indexed}")
547
+ click.echo(f"Total chunks skipped: {total_skipped}")
548
+
549
+ if errors:
550
+ click.echo(f"Errors: {len(errors)}")
551
+ for path, err in errors:
552
+ click.echo(f" - {path}: {err}")
553
+
554
+ except Exception as e:
555
+ click.echo(f"Error: {e}", err=True)
556
+ sys.exit(1)
557
+
558
+
559
+ @docint_cli.command(name="index-stats")
560
+ def index_stats():
561
+ """
562
+ Show statistics about the vector store index.
563
+
564
+ Example:
565
+ sparknet docint index-stats
566
+ """
567
+ from src.document_intelligence.tools import get_rag_tool
568
+
569
+ try:
570
+ tool = get_rag_tool("get_index_stats")
571
+ result = tool.execute()
572
+
573
+ if result.success:
574
+ data = result.data
575
+ click.echo("Vector Store Statistics:")
576
+ click.echo(f" Total chunks: {data.get('total_chunks', 0)}")
577
+ click.echo(f" Embedding model: {data.get('embedding_model', 'unknown')}")
578
+ click.echo(f" Embedding dimension: {data.get('embedding_dimension', 'unknown')}")
579
+ else:
580
+ click.echo(f"Error: {result.error}", err=True)
581
+
582
+ except Exception as e:
583
+ click.echo(f"Error: {e}", err=True)
584
+ sys.exit(1)
585
+
586
+
587
+ @docint_cli.command(name="delete-index")
588
+ @click.argument("document_id")
589
+ @click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompt")
590
+ def delete_index(document_id: str, yes: bool):
591
+ """
592
+ Delete a document from the vector store index.
593
+
594
+ Example:
595
+ sparknet docint delete-index doc_abc123
596
+ """
597
+ from src.document_intelligence.tools import get_rag_tool
598
+
599
+ if not yes:
600
+ click.confirm(f"Delete document '{document_id}' from index?", abort=True)
601
+
602
+ try:
603
+ tool = get_rag_tool("delete_document")
604
+ result = tool.execute(document_id=document_id)
605
+
606
+ if result.success:
607
+ data = result.data
608
+ click.echo(f"Deleted {data.get('chunks_deleted', 0)} chunks for document: {document_id}")
609
+ else:
610
+ click.echo(f"Error: {result.error}", err=True)
611
+
612
+ except Exception as e:
613
+ click.echo(f"Error: {e}", err=True)
614
+ sys.exit(1)
615
+
616
+
617
+ @docint_cli.command(name="retrieve")
618
+ @click.argument("query")
619
+ @click.option("--top-k", "-k", type=int, default=5, help="Number of results")
620
+ @click.option("--document-id", "-d", help="Filter by document ID")
621
+ @click.option("--chunk-type", "-t", multiple=True, help="Filter by chunk type")
622
+ @click.option("--page-start", type=int, help="Filter by page range start")
623
+ @click.option("--page-end", type=int, help="Filter by page range end")
624
+ @click.option("--verbose", "-v", is_flag=True, help="Show full chunk text")
625
+ def retrieve(query: str, top_k: int, document_id: Optional[str],
626
+ chunk_type: tuple, page_start: Optional[int],
627
+ page_end: Optional[int], verbose: bool):
628
+ """
629
+ Retrieve relevant chunks from the vector store.
630
+
631
+ Example:
632
+ sparknet docint retrieve "payment terms"
633
+ sparknet docint retrieve "claims" -d doc_abc123 -t paragraph -k 10
634
+ """
635
+ from src.document_intelligence.tools import get_rag_tool
636
+
637
+ click.echo(f"Query: {query}")
638
+ click.echo()
639
+
640
+ try:
641
+ tool = get_rag_tool("retrieve_chunks")
642
+
643
+ page_range = None
644
+ if page_start is not None and page_end is not None:
645
+ page_range = (page_start, page_end)
646
+
647
+ result = tool.execute(
648
+ query=query,
649
+ top_k=top_k,
650
+ document_id=document_id,
651
+ chunk_types=list(chunk_type) if chunk_type else None,
652
+ page_range=page_range,
653
+ )
654
+
655
+ if result.success:
656
+ data = result.data
657
+ chunks = data.get("chunks", [])
658
+ click.echo(f"Found {len(chunks)} results:\n")
659
+
660
+ for i, chunk in enumerate(chunks, 1):
661
+ click.echo(f"{i}. [sim={chunk['similarity']:.3f}] Page {chunk.get('page', '?')}, {chunk.get('chunk_type', 'text')}")
662
+ click.echo(f" Document: {chunk['document_id']}")
663
+
664
+ text = chunk['text']
665
+ if verbose:
666
+ click.echo(f" Text: {text}")
667
+ else:
668
+ click.echo(f" Text: {text[:150]}...")
669
+ click.echo()
670
+ else:
671
+ click.echo(f"Error: {result.error}", err=True)
672
+
673
+ except Exception as e:
674
+ click.echo(f"Error: {e}", err=True)
675
+ sys.exit(1)
676
+
677
+
678
+ # Register with main CLI
679
+ def register_commands(cli):
680
+ """Register docint commands with main CLI."""
681
+ cli.add_command(docint_cli)
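
Note (not part of this commit): a minimal sketch of how `register_commands` is intended to be wired into a Click root group. The root group name "sparknet" and this standalone wiring file are assumptions for illustration only; the only grounded call is `register_commands(cli)`, which adds the `docint` group defined above.

    # Hypothetical wiring sketch, assuming a Click root group named "sparknet".
    import click

    from src.cli.docint import register_commands

    @click.group(name="sparknet")
    def cli():
        """SPARKNET root CLI group."""

    register_commands(cli)  # attaches the `docint` sub-command group

    if __name__ == "__main__":
        cli()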
src/cli/document.py ADDED
@@ -0,0 +1,322 @@
+ """
+ Document Processing CLI Commands
+
+ Commands:
+     sparknet document parse <file>             - Parse and extract text from document
+     sparknet document extract <file>           - Extract structured fields
+     sparknet document classify <file>          - Classify document type
+     sparknet document ask <file> <question>    - Ask a question about a document
+ """
+
+ import typer
+ from typing import Optional, List
+ from pathlib import Path
+ import json
+
+ # Create document sub-app
+ document_app = typer.Typer(
+     name="document",
+     help="Document processing commands",
+ )
+
+
+ @document_app.command("parse")
+ def parse_document(
+     file_path: Path = typer.Argument(..., help="Path to document file"),
+     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
+     ocr_engine: str = typer.Option("paddleocr", "--ocr", help="OCR engine: paddleocr, tesseract"),
+     dpi: int = typer.Option(300, "--dpi", help="Rendering DPI for PDFs"),
+     max_pages: Optional[int] = typer.Option(None, "--max-pages", help="Maximum pages to process"),
+     include_images: bool = typer.Option(False, "--images", help="Include cropped region images"),
+ ):
+     """
+     Parse a document and extract text with layout information.
+
+     Example:
+         sparknet document parse invoice.pdf -o result.json
+     """
+     if not file_path.exists():
+         typer.echo(f"Error: File not found: {file_path}", err=True)
+         raise typer.Exit(1)
+
+     typer.echo(f"Parsing document: {file_path}")
+
+     try:
+         from ..document.pipeline import (
+             PipelineConfig,
+             get_document_processor,
+         )
+         from ..document.ocr import OCRConfig
+
+         # Build config
+         ocr_config = OCRConfig(engine=ocr_engine)
+         config = PipelineConfig(
+             ocr=ocr_config,
+             render_dpi=dpi,
+             max_pages=max_pages,
+         )
+
+         # Process document
+         processor = get_document_processor(config)
+         result = processor.process(str(file_path))
+
+         # Format output
+         output_data = {
+             "document_id": result.metadata.document_id,
+             "filename": result.metadata.filename,
+             "num_pages": result.metadata.num_pages,
+             "total_chunks": result.metadata.total_chunks,
+             "total_characters": result.metadata.total_characters,
+             "ocr_confidence": result.metadata.ocr_confidence_avg,
+             "chunks": [
+                 {
+                     "chunk_id": c.chunk_id,
+                     "type": c.chunk_type.value,
+                     "page": c.page,
+                     "text": c.text[:500] + "..." if len(c.text) > 500 else c.text,
+                     "confidence": c.confidence,
+                     "bbox": {
+                         "x_min": c.bbox.x_min,
+                         "y_min": c.bbox.y_min,
+                         "x_max": c.bbox.x_max,
+                         "y_max": c.bbox.y_max,
+                     },
+                 }
+                 for c in result.chunks
+             ],
+             "full_text": result.full_text[:2000] + "..." if len(result.full_text) > 2000 else result.full_text,
+         }
+
+         # Output
+         if output:
+             with open(output, "w") as f:
+                 json.dump(output_data, f, indent=2)
+             typer.echo(f"Results written to: {output}")
+         else:
+             typer.echo(json.dumps(output_data, indent=2))
+
+         typer.echo(f"\nProcessed {result.metadata.num_pages} pages, {len(result.chunks)} chunks")
+
+     except ImportError as e:
+         typer.echo(f"Error: Missing dependency - {e}", err=True)
+         raise typer.Exit(1)
+     except Exception as e:
+         typer.echo(f"Error processing document: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ @document_app.command("extract")
+ def extract_fields(
+     file_path: Path = typer.Argument(..., help="Path to document file"),
+     schema: Optional[Path] = typer.Option(None, "--schema", "-s", help="Extraction schema YAML file"),
+     fields: Optional[List[str]] = typer.Option(None, "--field", "-f", help="Fields to extract (can use multiple)"),
+     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
+     validate: bool = typer.Option(True, "--validate/--no-validate", help="Validate extraction"),
+ ):
+     """
+     Extract structured fields from a document.
+
+     Example:
+         sparknet document extract invoice.pdf -f "invoice_number" -f "total_amount"
+         sparknet document extract contract.pdf --schema contract_schema.yaml
+     """
+     if not file_path.exists():
+         typer.echo(f"Error: File not found: {file_path}", err=True)
+         raise typer.Exit(1)
+
+     if not schema and not fields:
+         typer.echo("Error: Provide --schema or --field options", err=True)
+         raise typer.Exit(1)
+
+     typer.echo(f"Extracting fields from: {file_path}")
+
+     try:
+         from ..document.schemas.extraction import ExtractionSchema, FieldDefinition
+         from ..agents.document_agent import DocumentAgent
+
+         # Build extraction schema
+         if schema:
+             import yaml
+             with open(schema) as f:
+                 schema_data = yaml.safe_load(f)
+             extraction_schema = ExtractionSchema(**schema_data)
+         else:
+             # Build from field names
+             field_defs = [
+                 FieldDefinition(
+                     name=f,
+                     field_type="string",
+                     required=True,
+                 )
+                 for f in fields
+             ]
+             extraction_schema = ExtractionSchema(
+                 name="cli_extraction",
+                 fields=field_defs,
+             )
+
+         # Run extraction with agent
+         import asyncio
+         agent = DocumentAgent()
+         asyncio.run(agent.load_document(str(file_path)))
+         result = asyncio.run(agent.extract_fields(extraction_schema))
+
+         # Format output
+         output_data = {
+             "document": str(file_path),
+             "fields": result.fields,
+             "confidence": result.confidence,
+             "evidence": [
+                 {
+                     "chunk_id": e.chunk_id,
+                     "page": e.page,
+                     "snippet": e.snippet,
+                 }
+                 for e in result.evidence
+             ] if result.evidence else [],
+         }
+
+         # Validate if requested
+         if validate and result.fields:
+             from ..document.validation import get_extraction_critic
+             critic = get_extraction_critic()
+
+             evidence_chunks = [
+                 {"text": e.snippet, "page": e.page, "chunk_id": e.chunk_id}
+                 for e in result.evidence
+             ] if result.evidence else []
+
+             validation = critic.validate_extraction(result.fields, evidence_chunks)
+             output_data["validation"] = {
+                 "status": validation.overall_status.value,
+                 "confidence": validation.overall_confidence,
+                 "should_accept": validation.should_accept,
+                 "abstain_reason": validation.abstain_reason,
+             }
+
+         # Output
+         if output:
+             with open(output, "w") as f:
+                 json.dump(output_data, f, indent=2)
+             typer.echo(f"Results written to: {output}")
+         else:
+             typer.echo(json.dumps(output_data, indent=2))
+
+     except ImportError as e:
+         typer.echo(f"Error: Missing dependency - {e}", err=True)
+         raise typer.Exit(1)
+     except Exception as e:
+         typer.echo(f"Error extracting fields: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ @document_app.command("classify")
+ def classify_document(
+     file_path: Path = typer.Argument(..., help="Path to document file"),
+     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
+ ):
+     """
+     Classify document type.
+
+     Example:
+         sparknet document classify document.pdf
+     """
+     if not file_path.exists():
+         typer.echo(f"Error: File not found: {file_path}", err=True)
+         raise typer.Exit(1)
+
+     typer.echo(f"Classifying document: {file_path}")
+
+     try:
+         from ..agents.document_agent import DocumentAgent
+         import asyncio
+
+         agent = DocumentAgent()
+         asyncio.run(agent.load_document(str(file_path)))
+         classification = asyncio.run(agent.classify())
+
+         output_data = {
+             "document": str(file_path),
+             "document_type": classification.document_type.value,
+             "confidence": classification.confidence,
+             "reasoning": classification.reasoning,
+             "metadata": classification.metadata,
+         }
+
+         if output:
+             with open(output, "w") as f:
+                 json.dump(output_data, f, indent=2)
+             typer.echo(f"Results written to: {output}")
+         else:
+             typer.echo(json.dumps(output_data, indent=2))
+
+     except Exception as e:
+         typer.echo(f"Error classifying document: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ @document_app.command("ask")
+ def ask_document(
+     file_path: Path = typer.Argument(..., help="Path to document file"),
+     question: str = typer.Argument(..., help="Question to ask about the document"),
+     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
+ ):
+     """
+     Ask a question about a document.
+
+     Example:
+         sparknet document ask invoice.pdf "What is the total amount?"
+     """
+     if not file_path.exists():
+         typer.echo(f"Error: File not found: {file_path}", err=True)
+         raise typer.Exit(1)
+
+     typer.echo(f"Processing question for: {file_path}")
+
+     try:
+         from ..agents.document_agent import DocumentAgent
+         import asyncio
+
+         agent = DocumentAgent()
+         asyncio.run(agent.load_document(str(file_path)))
+         answer, evidence = asyncio.run(agent.answer_question(question))
+
+         output_data = {
+             "document": str(file_path),
+             "question": question,
+             "answer": answer,
+             "evidence": [
+                 {
+                     "chunk_id": e.chunk_id,
+                     "page": e.page,
+                     "snippet": e.snippet,
+                     "confidence": e.confidence,
+                 }
+                 for e in evidence
+             ] if evidence else [],
+         }
+
+         if output:
+             with open(output, "w") as f:
+                 json.dump(output_data, f, indent=2)
+             typer.echo(f"Results written to: {output}")
+         else:
+             typer.echo(f"\nQuestion: {question}")
+             typer.echo(f"\nAnswer: {answer}")
+             if evidence:
+                 typer.echo(f"\nEvidence ({len(evidence)} sources):")
+                 for e in evidence[:3]:
+                     typer.echo(f"  - Page {e.page + 1}: {e.snippet[:100]}...")
+
+     except Exception as e:
+         typer.echo(f"Error processing question: {e}", err=True)
+         raise typer.Exit(1)
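
Note (not part of this commit): `extract_fields` builds an ExtractionSchema either from a YAML file or from repeated --field options. A sketch of the programmatic equivalent, using only the constructors the command itself calls; the field names here are illustrative:

    # Hypothetical: build the same schema the CLI builds from --field options.
    from src.document.schemas.extraction import ExtractionSchema, FieldDefinition

    schema = ExtractionSchema(
        name="cli_extraction",
        fields=[
            FieldDefinition(name="invoice_number", field_type="string", required=True),
            FieldDefinition(name="total_amount", field_type="string", required=True),
        ],
    )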
src/cli/main.py ADDED
@@ -0,0 +1,110 @@
+ """
+ SPARKNET CLI Main Entry Point
+
+ Usage:
+     sparknet document parse <file>
+     sparknet document extract <file> --schema <schema.yaml>
+     sparknet rag index <file>
+     sparknet rag ask <question>
+ """
+
+ import typer
+
+ from .document import document_app
+ from .rag import rag_app
+
+ # Create main app
+ app = typer.Typer(
+     name="sparknet",
+     help="SPARKNET Document Intelligence CLI",
+     add_completion=False,
+ )
+
+ # Register sub-commands
+ app.add_typer(document_app, name="document", help="Document processing commands")
+ app.add_typer(rag_app, name="rag", help="RAG and retrieval commands")
+
+
+ @app.command()
+ def version():
+     """Show SPARKNET version."""
+     typer.echo("SPARKNET Document Intelligence v0.1.0")
+
+
+ @app.command()
+ def info():
+     """Show system information and configuration."""
+     import platform
+
+     typer.echo("SPARKNET Document Intelligence")
+     typer.echo("=" * 40)
+     typer.echo(f"Python: {platform.python_version()}")
+     typer.echo(f"Platform: {platform.system()} {platform.release()}")
+     typer.echo()
+
+     # Check component availability
+     typer.echo("Components:")
+
+     # OCR
+     try:
+         from paddleocr import PaddleOCR
+         typer.echo("  [✓] PaddleOCR")
+     except ImportError:
+         typer.echo("  [✗] PaddleOCR (install with: pip install paddleocr)")
+
+     try:
+         import pytesseract
+         typer.echo("  [✓] Tesseract")
+     except ImportError:
+         typer.echo("  [✗] Tesseract (install with: pip install pytesseract)")
+
+     # Vector Store
+     try:
+         import chromadb
+         typer.echo("  [✓] ChromaDB")
+     except ImportError:
+         typer.echo("  [✗] ChromaDB (install with: pip install chromadb)")
+
+     # Ollama
+     try:
+         import httpx
+         with httpx.Client(timeout=2.0) as client:
+             resp = client.get("http://localhost:11434/api/tags")
+         if resp.status_code == 200:
+             models = resp.json().get("models", [])
+             typer.echo(f"  [✓] Ollama ({len(models)} models)")
+         else:
+             typer.echo("  [✗] Ollama (not responding)")
+     except Exception:
+         typer.echo("  [✗] Ollama (not running)")
+
+
+ @app.callback()
+ def main_callback(
+     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
+     quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress output"),
+ ):
+     """SPARKNET Document Intelligence CLI."""
+     from loguru import logger
+     import sys
+
+     # Configure logging
+     logger.remove()
+     if verbose:
+         logger.add(sys.stderr, level="DEBUG")
+     elif not quiet:
+         logger.add(sys.stderr, level="INFO")
+
+
+ def main():
+     """Main entry point."""
+     app()
+
+
+ if __name__ == "__main__":
+     main()
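
Note (not part of this commit): since `app` is a standard Typer application, it can be exercised without installing a console script. A smoke-test sketch using Typer's bundled CliRunner; the expected output mirrors the `version` command above, and the import path assumes `src` is importable as a package:

    # Hypothetical smoke test for the CLI defined in src/cli/main.py.
    from typer.testing import CliRunner

    from src.cli.main import app

    runner = CliRunner()
    result = runner.invoke(app, ["version"])
    assert result.exit_code == 0
    assert "v0.1.0" in result.stdout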
src/cli/rag.py ADDED
@@ -0,0 +1,314 @@
+ """
+ RAG CLI Commands
+
+ Commands:
+     sparknet rag index <file>    - Index document for retrieval
+     sparknet rag search <query>  - Search indexed documents
+     sparknet rag ask <question>  - Answer question using RAG
+     sparknet rag status          - Show index status
+     sparknet rag delete <id>     - Delete document from index
+ """
+
+ import typer
+ from typing import Optional, List
+ from pathlib import Path
+ import json
+
+ # Create RAG sub-app
+ rag_app = typer.Typer(
+     name="rag",
+     help="RAG and retrieval commands",
+ )
+
+
+ @rag_app.command("index")
+ def index_document(
+     files: List[Path] = typer.Argument(..., help="Document file(s) to index"),
+     collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
+     embedding_model: str = typer.Option("nomic-embed-text", "--model", "-m", help="Embedding model"),
+ ):
+     """
+     Index document(s) for RAG retrieval.
+
+     Example:
+         sparknet rag index document.pdf
+         sparknet rag index *.pdf --collection contracts
+     """
+     # Validate files
+     valid_files = []
+     for f in files:
+         if f.exists():
+             valid_files.append(f)
+         else:
+             typer.echo(f"Warning: File not found, skipping: {f}", err=True)
+
+     if not valid_files:
+         typer.echo("Error: No valid files to index", err=True)
+         raise typer.Exit(1)
+
+     typer.echo(f"Indexing {len(valid_files)} document(s)...")
+
+     try:
+         from ..rag import (
+             VectorStoreConfig,
+             EmbeddingConfig,
+             get_document_indexer,
+         )
+
+         # Configure
+         store_config = VectorStoreConfig(collection_name=collection)
+         embed_config = EmbeddingConfig(ollama_model=embedding_model)
+
+         # Get indexer
+         indexer = get_document_indexer()
+
+         # Index documents
+         results = indexer.index_batch([str(f) for f in valid_files])
+
+         # Summary
+         successful = sum(1 for r in results if r.success)
+         total_chunks = sum(r.num_chunks_indexed for r in results)
+
+         typer.echo("\nIndexing complete:")
+         typer.echo(f"  Documents: {successful}/{len(results)} successful")
+         typer.echo(f"  Chunks indexed: {total_chunks}")
+
+         for r in results:
+             status = "✓" if r.success else "✗"
+             typer.echo(f"  [{status}] {r.source_path}: {r.num_chunks_indexed} chunks")
+             if r.error:
+                 typer.echo(f"      Error: {r.error}")
+
+     except ImportError as e:
+         typer.echo(f"Error: Missing dependency - {e}", err=True)
+         raise typer.Exit(1)
+     except Exception as e:
+         typer.echo(f"Error indexing documents: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ @rag_app.command("search")
+ def search_documents(
+     query: str = typer.Argument(..., help="Search query"),
+     top_k: int = typer.Option(5, "--top", "-k", help="Number of results"),
+     collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
+     document_id: Optional[str] = typer.Option(None, "--document", "-d", help="Filter by document ID"),
+     chunk_type: Optional[str] = typer.Option(None, "--type", "-t", help="Filter by chunk type"),
+     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
+ ):
+     """
+     Search indexed documents.
+
+     Example:
+         sparknet rag search "payment terms" --top 10
+         sparknet rag search "table data" --type table
+     """
+     typer.echo(f"Searching: {query}")
+
+     try:
+         from ..rag import get_document_retriever, RetrieverConfig
+
+         # Configure
+         config = RetrieverConfig(default_top_k=top_k)
+         retriever = get_document_retriever(config)
+
+         # Build filters
+         filters = {}
+         if document_id:
+             filters["document_id"] = document_id
+         if chunk_type:
+             filters["chunk_type"] = chunk_type
+
+         # Search
+         chunks = retriever.retrieve(query, top_k=top_k, filters=filters if filters else None)
+
+         if not chunks:
+             typer.echo("No results found.")
+             return
+
+         # Format output
+         output_data = {
+             "query": query,
+             "num_results": len(chunks),
+             "results": [
+                 {
+                     "chunk_id": c.chunk_id,
+                     "document_id": c.document_id,
+                     "page": c.page,
+                     "chunk_type": c.chunk_type,
+                     "similarity": c.similarity,
+                     "text": c.text[:500] + "..." if len(c.text) > 500 else c.text,
+                 }
+                 for c in chunks
+             ],
+         }
+
+         if output:
+             with open(output, "w") as f:
+                 json.dump(output_data, f, indent=2)
+             typer.echo(f"Results written to: {output}")
+         else:
+             typer.echo(f"\nFound {len(chunks)} results:\n")
+             for i, c in enumerate(chunks, 1):
+                 typer.echo(f"[{i}] Similarity: {c.similarity:.3f}")
+                 if c.page is not None:
+                     typer.echo(f"    Page: {c.page + 1}, Type: {c.chunk_type or 'text'}")
+                 typer.echo(f"    {c.text[:200]}...")
+                 typer.echo()
+
+     except Exception as e:
+         typer.echo(f"Error searching: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ @rag_app.command("ask")
+ def ask_question(
+     question: str = typer.Argument(..., help="Question to answer"),
+     top_k: int = typer.Option(5, "--top", "-k", help="Number of context chunks"),
+     collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
+     document_id: Optional[str] = typer.Option(None, "--document", "-d", help="Filter by document ID"),
+     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"),
+     show_evidence: bool = typer.Option(True, "--evidence/--no-evidence", help="Show evidence sources"),
+ ):
+     """
+     Answer a question using RAG.
+
+     Example:
+         sparknet rag ask "What are the payment terms?"
+         sparknet rag ask "What is the contract value?" --document contract123
+     """
+     typer.echo(f"Question: {question}")
+     typer.echo("Processing...")
+
+     try:
+         from ..rag import get_grounded_generator, GeneratorConfig
+
+         # Configure
+         config = GeneratorConfig()
+         generator = get_grounded_generator(config)
+
+         # Build filters
+         filters = {"document_id": document_id} if document_id else None
+
+         # Generate answer
+         result = generator.answer_question(question, top_k=top_k, filters=filters)
+
+         # Format output
+         output_data = {
+             "question": question,
+             "answer": result.answer,
+             "confidence": result.confidence,
+             "abstained": result.abstained,
+             "abstain_reason": result.abstain_reason,
+             "citations": [
+                 {
+                     "index": c.index,
+                     "page": c.page,
+                     "snippet": c.text_snippet,
+                     "confidence": c.confidence,
+                 }
+                 for c in result.citations
+             ],
+             "num_chunks_used": result.num_chunks_used,
+         }
+
+         if output:
+             with open(output, "w") as f:
+                 json.dump(output_data, f, indent=2)
+             typer.echo(f"Results written to: {output}")
+         else:
+             typer.echo(f"\nAnswer: {result.answer}")
+             typer.echo(f"\nConfidence: {result.confidence:.2f}")
+
+             if result.abstained:
+                 typer.echo(f"Note: {result.abstain_reason}")
+
+             if show_evidence and result.citations:
+                 typer.echo(f"\nSources ({len(result.citations)}):")
+                 for c in result.citations:
+                     page_info = f"Page {c.page + 1}" if c.page is not None else ""
+                     typer.echo(f"  [{c.index}] {page_info}: {c.text_snippet[:80]}...")
+
+     except Exception as e:
+         typer.echo(f"Error generating answer: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ @rag_app.command("status")
+ def show_status(
+     collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
+ ):
+     """
+     Show RAG index status.
+
+     Example:
+         sparknet rag status
+         sparknet rag status --collection contracts
+     """
+     typer.echo("RAG Index Status")
+     typer.echo("=" * 40)
+
+     try:
+         from ..rag import get_vector_store, VectorStoreConfig
+
+         config = VectorStoreConfig(collection_name=collection)
+         store = get_vector_store(config)
+
+         # Get stats
+         total_chunks = store.count()
+
+         typer.echo(f"Collection: {collection}")
+         typer.echo(f"Total chunks: {total_chunks}")
+
+         # List documents
+         if hasattr(store, 'list_documents'):
+             doc_ids = store.list_documents()
+             typer.echo(f"Documents indexed: {len(doc_ids)}")
+
+             if doc_ids:
+                 typer.echo("\nDocuments:")
+                 for doc_id in doc_ids[:10]:
+                     chunk_count = store.count(doc_id)
+                     typer.echo(f"  - {doc_id}: {chunk_count} chunks")
+
+                 if len(doc_ids) > 10:
+                     typer.echo(f"  ... and {len(doc_ids) - 10} more")
+
+     except Exception as e:
+         typer.echo(f"Error getting status: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ @rag_app.command("delete")
+ def delete_document(
+     document_id: str = typer.Argument(..., help="Document ID to delete"),
+     collection: str = typer.Option("sparknet_documents", "--collection", "-c", help="Collection name"),
+     force: bool = typer.Option(False, "--force", "-f", help="Skip confirmation"),
+ ):
+     """
+     Delete a document from the index.
+
+     Example:
+         sparknet rag delete doc123
+         sparknet rag delete doc123 --force
+     """
+     if not force:
+         confirm = typer.confirm(f"Delete document '{document_id}' from index?")
+         if not confirm:
+             typer.echo("Cancelled.")
+             return
+
+     try:
+         from ..rag import get_vector_store, VectorStoreConfig
+
+         config = VectorStoreConfig(collection_name=collection)
+         store = get_vector_store(config)
+
+         deleted = store.delete_document(document_id)
+         typer.echo(f"Deleted {deleted} chunks for document: {document_id}")
+
+     except Exception as e:
+         typer.echo(f"Error deleting document: {e}", err=True)
+         raise typer.Exit(1)
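
Note (not part of this commit): the `rag ask` command above delegates to a grounded generator. A sketch of the same call made programmatically, using only the API the command itself uses; the question and document ID are illustrative:

    # Hypothetical programmatic equivalent of `sparknet rag ask`.
    from src.rag import get_grounded_generator, GeneratorConfig

    generator = get_grounded_generator(GeneratorConfig())
    result = generator.answer_question(
        "What are the payment terms?",
        top_k=5,
        filters={"document_id": "contract123"},  # optional narrowing, as in the CLI
    )
    if result.abstained:
        print(f"Abstained: {result.abstain_reason}")
    else:
        print(f"{result.answer} (confidence {result.confidence:.2f})")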
src/document/__init__.py ADDED
@@ -0,0 +1,75 @@
+ """
+ SPARKNET Document Intelligence Subsystem
+
+ A comprehensive document processing pipeline for:
+ - OCR with PaddleOCR and Tesseract
+ - Layout detection and reading order reconstruction
+ - Semantic chunking with grounding evidence
+ - Document classification and field extraction
+ - Extraction validation with Critic/Verifier
+
+ Principles:
+ - Processing is not understanding: OCR alone is insufficient
+ - Every extraction includes evidence pointers (bbox, page, chunk_id)
+ - Modular, pluggable components with clean interfaces
+ - Abstain with evidence when confidence is low
+ """
+
+ from .schemas.core import (
+     BoundingBox,
+     OCRRegion,
+     LayoutRegion,
+     LayoutType,
+     DocumentChunk,
+     ChunkType,
+     EvidenceRef,
+     ExtractionResult,
+     DocumentMetadata,
+     ProcessedDocument,
+ )
+
+ from .pipeline import (
+     PipelineConfig,
+     DocumentProcessor,
+     get_document_processor,
+     process_document,
+ )
+
+ from .validation import (
+     CriticConfig,
+     ValidationResult,
+     ExtractionCritic,
+     get_extraction_critic,
+     VerifierConfig,
+     VerificationResult,
+     EvidenceVerifier,
+     get_evidence_verifier,
+ )
+
+ __all__ = [
+     # Core schemas
+     "BoundingBox",
+     "OCRRegion",
+     "LayoutRegion",
+     "LayoutType",
+     "DocumentChunk",
+     "ChunkType",
+     "EvidenceRef",
+     "ExtractionResult",
+     "DocumentMetadata",
+     "ProcessedDocument",
+     # Pipeline
+     "PipelineConfig",
+     "DocumentProcessor",
+     "get_document_processor",
+     "process_document",
+     # Validation
+     "CriticConfig",
+     "ValidationResult",
+     "ExtractionCritic",
+     "get_extraction_critic",
+     "VerifierConfig",
+     "VerificationResult",
+     "EvidenceVerifier",
+     "get_evidence_verifier",
+ ]
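
Note (not part of this commit): the package docstring promises evidence pointers on every extraction. A usage sketch built from the exports above; the exact shape of `process_document`'s return value is an assumption based on the ProcessedDocument fields used by the CLI commands in this commit:

    # Hypothetical one-call usage of the document pipeline.
    from src.document import process_document

    doc = process_document("invoice.pdf")
    for chunk in doc.chunks:
        # Each chunk carries grounding evidence: page number plus bounding box.
        print(chunk.page, chunk.chunk_type.value, chunk.bbox, chunk.text[:60])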
src/document/chunking/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Document Chunking Module
+
+ Creates semantic chunks from document content for retrieval and processing.
+ """
+
+ from .chunker import (
+     ChunkerConfig,
+     DocumentChunker,
+     SemanticChunker,
+     get_document_chunker,
+ )
+
+ __all__ = [
+     "ChunkerConfig",
+     "DocumentChunker",
+     "SemanticChunker",
+     "get_document_chunker",
+ ]
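
Note (not part of this commit): a configuration sketch for the chunker exported here, using ChunkerConfig fields defined in chunker.py below. The `get_document_chunker(config)` call signature is an assumption, since that factory is defined past the portion of the file shown in this diff:

    # Hypothetical: configure table-aware semantic chunking.
    from src.document.chunking import ChunkerConfig, get_document_chunker

    config = ChunkerConfig(
        max_chunk_chars=800,   # split regions longer than this
        overlap_chars=80,      # characters carried over between split chunks
        preserve_table_structure=True,  # FG-002: tables kept as markdown + cells
    )
    chunker = get_document_chunker(config)
    # chunks = chunker.create_chunks(ocr_regions, layout_regions, document_id="doc1")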
src/document/chunking/chunker.py ADDED
@@ -0,0 +1,944 @@
1
+ """
2
+ Document Chunker Implementation
3
+
4
+ Creates semantic chunks from document content with bounding box tracking.
5
+ Includes TableAwareChunker for preserving table structure in markdown format.
6
+ """
7
+
8
+ import uuid
9
+ import time
10
+ import re
11
+ from typing import List, Optional, Dict, Any, Tuple
12
+ from dataclasses import dataclass
13
+ from pydantic import BaseModel, Field
14
+ from loguru import logger
15
+ from collections import defaultdict
16
+
17
+ from ..schemas.core import (
18
+ BoundingBox,
19
+ DocumentChunk,
20
+ ChunkType,
21
+ LayoutRegion,
22
+ LayoutType,
23
+ OCRRegion,
24
+ )
25
+
26
+
27
+ class ChunkerConfig(BaseModel):
28
+ """Configuration for document chunking."""
29
+ # Chunk size limits
30
+ max_chunk_chars: int = Field(
31
+ default=1000,
32
+ ge=100,
33
+ description="Maximum characters per chunk"
34
+ )
35
+ min_chunk_chars: int = Field(
36
+ default=50,
37
+ ge=10,
38
+ description="Minimum characters per chunk"
39
+ )
40
+ overlap_chars: int = Field(
41
+ default=100,
42
+ ge=0,
43
+ description="Character overlap between chunks"
44
+ )
45
+
46
+ # Chunking strategy
47
+ strategy: str = Field(
48
+ default="semantic",
49
+ description="Chunking strategy: semantic, fixed, or layout"
50
+ )
51
+ respect_layout: bool = Field(
52
+ default=True,
53
+ description="Respect layout region boundaries"
54
+ )
55
+ merge_small_regions: bool = Field(
56
+ default=True,
57
+ description="Merge small adjacent regions"
58
+ )
59
+
60
+ # Special element handling
61
+ chunk_tables: bool = Field(
62
+ default=True,
63
+ description="Create separate chunks for tables"
64
+ )
65
+ chunk_figures: bool = Field(
66
+ default=True,
67
+ description="Create separate chunks for figures"
68
+ )
69
+ include_captions: bool = Field(
70
+ default=True,
71
+ description="Include captions with figures/tables"
72
+ )
73
+
74
+ # Sentence handling
75
+ split_on_sentences: bool = Field(
76
+ default=True,
77
+ description="Split on sentence boundaries when possible"
78
+ )
79
+
80
+ # Table-aware chunking (FG-002)
81
+ preserve_table_structure: bool = Field(
82
+ default=True,
83
+ description="Preserve table structure as markdown with structured data"
84
+ )
85
+ table_row_threshold: float = Field(
86
+ default=10.0,
87
+ description="Y-coordinate threshold for grouping cells into rows"
88
+ )
89
+ table_col_threshold: float = Field(
90
+ default=20.0,
91
+ description="X-coordinate threshold for grouping cells into columns"
92
+ )
93
+ detect_table_headers: bool = Field(
94
+ default=True,
95
+ description="Attempt to detect and mark header rows"
96
+ )
97
+
98
+
99
+ # Map layout types to chunk types
100
+ LAYOUT_TO_CHUNK_TYPE = {
101
+ LayoutType.TEXT: ChunkType.TEXT,
102
+ LayoutType.TITLE: ChunkType.TITLE,
103
+ LayoutType.HEADING: ChunkType.HEADING,
104
+ LayoutType.PARAGRAPH: ChunkType.PARAGRAPH,
105
+ LayoutType.LIST: ChunkType.LIST_ITEM,
106
+ LayoutType.TABLE: ChunkType.TABLE,
107
+ LayoutType.FIGURE: ChunkType.FIGURE,
108
+ LayoutType.CHART: ChunkType.CHART,
109
+ LayoutType.FORMULA: ChunkType.FORMULA,
110
+ LayoutType.CAPTION: ChunkType.CAPTION,
111
+ LayoutType.FOOTNOTE: ChunkType.FOOTNOTE,
112
+ LayoutType.HEADER: ChunkType.HEADER,
113
+ LayoutType.FOOTER: ChunkType.FOOTER,
114
+ }
115
+
116
+
117
+ class DocumentChunker:
118
+ """Base class for document chunkers."""
119
+
120
+ def __init__(self, config: Optional[ChunkerConfig] = None):
121
+ self.config = config or ChunkerConfig()
122
+
123
+ def create_chunks(
124
+ self,
125
+ ocr_regions: List[OCRRegion],
126
+ layout_regions: Optional[List[LayoutRegion]] = None,
127
+ document_id: str = "",
128
+ source_path: Optional[str] = None,
129
+ ) -> List[DocumentChunk]:
130
+ """
131
+ Create chunks from OCR and layout regions.
132
+
133
+ Args:
134
+ ocr_regions: OCR text regions
135
+ layout_regions: Optional layout regions
136
+ document_id: Parent document ID
137
+ source_path: Source file path
138
+
139
+ Returns:
140
+ List of DocumentChunk
141
+ """
142
+ raise NotImplementedError
143
+
144
+
145
+ class SemanticChunker(DocumentChunker):
146
+ """
147
+ Semantic chunker that respects document structure.
148
+
149
+ Creates chunks based on:
150
+ - Layout region boundaries
151
+ - Semantic coherence (paragraphs, sections)
152
+ - Size constraints with overlap
153
+ """
154
+
155
+ def create_chunks(
156
+ self,
157
+ ocr_regions: List[OCRRegion],
158
+ layout_regions: Optional[List[LayoutRegion]] = None,
159
+ document_id: str = "",
160
+ source_path: Optional[str] = None,
161
+ ) -> List[DocumentChunk]:
162
+ """Create semantic chunks from document content."""
163
+ if not ocr_regions:
164
+ return []
165
+
166
+ start_time = time.time()
167
+ chunks = []
168
+ chunk_index = 0
169
+
170
+ if layout_regions and self.config.respect_layout:
171
+ # Use layout regions to guide chunking
172
+ chunks = self._chunk_by_layout(
173
+ ocr_regions, layout_regions, document_id, source_path
174
+ )
175
+ else:
176
+ # Fall back to text-based chunking
177
+ chunks = self._chunk_by_text(
178
+ ocr_regions, document_id, source_path
179
+ )
180
+
181
+ # Assign sequence indices
182
+ for i, chunk in enumerate(chunks):
183
+ chunk.sequence_index = i
184
+
185
+ logger.debug(
186
+ f"Created {len(chunks)} chunks in "
187
+ f"{(time.time() - start_time) * 1000:.1f}ms"
188
+ )
189
+
190
+ return chunks
191
+
192
+ def _chunk_by_layout(
193
+ self,
194
+ ocr_regions: List[OCRRegion],
195
+ layout_regions: List[LayoutRegion],
196
+ document_id: str,
197
+ source_path: Optional[str],
198
+ ) -> List[DocumentChunk]:
199
+ """Create chunks based on layout regions."""
200
+ chunks = []
201
+
202
+ # Sort layout regions by reading order
203
+ sorted_layouts = sorted(
204
+ layout_regions,
205
+ key=lambda r: (r.reading_order or 0, r.bbox.y_min, r.bbox.x_min)
206
+ )
207
+
208
+ for layout in sorted_layouts:
209
+ # Get OCR regions within this layout region
210
+ contained_ocr = self._get_contained_ocr(ocr_regions, layout)
211
+
212
+ if not contained_ocr:
213
+ continue
214
+
215
+ # Determine chunk type
216
+ chunk_type = LAYOUT_TO_CHUNK_TYPE.get(layout.type, ChunkType.TEXT)
217
+
218
+ # Handle special types differently
219
+ if layout.type == LayoutType.TABLE and self.config.chunk_tables:
220
+ chunk = self._create_table_chunk(
221
+ contained_ocr, layout, document_id, source_path
222
+ )
223
+ chunks.append(chunk)
224
+
225
+ elif layout.type in (LayoutType.FIGURE, LayoutType.CHART) and self.config.chunk_figures:
226
+ chunk = self._create_figure_chunk(
227
+ contained_ocr, layout, document_id, source_path
228
+ )
229
+ chunks.append(chunk)
230
+
231
+ else:
232
+ # Regular text chunk - may need splitting
233
+ text_chunks = self._create_text_chunks(
234
+ contained_ocr, layout, chunk_type, document_id, source_path
235
+ )
236
+ chunks.extend(text_chunks)
237
+
238
+ return chunks
239
+
240
+ def _chunk_by_text(
241
+ self,
242
+ ocr_regions: List[OCRRegion],
243
+ document_id: str,
244
+ source_path: Optional[str],
245
+ ) -> List[DocumentChunk]:
246
+ """Create chunks from text without layout guidance."""
247
+ chunks = []
248
+
249
+ # Sort by reading order (y then x)
250
+ sorted_regions = sorted(
251
+ ocr_regions,
252
+ key=lambda r: (r.page, r.bbox.y_min, r.bbox.x_min)
253
+ )
254
+
255
+ # Group by page
256
+ pages: Dict[int, List[OCRRegion]] = {}
257
+ for r in sorted_regions:
258
+ if r.page not in pages:
259
+ pages[r.page] = []
260
+ pages[r.page].append(r)
261
+
262
+ # Process each page
263
+ for page_num in sorted(pages.keys()):
264
+ page_regions = pages[page_num]
265
+ page_chunks = self._split_text_regions(
266
+ page_regions, document_id, source_path, page_num
267
+ )
268
+ chunks.extend(page_chunks)
269
+
270
+ return chunks
271
+
272
+ def _get_contained_ocr(
273
+ self,
274
+ ocr_regions: List[OCRRegion],
275
+ layout: LayoutRegion,
276
+ ) -> List[OCRRegion]:
277
+ """Get OCR regions contained within a layout region."""
278
+ contained = []
279
+ for ocr in ocr_regions:
280
+ if ocr.page == layout.page:
281
+ # Check if OCR region overlaps significantly with layout
282
+ iou = layout.bbox.iou(ocr.bbox)
283
+ if iou > 0.3 or layout.bbox.contains(ocr.bbox):
284
+ contained.append(ocr)
285
+ return contained
286
+
287
+ def _create_text_chunks(
288
+ self,
289
+ ocr_regions: List[OCRRegion],
290
+ layout: LayoutRegion,
291
+ chunk_type: ChunkType,
292
+ document_id: str,
293
+ source_path: Optional[str],
294
+ ) -> List[DocumentChunk]:
295
+ """Create text chunks from OCR regions, splitting if needed."""
296
+ chunks = []
297
+
298
+ # Combine text
299
+ text = " ".join(r.text for r in ocr_regions)
300
+
301
+ # Calculate average confidence
302
+ avg_conf = sum(r.confidence for r in ocr_regions) / len(ocr_regions)
303
+
304
+ # Check if splitting is needed
305
+ if len(text) <= self.config.max_chunk_chars:
306
+ # Single chunk
307
+ chunk = DocumentChunk(
308
+ chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
309
+ chunk_type=chunk_type,
310
+ text=text,
311
+ bbox=layout.bbox,
312
+ page=layout.page,
313
+ document_id=document_id,
314
+ source_path=source_path,
315
+ sequence_index=0,
316
+ confidence=avg_conf,
317
+ )
318
+ chunks.append(chunk)
319
+ else:
320
+ # Split into multiple chunks
321
+ split_chunks = self._split_text(
322
+ text, layout.bbox, layout.page, chunk_type,
323
+ document_id, source_path, avg_conf
324
+ )
325
+ chunks.extend(split_chunks)
326
+
327
+ return chunks
328
+
329
+     def _split_text(
+         self,
+         text: str,
+         bbox: BoundingBox,
+         page: int,
+         chunk_type: ChunkType,
+         document_id: str,
+         source_path: Optional[str],
+         confidence: float,
+     ) -> List[DocumentChunk]:
+         """Split long text into multiple chunks with overlap."""
+         chunks = []
+         max_chars = self.config.max_chunk_chars
+         overlap = self.config.overlap_chars
+
+         # Split on sentences if enabled
+         if self.config.split_on_sentences:
+             sentences = self._split_sentences(text)
+         else:
+             sentences = [text]
+
+         current_text = ""
+         for sentence in sentences:
+             if len(current_text) + len(sentence) > max_chars and current_text:
+                 # Create chunk
+                 chunk = DocumentChunk(
+                     chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
+                     chunk_type=chunk_type,
+                     text=current_text.strip(),
+                     bbox=bbox,
+                     page=page,
+                     document_id=document_id,
+                     source_path=source_path,
+                     sequence_index=len(chunks),
+                     confidence=confidence,
+                 )
+                 chunks.append(chunk)
+
+                 # Start new chunk with overlap
+                 if overlap > 0:
+                     overlap_text = current_text[-overlap:] if len(current_text) > overlap else current_text
+                     current_text = overlap_text + " " + sentence
+                 else:
+                     current_text = sentence
+             else:
+                 current_text += (" " + sentence) if current_text else sentence
+
+         # Don't forget the last chunk
+         if current_text.strip():
+             chunk = DocumentChunk(
+                 chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
+                 chunk_type=chunk_type,
+                 text=current_text.strip(),
+                 bbox=bbox,
+                 page=page,
+                 document_id=document_id,
+                 source_path=source_path,
+                 sequence_index=len(chunks),
+                 confidence=confidence,
+             )
+             chunks.append(chunk)
+
+         return chunks
+
+     def _split_sentences(self, text: str) -> List[str]:
+         """Split text into sentences."""
+         # Simple regex-based sentence splitting
+         import re
+         sentences = re.split(r'(?<=[.!?])\s+', text)
+         return [s.strip() for s in sentences if s.strip()]
+
+     def _create_table_chunk(
+         self,
+         ocr_regions: List[OCRRegion],
+         layout: LayoutRegion,
+         document_id: str,
+         source_path: Optional[str],
+     ) -> DocumentChunk:
+         """
+         Create a chunk for table content with structure preservation.
+
+         Enhanced table handling (FG-002):
+         - Reconstructs table structure from OCR regions
+         - Generates markdown table representation
+         - Stores structured data for SQL-like queries
+         - Detects and marks header rows
+         """
+         if not ocr_regions:
+             return DocumentChunk(
+                 chunk_id=f"{document_id}_table_{uuid.uuid4().hex[:8]}",
+                 chunk_type=ChunkType.TABLE,
+                 text="[Empty Table]",
+                 bbox=layout.bbox,
+                 page=layout.page,
+                 document_id=document_id,
+                 source_path=source_path,
+                 sequence_index=0,
+                 confidence=0.0,
+                 extra=layout.extra or {},
+             )
+
+         avg_conf = sum(r.confidence for r in ocr_regions) / len(ocr_regions)
+
+         # Check if we should preserve table structure
+         if not self.config.preserve_table_structure:
+             # Fall back to simple pipe-separated format
+             text = " | ".join(r.text for r in ocr_regions)
+             return DocumentChunk(
+                 chunk_id=f"{document_id}_table_{uuid.uuid4().hex[:8]}",
+                 chunk_type=ChunkType.TABLE,
+                 text=text,
+                 bbox=layout.bbox,
+                 page=layout.page,
+                 document_id=document_id,
+                 source_path=source_path,
+                 sequence_index=0,
+                 confidence=avg_conf,
+                 extra=layout.extra or {},
+             )
+
+         # Reconstruct table structure from spatial positions
+         table_data = self._reconstruct_table_structure(ocr_regions)
+
+         # Generate markdown representation
+         markdown_table = self._table_to_markdown(
+             table_data["rows"],
+             table_data["headers"],
+             table_data["has_header"]
+         )
+
+         # Create rich metadata for structured queries
+         table_extra = {
+             **(layout.extra or {}),
+             "table_structure": {
+                 "row_count": table_data["row_count"],
+                 "col_count": table_data["col_count"],
+                 "has_header": table_data["has_header"],
+                 "headers": table_data["headers"],
+                 "cells": table_data["cells"],  # 2D list of cell values
+                 "cell_positions": table_data["cell_positions"],  # For highlighting
+             },
+             "format": "markdown",
+             "searchable_text": table_data["searchable_text"],
+         }
+
+         return DocumentChunk(
+             chunk_id=f"{document_id}_table_{uuid.uuid4().hex[:8]}",
+             chunk_type=ChunkType.TABLE,
+             text=markdown_table,
+             bbox=layout.bbox,
+             page=layout.page,
+             document_id=document_id,
+             source_path=source_path,
+             sequence_index=0,
+             confidence=avg_conf,
+             extra=table_extra,
+         )
+
+     def _reconstruct_table_structure(
+         self,
+         ocr_regions: List[OCRRegion],
+     ) -> Dict[str, Any]:
+         """
+         Reconstruct table structure from OCR regions based on spatial positions.
+
+         Groups OCR regions into rows and columns by analyzing their bounding boxes.
+         Returns structured table data for markdown generation and queries.
+         """
+         if not ocr_regions:
+             return {
+                 "rows": [],
+                 "headers": [],
+                 "has_header": False,
+                 "row_count": 0,
+                 "col_count": 0,
+                 "cells": [],
+                 "cell_positions": [],
+                 "searchable_text": "",
+             }
+
+         # Sort regions by vertical position (y_min) then horizontal (x_min)
+         sorted_regions = sorted(
+             ocr_regions,
+             key=lambda r: (r.bbox.y_min, r.bbox.x_min)
+         )
+
+         # Group into rows based on y-coordinate proximity
+         row_threshold = self.config.table_row_threshold
+         rows: List[List[OCRRegion]] = []
+         current_row: List[OCRRegion] = []
+         current_y = None
+
+         for region in sorted_regions:
+             if current_y is None:
+                 current_y = region.bbox.y_min
+                 current_row.append(region)
+             elif abs(region.bbox.y_min - current_y) <= row_threshold:
+                 current_row.append(region)
+             else:
+                 if current_row:
+                     # Sort row by x position
+                     current_row.sort(key=lambda r: r.bbox.x_min)
+                     rows.append(current_row)
+                 current_row = [region]
+                 current_y = region.bbox.y_min
+
+         # Don't forget the last row
+         if current_row:
+             current_row.sort(key=lambda r: r.bbox.x_min)
+             rows.append(current_row)
+
+         # Determine column structure:
+         # find consistent column boundaries across all rows
+         col_positions = self._detect_column_positions(rows)
+         num_cols = len(col_positions) if col_positions else max(len(row) for row in rows)
+
+         # Build structured cell data
+         cells: List[List[str]] = []
+         cell_positions: List[List[Dict[str, Any]]] = []
+
+         for row in rows:
+             row_cells = self._assign_cells_to_columns(row, col_positions, num_cols)
+             cells.append([cell["text"] for cell in row_cells])
+             cell_positions.append([{
+                 "text": cell["text"],
+                 "bbox": cell["bbox"],
+                 "confidence": cell["confidence"]
+             } for cell in row_cells])
+
+         # Detect header row
+         has_header = False
+         headers: List[str] = []
+
+         if self.config.detect_table_headers and len(cells) > 0:
+             has_header, headers = self._detect_header_row(cells, rows)
+
+         # Build searchable text (for vector embedding)
+         searchable_parts = []
+         for i, row in enumerate(cells):
+             if has_header and i == 0:
+                 searchable_parts.append("Headers: " + ", ".join(row))
+             else:
+                 if has_header and headers:
+                     # Include header context for each value
+                     for j, cell in enumerate(row):
+                         if j < len(headers) and headers[j]:
+                             searchable_parts.append(f"{headers[j]}: {cell}")
+                         else:
+                             searchable_parts.append(cell)
+                 else:
+                     searchable_parts.extend(row)
+
+         return {
+             "rows": cells,
+             "headers": headers,
+             "has_header": has_header,
+             "row_count": len(cells),
+             "col_count": num_cols,
+             "cells": cells,
+             "cell_positions": cell_positions,
+             "searchable_text": " | ".join(searchable_parts),
+         }
+
+     def _detect_column_positions(
+         self,
+         rows: List[List[OCRRegion]],
+     ) -> List[Tuple[float, float]]:
+         """
+         Detect consistent column boundaries from table rows.
+
+         Returns list of (x_start, x_end) tuples for each column.
+         """
+         if not rows:
+             return []
+
+         col_threshold = self.config.table_col_threshold
+
+         # Collect all x positions
+         all_x_starts = []
+         for row in rows:
+             for region in row:
+                 all_x_starts.append(region.bbox.x_min)
+
+         if not all_x_starts:
+             return []
+
+         # Cluster x positions into columns
+         all_x_starts.sort()
+         columns = []
+         current_col_regions = [all_x_starts[0]]
+
+         for x in all_x_starts[1:]:
+             if x - current_col_regions[-1] <= col_threshold:
+                 current_col_regions.append(x)
+             else:
+                 # Close the current cluster at its mean x position
+                 col_center = sum(current_col_regions) / len(current_col_regions)
+                 columns.append(col_center)
+                 current_col_regions = [x]
+
+         # Last column
+         if current_col_regions:
+             col_center = sum(current_col_regions) / len(current_col_regions)
+             columns.append(col_center)
+
+         # Convert to column ranges
+         col_ranges = []
+         for i, col_x in enumerate(columns):
+             x_start = col_x - col_threshold
+             if i < len(columns) - 1:
+                 x_end = (col_x + columns[i + 1]) / 2
+             else:
+                 x_end = col_x + col_threshold * 3  # Extend last column
+             col_ranges.append((x_start, x_end))
+
+         return col_ranges
+
+     def _assign_cells_to_columns(
+         self,
+         row_regions: List[OCRRegion],
+         col_positions: List[Tuple[float, float]],
+         num_cols: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Assign OCR regions in a row to their respective columns.
+         Handles merged cells and missing cells.
+         """
+         # Initialize empty cells for each column
+         row_cells = [
+             {"text": "", "bbox": None, "confidence": 0.0}
+             for _ in range(num_cols)
+         ]
+
+         if not col_positions:
+             # No column positions detected, just use order
+             for i, region in enumerate(row_regions):
+                 if i < num_cols:
+                     row_cells[i] = {
+                         "text": region.text.strip(),
+                         "bbox": region.bbox.to_xyxy(),
+                         "confidence": region.confidence,
+                     }
+             return row_cells
+
+         # Assign regions to columns based on x position
+         for region in row_regions:
+             region_x = region.bbox.x_min
+             assigned = False
+
+             for col_idx, (x_start, x_end) in enumerate(col_positions):
+                 if x_start <= region_x <= x_end:
+                     # Append to existing cell (handle multi-line cells)
+                     if row_cells[col_idx]["text"]:
+                         row_cells[col_idx]["text"] += " " + region.text.strip()
+                     else:
+                         row_cells[col_idx]["text"] = region.text.strip()
+                     row_cells[col_idx]["bbox"] = region.bbox.to_xyxy()
+                     row_cells[col_idx]["confidence"] = max(
+                         row_cells[col_idx]["confidence"],
+                         region.confidence
+                     )
+                     assigned = True
+                     break
+
+             # If not assigned, put in nearest column
+             if not assigned:
+                 min_dist = float("inf")
+                 nearest_col = 0
+                 for col_idx, (x_start, x_end) in enumerate(col_positions):
+                     col_center = (x_start + x_end) / 2
+                     dist = abs(region_x - col_center)
+                     if dist < min_dist:
+                         min_dist = dist
+                         nearest_col = col_idx
+
+                 if row_cells[nearest_col]["text"]:
+                     row_cells[nearest_col]["text"] += " " + region.text.strip()
+                 else:
+                     row_cells[nearest_col]["text"] = region.text.strip()
+                 row_cells[nearest_col]["bbox"] = region.bbox.to_xyxy()
+                 row_cells[nearest_col]["confidence"] = region.confidence
+
+         return row_cells
+
+     def _detect_header_row(
+         self,
+         cells: List[List[str]],
+         rows: List[List[OCRRegion]],
+     ) -> Tuple[bool, List[str]]:
+         """
+         Detect if the first row is a header row.
+
+         Heuristics used:
+         - First row contains non-numeric text
+         - First row text is shorter (labels vs data)
+         - First row has distinct formatting (if available)
+         """
+         if not cells or len(cells) < 2:
+             return False, []
+
+         first_row = cells[0]
+         other_rows = cells[1:]
+
+         # Check if first row is mostly non-numeric
+         first_row_numeric_count = sum(
+             1 for cell in first_row
+             if cell and self._is_numeric(cell)
+         )
+         first_row_text_ratio = (len(first_row) - first_row_numeric_count) / max(len(first_row), 1)
+
+         # Check if other rows are more numeric
+         other_numeric_ratios = []
+         for row in other_rows:
+             if row:
+                 numeric_count = sum(1 for cell in row if cell and self._is_numeric(cell))
+                 other_numeric_ratios.append(numeric_count / max(len(row), 1))
+
+         avg_other_numeric = sum(other_numeric_ratios) / max(len(other_numeric_ratios), 1)
+
+         # Header detection: first row is text-heavy, others are more numeric
+         is_header = (
+             first_row_text_ratio > 0.5 and
+             (avg_other_numeric > first_row_text_ratio * 0.5 or first_row_text_ratio > 0.8)
+         )
+
+         # Also consider: shorter cell lengths in first row (labels are usually shorter)
+         first_row_avg_len = sum(len(cell) for cell in first_row) / max(len(first_row), 1)
+         other_avg_lens = [
+             sum(len(cell) for cell in row) / max(len(row), 1)
+             for row in other_rows
+         ]
+         avg_other_len = sum(other_avg_lens) / max(len(other_avg_lens), 1)
+
+         if first_row_avg_len < avg_other_len * 0.8:
+             is_header = True
+
+         return is_header, first_row if is_header else []
+
+     def _is_numeric(self, text: str) -> bool:
+         """Check if text is primarily numeric (including currency, percentages)."""
+         import re  # local import, as in _split_sentences above
+         cleaned = re.sub(r'[$€£¥%,.\s\-+()]', '', text)
+         return cleaned.isdigit() if cleaned else False
+
+     def _table_to_markdown(
+         self,
+         rows: List[List[str]],
+         headers: List[str],
+         has_header: bool,
+     ) -> str:
+         """
+         Convert table data to markdown format.
+
+         Creates a properly formatted markdown table with:
+         - Header row (if detected)
+         - Separator row
+         - Data rows
+         """
+         if not rows:
+             return "[Empty Table]"
+
+         # Determine column count
+         num_cols = max(len(row) for row in rows) if rows else 0
+         if num_cols == 0:
+             return "[Empty Table]"
+
+         # Normalize all rows to the same column count
+         normalized_rows = []
+         for row in rows:
+             normalized = row + [""] * (num_cols - len(row))
+             normalized_rows.append(normalized)
+
+         # Build markdown lines
+         md_lines = []
+
+         if has_header and headers:
+             # Use detected headers
+             header_line = "| " + " | ".join(headers + [""] * (num_cols - len(headers))) + " |"
+             separator = "| " + " | ".join(["---"] * num_cols) + " |"
+             md_lines.append(header_line)
+             md_lines.append(separator)
+             data_rows = normalized_rows[1:]
+         else:
+             # No header - create generic headers
+             generic_headers = [f"Col{i+1}" for i in range(num_cols)]
+             header_line = "| " + " | ".join(generic_headers) + " |"
+             separator = "| " + " | ".join(["---"] * num_cols) + " |"
+             md_lines.append(header_line)
+             md_lines.append(separator)
+             data_rows = normalized_rows
+
+         # Add data rows
+         for row in data_rows:
+             # Escape pipe characters in cell content
+             escaped_row = [cell.replace("|", "\\|") for cell in row]
+             row_line = "| " + " | ".join(escaped_row) + " |"
+             md_lines.append(row_line)
+
+         return "\n".join(md_lines)
+
+     def _create_figure_chunk(
+         self,
+         ocr_regions: List[OCRRegion],
+         layout: LayoutRegion,
+         document_id: str,
+         source_path: Optional[str],
+     ) -> DocumentChunk:
+         """Create a chunk for figure/chart content."""
+         # For figures, the OCR text is usually the caption
+         text = " ".join(r.text for r in ocr_regions) if ocr_regions else "[Figure]"
+         avg_conf = sum(r.confidence for r in ocr_regions) / len(ocr_regions) if ocr_regions else 0.5
+
+         chunk_type = ChunkType.CHART if layout.type == LayoutType.CHART else ChunkType.FIGURE
+
+         return DocumentChunk(
+             chunk_id=f"{document_id}_{chunk_type.value}_{uuid.uuid4().hex[:8]}",
+             chunk_type=chunk_type,
+             text=text,
+             bbox=layout.bbox,
+             page=layout.page,
+             document_id=document_id,
+             source_path=source_path,
+             sequence_index=0,
+             confidence=avg_conf,
+             caption=text if ocr_regions else None,
+         )
+
+     def _split_text_regions(
+         self,
+         ocr_regions: List[OCRRegion],
+         document_id: str,
+         source_path: Optional[str],
+         page_num: int,
+     ) -> List[DocumentChunk]:
+         """Split OCR regions into chunks without layout guidance."""
+         if not ocr_regions:
+             return []
+
+         chunks = []
+         current_text = ""
+         current_regions = []
+
+         for region in ocr_regions:
+             if len(current_text) + len(region.text) > self.config.max_chunk_chars:
+                 if current_regions:
+                     # Create chunk from accumulated regions
+                     chunk = self._create_chunk_from_regions(
+                         current_regions, document_id, source_path, page_num, len(chunks)
+                     )
+                     chunks.append(chunk)
+
+                 current_text = region.text
+                 current_regions = [region]
+             else:
+                 # Join with a space, avoiding a stray leading space on the first region
+                 current_text = f"{current_text} {region.text}" if current_text else region.text
+                 current_regions.append(region)
+
+         # Final chunk
+         if current_regions:
+             chunk = self._create_chunk_from_regions(
+                 current_regions, document_id, source_path, page_num, len(chunks)
+             )
+             chunks.append(chunk)
+
+         return chunks
+
+     def _create_chunk_from_regions(
+         self,
+         regions: List[OCRRegion],
+         document_id: str,
+         source_path: Optional[str],
+         page_num: int,
+         sequence_index: int,
+     ) -> DocumentChunk:
+         """Create a chunk from a list of OCR regions."""
+         text = " ".join(r.text for r in regions)
+         avg_conf = sum(r.confidence for r in regions) / len(regions)
+
+         # Compute the union bounding box of all regions
+         x_min = min(r.bbox.x_min for r in regions)
+         y_min = min(r.bbox.y_min for r in regions)
+         x_max = max(r.bbox.x_max for r in regions)
+         y_max = max(r.bbox.y_max for r in regions)
+
+         bbox = BoundingBox(
+             x_min=x_min, y_min=y_min,
+             x_max=x_max, y_max=y_max,
+             normalized=False,
+         )
+
+         return DocumentChunk(
+             chunk_id=f"{document_id}_{uuid.uuid4().hex[:8]}",
+             chunk_type=ChunkType.TEXT,
+             text=text,
+             bbox=bbox,
+             page=page_num,
+             document_id=document_id,
+             source_path=source_path,
+             sequence_index=sequence_index,
+             confidence=avg_conf,
+         )
+
+
+ # Factory
+ _document_chunker: Optional[DocumentChunker] = None
+
+
+ def get_document_chunker(
+     config: Optional[ChunkerConfig] = None,
+ ) -> DocumentChunker:
+     """Get or create singleton document chunker."""
+     global _document_chunker
+     if _document_chunker is None:
+         config = config or ChunkerConfig()
+         _document_chunker = SemanticChunker(config)
+     return _document_chunker
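
To see how the table pipeline above fits together, here is a minimal usage sketch. It is a sketch only: the import paths, the keyword constructors for `ChunkerConfig`, `OCRRegion`, and `BoundingBox`, and the sample coordinates are assumptions based on how those fields are used in this file, and the `region` helper is hypothetical.

```python
from src.document.chunking.chunker import ChunkerConfig, get_document_chunker
from src.document.schemas.core import BoundingBox, OCRRegion


def region(text: str, x: float, y: float, conf: float = 0.95) -> OCRRegion:
    """Hypothetical helper: a small OCR region anchored at (x, y)."""
    return OCRRegion(
        text=text,
        page=0,
        confidence=conf,
        bbox=BoundingBox(x_min=x, y_min=y, x_max=x + 80, y_max=y + 20, normalized=False),
    )


chunker = get_document_chunker(ChunkerConfig())  # config only applies on the first call (singleton)

# Two rows: a text-only header row and a half-numeric data row.
# Assumes the default thresholds treat y=40 vs y=80 as separate rows
# and x=50 vs x=200 as separate columns.
regions = [
    region("Name", 50, 40), region("Revenue", 200, 40),
    region("ACME", 50, 80), region("$1,200", 200, 80),
]

table = chunker._reconstruct_table_structure(regions)  # private helper shown above
print(chunker._table_to_markdown(table["rows"], table["headers"], table["has_header"]))
# With header detection firing on the text-only first row, the expected output is:
# | Name | Revenue |
# | --- | --- |
# | ACME | $1,200 |
```

One caveat on the singleton: because `get_document_chunker` caches the first instance, a later call with a different `ChunkerConfig` silently returns the original chunker; pass the config on first use or construct `SemanticChunker(config)` directly.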
src/document/grounding/__init__.py ADDED
@@ -0,0 +1,21 @@
+ """
+ Document Grounding Module
+
+ Provides evidence packaging and visual grounding for extracted information.
+ """
+
+ from .evidence import (
+     GroundingConfig,
+     EvidenceBuilder,
+     create_evidence_ref,
+     crop_region_image,
+     encode_image_base64,
+ )
+
+ __all__ = [
+     "GroundingConfig",
+     "EvidenceBuilder",
+     "create_evidence_ref",
+     "crop_region_image",
+     "encode_image_base64",
+ ]
src/document/grounding/evidence.py ADDED
@@ -0,0 +1,365 @@
+ """
+ Evidence Builder for Document Grounding
+
+ Creates evidence references for extracted information.
+ Handles image cropping and base64 encoding.
+ """
+
+ import base64
+ import io
+ from typing import List, Optional, Dict
+ from pydantic import BaseModel, Field
+ import numpy as np
+ from PIL import Image
+ from loguru import logger
+
+ from ..schemas.core import (
+     BoundingBox,
+     DocumentChunk,
+     EvidenceRef,
+     OCRRegion,
+ )
+
+
+ """Configuration for grounding and evidence generation."""
26
+ # Image cropping
27
+ include_images: bool = Field(
28
+ default=True,
29
+ description="Include cropped images in evidence"
30
+ )
31
+ crop_padding: int = Field(
32
+ default=10,
33
+ ge=0,
34
+ description="Padding around crop regions in pixels"
35
+ )
36
+ max_image_size: int = Field(
37
+ default=512,
38
+ ge=64,
39
+ description="Maximum dimension for cropped images"
40
+ )
41
+ image_format: str = Field(
42
+ default="PNG",
43
+ description="Image format for encoding (PNG/JPEG)"
44
+ )
45
+ image_quality: int = Field(
46
+ default=85,
47
+ ge=1,
48
+ le=100,
49
+ description="JPEG quality if using JPEG format"
50
+ )
51
+
52
+ # Snippet settings
53
+ max_snippet_length: int = Field(
54
+ default=200,
55
+ ge=50,
56
+ description="Maximum length of text snippets"
57
+ )
58
+ include_context: bool = Field(
59
+ default=True,
60
+ description="Include surrounding context in snippets"
61
+ )
62
+
63
+
64
+ def crop_region_image(
+     image: np.ndarray,
+     bbox: BoundingBox,
+     padding: int = 10,
+     max_size: Optional[int] = None,
+ ) -> np.ndarray:
+     """
+     Crop a region from an image.
+
+     Args:
+         image: Source image (RGB, HWC format)
+         bbox: Bounding box to crop (pixel coordinates)
+         padding: Padding around the crop
+         max_size: Maximum dimension (will resize if larger)
+
+     Returns:
+         Cropped image as numpy array
+     """
+     height, width = image.shape[:2]
+
+     # Get pixel coordinates with padding, clamped to the image bounds
+     x1 = max(0, int(bbox.x_min) - padding)
+     y1 = max(0, int(bbox.y_min) - padding)
+     x2 = min(width, int(bbox.x_max) + padding)
+     y2 = min(height, int(bbox.y_max) + padding)
+
+     # Crop
+     cropped = image[y1:y2, x1:x2]
+
+     # Resize if needed
+     if max_size and max(cropped.shape[:2]) > max_size:
+         pil_img = Image.fromarray(cropped)
+         pil_img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
+         cropped = np.array(pil_img)
+
+     return cropped
+
+
+ def encode_image_base64(
+     image: np.ndarray,
+     format: str = "PNG",
+     quality: int = 85,
+ ) -> str:
+     """
+     Encode image to base64 string.
+
+     Args:
+         image: Image as numpy array
+         format: Image format (PNG/JPEG)
+         quality: JPEG quality if applicable
+
+     Returns:
+         Base64-encoded string
+     """
+     pil_img = Image.fromarray(image)
+
+     # Convert to RGB if needed
+     if pil_img.mode != "RGB":
+         pil_img = pil_img.convert("RGB")
+
+     # Encode
+     buffer = io.BytesIO()
+     if format.upper() == "JPEG":
+         pil_img.save(buffer, format="JPEG", quality=quality)
+     else:
+         pil_img.save(buffer, format="PNG")
+
+     buffer.seek(0)
+     encoded = base64.b64encode(buffer.read()).decode("utf-8")
+
+     return encoded
+
+
+ def create_evidence_ref(
+     chunk: DocumentChunk,
+     source_type: str = "text",
+     snippet: Optional[str] = None,
+     confidence: float = 1.0,
+     image: Optional[np.ndarray] = None,
+     config: Optional[GroundingConfig] = None,
+ ) -> EvidenceRef:
+     """
+     Create an evidence reference from a document chunk.
+
+     Args:
+         chunk: Source chunk
+         source_type: Type of source (text/table/figure)
+         snippet: Optional specific snippet (defaults to chunk text)
+         confidence: Confidence score
+         image: Optional page image for cropping
+         config: Grounding configuration
+
+     Returns:
+         EvidenceRef instance
+     """
+     config = config or GroundingConfig()
+
+     # Create snippet
+     if snippet is None:
+         snippet = chunk.text[:config.max_snippet_length]
+         if len(chunk.text) > config.max_snippet_length:
+             snippet += "..."
+
+     # Create base evidence
+     evidence = EvidenceRef(
+         chunk_id=chunk.chunk_id,
+         page=chunk.page,
+         bbox=chunk.bbox,
+         source_type=source_type,
+         snippet=snippet,
+         confidence=confidence,
+     )
+
+     # Add image if available and configured
+     if image is not None and config.include_images:
+         try:
+             cropped = crop_region_image(
+                 image,
+                 chunk.bbox,
+                 padding=config.crop_padding,
+                 max_size=config.max_image_size,
+             )
+             evidence.image_base64 = encode_image_base64(
+                 cropped,
+                 format=config.image_format,
+                 quality=config.image_quality,
+             )
+         except Exception as e:
+             logger.warning(f"Failed to crop evidence image: {e}")
+
+     return evidence
+
+
+ class EvidenceBuilder:
+     """
+     Builder for creating evidence references.
+
+     Handles:
+     - Evidence from chunks
+     - Evidence from OCR regions
+     - Evidence aggregation
+     - Image cropping and encoding
+     """
+
+     def __init__(self, config: Optional[GroundingConfig] = None):
+         """Initialize evidence builder."""
+         self.config = config or GroundingConfig()
+
+     def from_chunk(
+         self,
+         chunk: DocumentChunk,
+         image: Optional[np.ndarray] = None,
+         additional_context: Optional[str] = None,
+     ) -> EvidenceRef:
+         """
+         Create evidence reference from a chunk.
+
+         Args:
+             chunk: Source chunk
+             image: Optional page image for visual evidence
+             additional_context: Optional additional context
+
+         Returns:
+             EvidenceRef
+         """
+         # Determine source type
+         source_type = chunk.chunk_type.value
+
+         # Build snippet with optional context
+         snippet = chunk.text[:self.config.max_snippet_length]
+         if additional_context:
+             snippet = f"{additional_context}\n{snippet}"
+         if len(chunk.text) > self.config.max_snippet_length:
+             snippet += "..."
+
+         return create_evidence_ref(
+             chunk=chunk,
+             source_type=source_type,
+             snippet=snippet,
+             confidence=chunk.confidence,
+             image=image,
+             config=self.config,
+         )
+
+     def from_ocr_region(
+         self,
+         region: OCRRegion,
+         chunk_id: str,
+         document_id: str,
+         image: Optional[np.ndarray] = None,
+     ) -> EvidenceRef:
+         """
+         Create evidence reference from an OCR region.
+
+         Args:
+             region: OCR region
+             chunk_id: ID to assign
+             document_id: Parent document ID
+             image: Optional page image
+
+         Returns:
+             EvidenceRef
+         """
+         # Wrap the region in a temporary chunk (only ChunkType needs a local import;
+         # DocumentChunk is already imported at module scope)
+         from ..schemas.core import ChunkType
+
+         chunk = DocumentChunk(
+             chunk_id=chunk_id,
+             chunk_type=ChunkType.TEXT,
+             text=region.text,
+             bbox=region.bbox,
+             page=region.page,
+             document_id=document_id,
+             source_path=None,
+             sequence_index=0,
+             confidence=region.confidence,
+         )
+
+         return self.from_chunk(chunk, image)
+
+     def aggregate_evidence(
+         self,
+         evidence_list: List[EvidenceRef],
+         combine_snippets: bool = True,
+     ) -> List[EvidenceRef]:
+         """
+         Aggregate and deduplicate evidence references.
+
+         Args:
+             evidence_list: List of evidence references
+             combine_snippets: Whether to combine snippets from same chunk
+
+         Returns:
+             Deduplicated evidence list
+         """
+         if not evidence_list:
+             return []
+
+         # Group by chunk_id
+         by_chunk: Dict[str, List[EvidenceRef]] = {}
+         for ev in evidence_list:
+             if ev.chunk_id not in by_chunk:
+                 by_chunk[ev.chunk_id] = []
+             by_chunk[ev.chunk_id].append(ev)
+
+         # Combine or select best
+         result = []
+         for chunk_id, evidences in by_chunk.items():
+             if len(evidences) == 1:
+                 result.append(evidences[0])
+             else:
+                 # Take highest confidence, combine snippets
+                 best = max(evidences, key=lambda e: e.confidence)
+                 if combine_snippets:
+                     # Deduplicate while preserving order (set() would shuffle snippets)
+                     all_snippets = list(dict.fromkeys(e.snippet for e in evidences))
+                     combined = " ... ".join(all_snippets[:3])
+                     best = EvidenceRef(
+                         chunk_id=best.chunk_id,
+                         page=best.page,
+                         bbox=best.bbox,
+                         source_type=best.source_type,
+                         snippet=combined[:self.config.max_snippet_length],
+                         confidence=best.confidence,
+                         image_base64=best.image_base64,
+                     )
+                 result.append(best)
+
+         # Sort by page and position
+         result.sort(key=lambda e: (e.page, e.bbox.y_min, e.bbox.x_min))
+
+         return result
+
+     def create_grounding_context(
+         self,
+         evidence_list: List[EvidenceRef],
+         include_images: bool = False,
+     ) -> str:
+         """
+         Create a text context from evidence for LLM prompting.
+
+         Args:
+             evidence_list: Evidence references
+             include_images: Whether to include image markers
+
+         Returns:
+             Formatted context string
+         """
+         if not evidence_list:
+             return ""
+
+         lines = ["Evidence from document:"]
+         for i, ev in enumerate(evidence_list, 1):
+             lines.append(
+                 f"\n[{i}] Page {ev.page + 1}, {ev.source_type} "
+                 f"(confidence: {ev.confidence:.2f}):"
+             )
+             lines.append(f'  "{ev.snippet}"')
+
+             if include_images and ev.image_base64:
+                 lines.append("  [Image available]")
+
+         return "\n".join(lines)
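
The evidence flow above (chunk, snippet, optional crop, LLM context) can be exercised in a few lines. Again a hedged sketch: the `DocumentChunk` keyword arguments mirror the constructor calls in this diff, the sample text is invented, and the zeroed array stands in for a real rendered page.

```python
import numpy as np

from src.document.grounding.evidence import EvidenceBuilder, GroundingConfig
from src.document.schemas.core import BoundingBox, ChunkType, DocumentChunk

builder = EvidenceBuilder(GroundingConfig(max_snippet_length=120))

chunk = DocumentChunk(
    chunk_id="doc1_ab12cd34",
    chunk_type=ChunkType.TEXT,
    text="Total revenue grew 12% year over year, driven by services.",
    bbox=BoundingBox(x_min=40, y_min=300, x_max=560, y_max=340, normalized=False),
    page=0,
    document_id="doc1",
    source_path="report.pdf",
    sequence_index=0,
    confidence=0.93,
)

page_image = np.zeros((792, 612, 3), dtype=np.uint8)  # stand-in RGB page render
evidence = builder.from_chunk(chunk, image=page_image)  # crops + base64-encodes the bbox

print(builder.create_grounding_context([evidence]))
# Evidence from document:
#
# [1] Page 1, text (confidence: 0.93):
#   "Total revenue grew 12% year over year, driven by services."
```

Because the image is cropped and encoded only when a page image is supplied and `include_images` is true, callers that need purely textual grounding can skip page rendering entirely.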
src/document/io/__init__.py ADDED
@@ -0,0 +1,28 @@
+ """
+ Document I/O Module
+
+ Handles loading, rendering, and caching of PDF and image documents.
+ """
+
+ from .loader import (
+     DocumentLoader,
+     load_document,
+     load_pdf,
+     load_image,
+     render_page,
+ )
+
+ from .cache import (
+     DocumentCache,
+     get_document_cache,
+ )
+
+ __all__ = [
+     "DocumentLoader",
+     "load_document",
+     "load_pdf",
+     "load_image",
+     "render_page",
+     "DocumentCache",
+     "get_document_cache",
+ ]
src/document/io/cache.py ADDED
@@ -0,0 +1,268 @@
+ """
+ Document Cache
+
+ Caches rendered page images and document metadata for performance.
+ """
+
+ from pathlib import Path
+ from typing import Dict, Optional
+ from dataclasses import dataclass
+ from datetime import datetime
+ from loguru import logger
+
+ import numpy as np
+
+ from cachetools import TTLCache
+
+
+ @dataclass
+ class CacheEntry:
+     """A cached page image entry."""
+     document_id: str
+     page_number: int
+     dpi: int
+     image: np.ndarray
+     created_at: datetime
+     size_bytes: int
+
+
+ class DocumentCache:
+     """
+     In-memory cache for rendered document pages.
+     Uses TTL + LRU eviction (cachetools.TTLCache) with optional disk persistence.
+     """
+
+     def __init__(
+         self,
+         max_pages: int = 100,
+         max_memory_mb: int = 1024,
+         ttl_seconds: int = 3600,
+         disk_cache_dir: Optional[str] = None,
+     ):
+         """
+         Initialize document cache.
+
+         Args:
+             max_pages: Maximum number of pages to cache in memory
+             max_memory_mb: Maximum memory usage in MB
+             ttl_seconds: Time-to-live for cache entries
+             disk_cache_dir: Optional directory for disk caching
+         """
+         self.max_pages = max_pages
+         self.max_memory_mb = max_memory_mb
+         self.ttl_seconds = ttl_seconds
+         self.disk_cache_dir = disk_cache_dir
+
+         # In-memory cache
+         self._cache: TTLCache = TTLCache(maxsize=max_pages, ttl=ttl_seconds)
+
+         # Memory tracking (approximate: entries that TTLCache evicts on its own
+         # via TTL/maxsize are not subtracted here)
+         self._memory_used_bytes = 0
+
+         # Statistics
+         self._hits = 0
+         self._misses = 0
+
+         # Initialize disk cache if enabled
+         if disk_cache_dir:
+             self._disk_cache_path = Path(disk_cache_dir)
+             self._disk_cache_path.mkdir(parents=True, exist_ok=True)
+         else:
+             self._disk_cache_path = None
+
+         logger.debug(f"Initialized DocumentCache (max_pages={max_pages}, max_memory={max_memory_mb}MB)")
+
+     def _make_key(self, document_id: str, page_number: int, dpi: int) -> str:
+         """Generate cache key."""
+         return f"{document_id}:p{page_number}:d{dpi}"
+
+     def get(
+         self,
+         document_id: str,
+         page_number: int,
+         dpi: int = 300,
+     ) -> Optional[np.ndarray]:
+         """
+         Get a cached page image.
+
+         Args:
+             document_id: Document identifier
+             page_number: Page number
+             dpi: Rendering DPI
+
+         Returns:
+             Cached image array or None
+         """
+         key = self._make_key(document_id, page_number, dpi)
+
+         # Check in-memory cache
+         entry = self._cache.get(key)
+         if entry is not None:
+             self._hits += 1
+             return entry.image
+
+         # Check disk cache
+         if self._disk_cache_path:
+             disk_path = self._disk_cache_path / f"{key}.npy"
+             if disk_path.exists():
+                 try:
+                     image = np.load(disk_path)
+                     # Promote to memory cache
+                     self._put_memory(key, document_id, page_number, dpi, image)
+                     self._hits += 1
+                     return image
+                 except Exception as e:
+                     logger.warning(f"Failed to load from disk cache: {e}")
+
+         self._misses += 1
+         return None
+
+     def put(
+         self,
+         document_id: str,
+         page_number: int,
+         dpi: int,
+         image: np.ndarray,
+         persist_to_disk: bool = False,
+     ):
+         """
+         Cache a page image.
+
+         Args:
+             document_id: Document identifier
+             page_number: Page number
+             dpi: Rendering DPI
+             image: Page image as numpy array
+             persist_to_disk: Whether to persist to disk
+         """
+         key = self._make_key(document_id, page_number, dpi)
+
+         # Put in memory cache
+         self._put_memory(key, document_id, page_number, dpi, image)
+
+         # Optionally persist to disk
+         if persist_to_disk and self._disk_cache_path:
+             self._put_disk(key, image)
+
+     def _put_memory(
+         self,
+         key: str,
+         document_id: str,
+         page_number: int,
+         dpi: int,
+         image: np.ndarray,
+     ):
+         """Put entry in memory cache."""
+         size_bytes = image.nbytes
+
+         # If the key is being overwritten, release the old entry's bytes first
+         old_entry = self._cache.get(key)
+         if old_entry is not None:
+             self._memory_used_bytes -= old_entry.size_bytes
+
+         # Check memory limit
+         max_bytes = self.max_memory_mb * 1024 * 1024
+         if self._memory_used_bytes + size_bytes > max_bytes:
+             # Evict oldest entries until we have space
+             self._evict_to_fit(size_bytes)
+
+         entry = CacheEntry(
+             document_id=document_id,
+             page_number=page_number,
+             dpi=dpi,
+             image=image,
+             created_at=datetime.utcnow(),
+             size_bytes=size_bytes,
+         )
+
+         self._cache[key] = entry
+         self._memory_used_bytes += size_bytes
+
+     def _put_disk(self, key: str, image: np.ndarray):
+         """Persist entry to disk cache."""
+         if not self._disk_cache_path:
+             return
+
+         try:
+             disk_path = self._disk_cache_path / f"{key}.npy"
+             np.save(disk_path, image)
+         except Exception as e:
+             logger.warning(f"Failed to write to disk cache: {e}")
+
+     def _evict_to_fit(self, needed_bytes: int):
+         """Evict entries to fit new entry."""
+         max_bytes = self.max_memory_mb * 1024 * 1024
+         target = max_bytes - needed_bytes
+
+         # Iterate entries in insertion order (oldest first)
+         entries = list(self._cache.items())
+
+         for key, entry in entries:
+             if self._memory_used_bytes <= target:
+                 break
+             self._memory_used_bytes -= entry.size_bytes
+             del self._cache[key]
+
+     def invalidate(self, document_id: str, page_number: Optional[int] = None):
+         """
+         Invalidate cache entries for a document.
+
+         Args:
+             document_id: Document to invalidate
+             page_number: Optional specific page (None = all pages)
+         """
+         keys_to_remove = []
+
+         for key in self._cache.keys():
+             if key.startswith(f"{document_id}:"):
+                 if page_number is None or f":p{page_number}:" in key:
+                     keys_to_remove.append(key)
+
+         for key in keys_to_remove:
+             entry = self._cache.pop(key, None)
+             if entry:
+                 self._memory_used_bytes -= entry.size_bytes
+
+         # Also remove from disk cache (glob, so disk-only entries are covered too)
+         if self._disk_cache_path:
+             pattern = (
+                 f"{document_id}:*.npy" if page_number is None
+                 else f"{document_id}:p{page_number}:*.npy"
+             )
+             for disk_path in self._disk_cache_path.glob(pattern):
+                 disk_path.unlink()
+
+     def clear(self):
+         """Clear all cache entries."""
+         self._cache.clear()
+         self._memory_used_bytes = 0
+
+         # Clear disk cache
+         if self._disk_cache_path:
+             for f in self._disk_cache_path.glob("*.npy"):
+                 f.unlink()
+
+         logger.info("Document cache cleared")
+
+     @property
+     def stats(self) -> Dict:
+         """Get cache statistics."""
+         total = self._hits + self._misses
+         hit_rate = (self._hits / total * 100) if total > 0 else 0
+
+         return {
+             "hits": self._hits,
+             "misses": self._misses,
+             "hit_rate": f"{hit_rate:.1f}%",
+             "entries": len(self._cache),
+             "memory_used_mb": self._memory_used_bytes / (1024 * 1024),
+             "max_memory_mb": self.max_memory_mb,
+         }
+
+
+ # Global cache instance
+ _document_cache: Optional[DocumentCache] = None
+
+
+ def get_document_cache() -> DocumentCache:
+     """Get or create the global document cache."""
+     global _document_cache
+     if _document_cache is None:
+         _document_cache = DocumentCache()
+     return _document_cache
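
A short sketch of the cache lifecycle above, with an invented document id and a zeroed array standing in for a real 300 DPI page render:

```python
import numpy as np

from src.document.io.cache import DocumentCache

cache = DocumentCache(max_pages=16, max_memory_mb=256, disk_cache_dir="/tmp/sparknet_pages")

page = np.zeros((3300, 2550, 3), dtype=np.uint8)  # roughly a 300 DPI US-Letter page
cache.put("doc1", page_number=0, dpi=300, image=page, persist_to_disk=True)

assert cache.get("doc1", page_number=0, dpi=300) is not None  # memory hit
cache.invalidate("doc1", page_number=0)                        # drops memory and disk copies
assert cache.get("doc1", page_number=0, dpi=300) is None       # miss after invalidation

print(cache.stats)  # e.g. {'hits': 1, 'misses': 1, 'hit_rate': '50.0%', ...}
```

One sizing note: a single 300 DPI RGB page is about 25 MB uncompressed, so the default `max_pages=100` can outrun `max_memory_mb=1024`; the byte-level eviction in `_put_memory` is what actually enforces the budget.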