Commit ee39cc9 by LiamKhoaLe · Parents: e55c935 e34edc7

Merge commit 'e34edc7cd55f292dd0b192dc00b782c22208fde6' as 'ingestion_python'
ingestion_python/.dockerignore ADDED
@@ -0,0 +1,46 @@
+ # Ignore unnecessary files for Docker build
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env/
+ venv/
+ .venv/
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.log
+ .git/
+ .gitignore
+ .dockerignore
+
+ # Ignore documentation files (keep only main README)
+ *.md
+ !README.md
+
+ # Ignore test files
+ test_*.py
+ *_test.py
+ *.sh
+ tests/
+
+ # Ignore IDE files
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Ignore OS files
+ .DS_Store
+ Thumbs.db
+
+ # Ignore unnecessary directories
+ # config/ and services/ are needed for the application
ingestion_python/CURL.md ADDED
@@ -0,0 +1,138 @@
+ # CURL Test Commands for Ingestion Pipeline
+
+ ## Backend Configuration
+ - **URL**: `https://binkhoale1812-studdybuddy-ingestion1.hf.space/`
+ - **User ID**: `44e65346-8eaa-4f95-b17a-f6219953e7a8`
+ - **Project ID**: `496e2fad-ec7e-4562-b06a-ea2491f2460`
+ - **Test Files**: `Lecture5_ML.pdf`, `Lecture6_ANN_DL.pdf`
+
+ ## 1. Health Check
+
+ ```bash
+ curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/health" \
+   -H "Content-Type: application/json"
+ ```
+
+ ## 2. Upload Files
+
+ ```bash
+ curl -X POST "https://binkhoale1812-studdybuddy-ingestion1.hf.space/upload" \
+   -F "user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8" \
+   -F "project_id=496e2fad-ec7e-4562-b06a-ea2491f2460" \
+   -F "files=@../exefiles/Lecture5_ML.pdf" \
+   -F "files=@../exefiles/Lecture6_ANN_DL.pdf"
+ ```
+
+ ## 3. Check Upload Status
+
+ Replace `{JOB_ID}` with the job_id from the upload response:
+
+ ```bash
+ curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/upload/status?job_id={JOB_ID}" \
+   -H "Content-Type: application/json"
+ ```
+
+ ## 4. List Uploaded Files
+
+ ```bash
+ curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/files?user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8&project_id=496e2fad-ec7e-4562-b06a-ea2491f2460" \
+   -H "Content-Type: application/json"
+ ```
+
+ ## 5. Get File Chunks (Lecture5_ML.pdf)
+
+ ```bash
+ curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/files/chunks?user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8&project_id=496e2fad-ec7e-4562-b06a-ea2491f2460&filename=Lecture5_ML.pdf&limit=5" \
+   -H "Content-Type: application/json"
+ ```
+
+ ## 6. Get File Chunks (Lecture6_ANN_DL.pdf)
+
+ ```bash
+ curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/files/chunks?user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8&project_id=496e2fad-ec7e-4562-b06a-ea2491f2460&filename=Lecture6_ANN_DL.pdf&limit=5" \
+   -H "Content-Type: application/json"
+ ```
+
+ ## Expected Responses
+
+ ### Health Check Response
+ ```json
+ {
+   "ok": true,
+   "mongodb_connected": true,
+   "service": "ingestion_pipeline"
+ }
+ ```
+
+ ### Upload Response
+ ```json
+ {
+   "job_id": "uuid-string",
+   "status": "processing",
+   "total_files": 2
+ }
+ ```
+
+ ### Status Response
+ ```json
+ {
+   "job_id": "uuid-string",
+   "status": "completed",
+   "total": 2,
+   "completed": 2,
+   "progress": 100.0,
+   "last_error": null,
+   "created_at": 1234567890.123
+ }
+ ```
+
+ ### Files List Response
+ ```json
+ {
+   "files": [
+     {
+       "filename": "Lecture5_ML.pdf",
+       "summary": "Document summary..."
+     },
+     {
+       "filename": "Lecture6_ANN_DL.pdf",
+       "summary": "Document summary..."
+     }
+   ],
+   "filenames": ["Lecture5_ML.pdf", "Lecture6_ANN_DL.pdf"]
+ }
+ ```
+
+ ### Chunks Response
+ ```json
+ {
+   "chunks": [
+     {
+       "user_id": "44e65346-8eaa-4f95-b17a-f6219953e7a8",
+       "project_id": "496e2fad-ec7e-4562-b06a-ea2491f2460",
+       "filename": "Lecture5_ML.pdf",
+       "topic_name": "Machine Learning Introduction",
+       "summary": "Chunk summary...",
+       "content": "Chunk content...",
+       "embedding": [0.1, 0.2, ...],
+       "page_span": [1, 3],
+       "card_id": "lecture5_ml-c0001"
+     }
+   ]
+ }
+ ```
+
+ ## Testing Steps
+
+ 1. **Run Health Check**: Verify the service is running
+ 2. **Upload Files**: Upload both PDF files
+ 3. **Monitor Progress**: Check job status until completion
+ 4. **Verify Files**: List uploaded files
+ 5. **Inspect Chunks**: Get document chunks to verify processing
+
+ ## Troubleshooting
+
+ - **Connection Issues**: Check if the backend URL is accessible
+ - **File Not Found**: Ensure PDF files exist in `../exefiles/` directory
+ - **Upload Fails**: Check file size limits and format support
+ - **Processing Stuck**: Monitor job status and check logs
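Taken together, the Testing Steps in CURL.md reduce to a small script: upload once, then poll `/upload/status` until the job reports `completed`. A minimal sketch, assuming `jq` is installed and reusing the URL and IDs from the Backend Configuration section:

```bash
#!/bin/bash
# Upload one file, then poll /upload/status until the job completes.
BASE="https://binkhoale1812-studdybuddy-ingestion1.hf.space"
USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

JOB_ID=$(curl -sS -X POST "$BASE/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F "files=@../exefiles/Lecture5_ML.pdf" | jq -r '.job_id')

# Poll until the job reports "completed" (give up after 48 tries, ~16 min).
for i in $(seq 1 48); do
  STATUS=$(curl -sS "$BASE/upload/status?job_id=$JOB_ID" | jq -r '.status')
  echo "attempt $i: $STATUS"
  [ "$STATUS" = "completed" ] && break
  sleep 20
done
```

This is the same upload-then-poll loop the `test_upload*.sh` scripts below implement with extra diagnostics.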
ingestion_python/Dockerfile ADDED
@@ -0,0 +1,33 @@
+ # Hugging Face Spaces - Docker for Ingestion Pipeline
+ FROM python:3.11-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # System deps (same as main system)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential curl git libglib2.0-0 libgl1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create and use a non-root user (same as main system)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy ingestion pipeline files (includes utils and helpers)
+ COPY . .
+
+ # Install Python dependencies (same as main system)
+ RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
+
+ # No local model caches or warmup needed (remote embedding service)
+
+ # Expose port for HF Spaces
+ ENV PORT=7860
+ EXPOSE 7860
+
+ # Start FastAPI (single worker so app.state.jobs remains consistent)
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
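To exercise this Dockerfile locally before pushing to HF Spaces, something like the following should work; the image tag is illustrative and `MONGO_URI` is a placeholder for the shared Atlas cluster:

```bash
# Build from the ingestion_python directory.
docker build -t studdybuddy-ingestion .

# Run on the HF Spaces default port; MONGO_URI is the only required variable.
docker run --rm -p 7860:7860 \
  -e MONGO_URI="mongodb+srv://<user>:<pass>@<cluster>/" \
  -e MONGO_DB="studybuddy" \
  studdybuddy-ingestion

# Then verify from another shell:
curl http://localhost:7860/health
```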
ingestion_python/README.md ADDED
@@ -0,0 +1,246 @@
+ ---
+ title: StuddyBuddy Ingestion
+ emoji: ⚙️
+ colorFrom: blue
+ colorTo: pink
+ sdk: docker
+ pinned: false
+ license: mit
+ short_description: 'backend for data ingestion'
+ ---
+
+ # Ingestion Pipeline
+
+ A dedicated service for processing file uploads and storing them in MongoDB Atlas. This service mirrors the main system's file processing functionality while running as a separate service to share the processing load.
+
+ [API docs](API.md) | [System docs](COMPATIBILITY.md)
+
+ ## 🏗️ Architecture
+
+ ```
+ ┌─────────────────────────────────────────────────────────────────────────────────┐
+ │                                USER INTERFACE                                    │
+ │ ┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐              │
+ │ │ Frontend UI     │    │ Load Balancer    │    │ Main System     │              │
+ │ │                 │◄──►│                  │◄──►│ (Port 7860)     │              │
+ │ │ - File Upload   │    │ - Route Requests │    │ - Chat & Reports│              │
+ │ │ - Chat Interface│    │ - Health Checks  │    │ - User Mgmt     │              │
+ │ │ - Project Mgmt  │    │ - Load Balancing │    │ - Analytics     │              │
+ │ └─────────────────┘    └──────────────────┘    └─────────────────┘              │
+ └─────────────────────────────────────────────────────────────────────────────────┘
+
+
+ ┌─────────────────────────────────────────────────────────────────────────────────┐
+ │                              INGESTION PIPELINE                                  │
+ │ ┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐              │
+ │ │ File Processing │    │ Data Storage     │    │ Monitoring      │              │
+ │ │ - PDF/DOCX Parse│    │ - MongoDB Atlas  │    │ - Job Status    │              │
+ │ │ - Image Caption │    │ - Vector Search  │    │ - Health Checks │              │
+ │ │ - Text Chunking │    │ - Embeddings     │    │ - Error Handling│              │
+ │ │ - Embedding Gen │    │ - User/Project   │    │ - Logging       │              │
+ │ └─────────────────┘    └──────────────────┘    └─────────────────┘              │
+ └─────────────────────────────────────────────────────────────────────────────────┘
+
+
+ ┌─────────────────────────────────────────────────────────────────────────────────┐
+ │                               SHARED DATABASE                                    │
+ │ ┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐              │
+ │ │ MongoDB Atlas   │    │ Collections      │    │ Indexes         │              │
+ │ │                 │    │ - chunks         │    │ - Vector Search │              │
+ │ │ - Same Cluster  │    │ - files          │    │ - Text Search   │              │
+ │ │ - Same Database │    │ - chat_sessions  │    │ - User/Project  │              │
+ │ │ - Same Schema   │    │ - chat_messages  │    │ - Performance   │              │
+ │ └─────────────────┘    └──────────────────┘    └─────────────────┘              │
+ └─────────────────────────────────────────────────────────────────────────────────┘
+ ```
+
+ ## 📁 Project Structure
+
+ ```
+ ingestion_pipeline/
+ ├── __init__.py
+ ├── app.py                  # Main FastAPI application
+ ├── requirements.txt        # Python dependencies
+ ├── Dockerfile              # HuggingFace deployment
+ ├── deploy.sh               # Deployment script
+ ├── test_pipeline.py        # Test script
+ ├── README.md               # This file
+ ├── config/                 # Configuration
+ │   ├── __init__.py
+ │   └── settings.py
+ ├── api/                    # API layer
+ │   ├── __init__.py
+ │   ├── models.py           # Pydantic models
+ │   └── routes.py           # API routes
+ └── services/               # Business logic
+     ├── __init__.py
+     └── ingestion_service.py
+ ```
+
+ ## 🚀 Quick Start
+
+ ### Prerequisites
+ - Docker
+ - MongoDB Atlas cluster
+ - Python 3.11+
+
+
+ ## 🔧 API Endpoints
+
+ ### Health Check
+ ```http
+ GET /health
+ ```
+
+ ### Upload Files
+ ```http
+ POST /upload
+ Content-Type: multipart/form-data
+
+ user_id: string
+ project_id: string
+ files: File[]
+ replace_filenames: string (optional)
+ rename_map: string (optional)
+ ```
+
+ ### Job Status
+ ```http
+ GET /upload/status?job_id={job_id}
+ ```
+
+ ### List Files
+ ```http
+ GET /files?user_id={user_id}&project_id={project_id}
+ ```
+
+ ### Get File Chunks
+ ```http
+ GET /files/chunks?user_id={user_id}&project_id={project_id}&filename={filename}&limit={limit}
+ ```
+
+ ## 🔄 Data Flow
+
+ ### File Processing Pipeline
+ 1. **File Upload**: User uploads files via frontend
+ 2. **Load Balancing**: Request routed to ingestion pipeline
+ 3. **File Processing**:
+    - PDF/DOCX parsing with image extraction
+    - Image captioning (NVIDIA Maverick)
+    - Semantic chunking with overlap
+    - Embedding generation (all-MiniLM-L6-v2)
+ 4. **Data Storage**:
+    - Chunks stored in `chunks` collection
+    - File summaries in `files` collection
+    - Both scoped by `user_id` and `project_id`
+ 5. **Response**: Job ID returned for progress tracking
+
+ ### Data Consistency
+ - **Same Database**: Uses identical MongoDB Atlas cluster
+ - **Same Collections**: Stores in `chunks` and `files` collections
+ - **Same Schema**: Identical data structure and metadata
+ - **Same Scoping**: All data scoped by `user_id` and `project_id`
+ - **Same Indexes**: Uses identical database indexes
+
+ ## 🐳 Docker Deployment
+
+ ### HuggingFace Spaces
+ The service is designed for HuggingFace Spaces deployment with:
+ - Port 7860 (HuggingFace default)
+ - Non-root user for security
+ - HuggingFace cache directories
+ - Remote embedding and captioning services (no local model warmup)
+
+ ### Logging
+ - Comprehensive logging for all operations
+ - Error tracking and debugging
+ - Performance monitoring
+
+ ### Job Tracking
+ - Upload progress monitoring
+ - Error handling and reporting
+ - Status updates
+
+ ## 🔧 Configuration
+
+ ### Environment Variables
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `MONGO_URI` | Required | MongoDB connection string |
+ | `MONGO_DB` | `studybuddy` | Database name |
+ | `EMBED_MODEL` | `sentence-transformers/all-MiniLM-L6-v2` | Embedding model |
+ | `ATLAS_VECTOR` | `0` | Enable Atlas Vector Search |
+ | `MAX_FILES_PER_UPLOAD` | `15` | Maximum files per upload |
+ | `MAX_FILE_MB` | `50` | Maximum file size in MB |
+ | `INGESTION_PORT` | `7860` | Service port |
+
+ ### Processing Configuration
+ - **Vector Dimension**: 384 (all-MiniLM-L6-v2)
+ - **Chunk Max Words**: 500
+ - **Chunk Min Words**: 150
+ - **Chunk Overlap**: 50 words
+
+ ## 🔒 Security
+
+ ### Security Features
+ - Non-root user in Docker container
+ - Input validation and sanitization
+ - Error handling and logging
+ - Rate limiting (configurable)
+
+ ### Best Practices
+ - Use environment variables for secrets
+ - Regular security updates
+ - Monitor logs for anomalies
+ - Implement proper access controls
+
+ ## 🚀 Performance
+
+ ### Optimization Features
+ - Lazy loading of ML models
+ - Efficient file processing
+ - Background task processing
+ - Memory management
+
+ ### Scaling
+ - Horizontal scaling support
+ - Load balancing ready
+ - Resource optimization
+ - Performance monitoring
+
+ ## 📚 Integration
+
+ ### Main System Integration
+ The ingestion pipeline is designed to work seamlessly with the main system:
+ - Same API endpoints
+ - Same data structures
+ - Same processing pipeline
+ - Same storage format
+
+ ### Load Balancer Integration
+ - Automatic request routing
+ - Health check integration
+ - Failover support
+ - Performance monitoring
+
+ ## 🐛 Troubleshooting
+
+ ### Common Issues
+ 1. **MongoDB Connection**: Verify `MONGO_URI` is correct
+ 2. **Port Conflicts**: Ensure port 7860 is available
+ 3. **Model Loading**: Check HuggingFace cache permissions
+ 4. **File Processing**: Verify file format support
+
+ ## 📈 Future Enhancements
+
+ ### Planned Features
+ - Multiple file format support
+ - Advanced chunking strategies
+ - Performance optimizations
+ - Enhanced monitoring
+
+ ### Scalability
+ - Kubernetes deployment
+ - Auto-scaling support
+ - Load balancing improvements
+ - Resource optimization
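The README's configuration table maps directly onto a bare-metal run: the same variables can be exported before starting the app. A sketch based on the table above and the `__main__` block of `app.py` (all values are placeholders):

```bash
export MONGO_URI="mongodb+srv://<user>:<pass>@<cluster>/"  # required
export MONGO_DB="studybuddy"                               # default shown in the table
export MAX_FILES_PER_UPLOAD=15
export MAX_FILE_MB=50
export INGESTION_PORT=7860

pip install -r requirements.txt
# A single worker keeps the in-memory job tracker (app.state.jobs) consistent,
# matching the Dockerfile's CMD.
python app.py   # or: uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
```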
ingestion_python/api/models.py ADDED
@@ -0,0 +1,37 @@
+ """
+ Pydantic models for the ingestion pipeline API
+ """
+
+ from typing import List, Dict, Any, Optional
+ from pydantic import BaseModel
+
+ # Response models (same as main system)
+ class UploadResponse(BaseModel):
+     job_id: str
+     status: str
+     total_files: Optional[int] = None
+
+ class JobStatusResponse(BaseModel):
+     job_id: str
+     status: str
+     total: int
+     completed: int
+     progress: float
+     last_error: Optional[str] = None
+     created_at: float
+
+ class HealthResponse(BaseModel):
+     ok: bool
+     mongodb_connected: bool
+     service: str = "ingestion_pipeline"
+
+ class FileResponse(BaseModel):
+     filename: str
+     summary: str
+
+ class FilesListResponse(BaseModel):
+     files: List[FileResponse]
+     filenames: List[str]
+
+ class ChunksResponse(BaseModel):
+     chunks: List[Dict[str, Any]]
ingestion_python/api/routes.py ADDED
@@ -0,0 +1,238 @@
+ """
+ API routes for the ingestion pipeline
+ """
+
+ import os
+ import asyncio
+ import uuid
+ import time
+ import json
+ from typing import List, Dict, Any, Optional
+ from fastapi import APIRouter, Form, File, UploadFile, HTTPException, BackgroundTasks, Request
+
+ from api.models import UploadResponse, JobStatusResponse, HealthResponse, FilesListResponse, ChunksResponse
+ from services.ingestion_service import IngestionService
+ from services.maverick_captioner import _normalize_caption
+ from utils.logger import get_logger
+
+ logger = get_logger("INGESTION_ROUTES", __name__)
+
+ # Create router
+ router = APIRouter()
+
+ # Global services (will be injected)
+ rag = None
+ embedder = None
+ captioner = None
+ ingestion_service = None
+
+ def initialize_services(rag_store, embedder_client, captioner_client):
+     """Initialize services"""
+     global rag, embedder, captioner, ingestion_service
+     rag = rag_store
+     embedder = embedder_client
+     captioner = captioner_client
+     ingestion_service = IngestionService(rag_store, embedder_client, captioner_client)
+
+ @router.get("/health", response_model=HealthResponse)
+ async def health():
+     """Health check endpoint"""
+     mongodb_connected = rag is not None
+     return HealthResponse(
+         ok=mongodb_connected,
+         mongodb_connected=mongodb_connected
+     )
+
+ @router.post("/upload", response_model=UploadResponse)
+ async def upload_files(
+     request: Request,
+     background_tasks: BackgroundTasks,
+     user_id: str = Form(...),
+     project_id: str = Form(...),
+     files: List[UploadFile] = File(...),
+     replace_filenames: Optional[str] = Form(None),
+     rename_map: Optional[str] = Form(None),
+ ):
+     """
+     Upload and process files
+
+     This endpoint mirrors the main system's upload functionality exactly.
+     """
+     if not rag:
+         raise HTTPException(500, detail="MongoDB connection not available")
+
+     job_id = str(uuid.uuid4())
+
+     # File limits (same as main system)
+     max_files = int(os.getenv("MAX_FILES_PER_UPLOAD", "15"))
+     max_mb = int(os.getenv("MAX_FILE_MB", "50"))
+
+     if len(files) > max_files:
+         raise HTTPException(400, detail=f"Too many files. Max {max_files} allowed per upload.")
+
+     # Parse replace/rename directives (same as main system)
+     replace_set = set()
+     try:
+         if replace_filenames:
+             replace_set = set(json.loads(replace_filenames))
+     except Exception:
+         pass
+
+     rename_dict: Dict[str, str] = {}
+     try:
+         if rename_map:
+             rename_dict = json.loads(rename_map)
+     except Exception:
+         pass
+
+     # Preload files (same as main system)
+     preloaded_files = []
+     for uf in files:
+         raw = await uf.read()
+         if len(raw) > max_mb * 1024 * 1024:
+             raise HTTPException(400, detail=f"{uf.filename} exceeds {max_mb} MB limit")
+         eff_name = rename_dict.get(uf.filename, uf.filename)
+         preloaded_files.append((eff_name, raw))
+
+     # Initialize job status (same as main system)
+     from app import app
+     app.state.jobs[job_id] = {
+         "created_at": time.time(),
+         "total": len(preloaded_files),
+         "completed": 0,
+         "status": "processing",
+         "last_error": None,
+     }
+
+     # Background processing (mirrors main system exactly)
+     async def _process_all():
+         for idx, (fname, raw) in enumerate(preloaded_files, start=1):
+             try:
+                 # Handle file replacement (same as main system)
+                 if fname in replace_set:
+                     try:
+                         rag.db["chunks"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
+                         rag.db["files"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
+                         logger.info(f"[{job_id}] Replaced prior data for {fname}")
+                     except Exception as de:
+                         logger.warning(f"[{job_id}] Replace delete failed for {fname}: {de}")
+
+                 logger.info(f"[{job_id}] ({idx}/{len(preloaded_files)}) Parsing {fname} ({len(raw)} bytes)")
+
+                 # Extract pages (same as main system)
+                 from helpers.pages import _extract_pages
+                 pages = _extract_pages(fname, raw)
+
+                 # Process images with captions (same as main system)
+                 num_imgs = sum(len(p.get("images", [])) for p in pages)
+                 captions = []
+                 if num_imgs > 0:
+                     for p in pages:
+                         caps = []
+                         for im in p.get("images", []):
+                             try:
+                                 cap = captioner.caption_image(im)
+                                 caps.append(cap)
+                             except Exception as e:
+                                 logger.warning(f"[{job_id}] Caption error in {fname}: {e}")
+                         captions.append(caps)
+                 else:
+                     captions = [[] for _ in pages]
+
+                 # Merge captions into text (same as main system)
+                 for p, caps in zip(pages, captions):
+                     if caps:
+                         normalized = [_normalize_caption(c) for c in caps if c]
+                         if normalized:
+                             p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in normalized])).strip()
+
+                 # Build cards (same as main system)
+                 from utils.ingestion.chunker import build_cards_from_pages
+                 cards = await build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
+                 logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
+
+                 # Generate embeddings (same as main system)
+                 embeddings = embedder.embed([c["content"] for c in cards])
+                 for c, vec in zip(cards, embeddings):
+                     c["embedding"] = vec
+
+                 # Store in MongoDB (same as main system)
+                 rag.store_cards(cards)
+
+                 # Create file summary (same as main system)
+                 from utils.service.summarizer import cheap_summarize
+                 full_text = "\n\n".join(p.get("text", "") for p in pages)
+                 file_summary = await cheap_summarize(full_text, max_sentences=6)
+                 rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
+
+                 logger.info(f"[{job_id}] Completed {fname}")
+
+                 # Update job progress (same as main system)
+                 job = app.state.jobs.get(job_id)
+                 if job:
+                     job["completed"] = idx
+                     job["status"] = "processing" if idx < job.get("total", 0) else "completed"
+
+             except Exception as e:
+                 logger.error(f"[{job_id}] Failed processing {fname}: {e}")
+                 job = app.state.jobs.get(job_id)
+                 if job:
+                     job["last_error"] = str(e)
+                     job["completed"] = idx
+             finally:
+                 await asyncio.sleep(0)
+
+         # Finalize job (same as main system)
+         logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
+         job = app.state.jobs.get(job_id)
+         if job:
+             job["status"] = "completed"
+
+     background_tasks.add_task(_process_all)
+
+     return UploadResponse(
+         job_id=job_id,
+         status="processing",
+         total_files=len(preloaded_files)
+     )
+
+ @router.get("/upload/status", response_model=JobStatusResponse)
+ async def upload_status(job_id: str):
+     """Get upload job status"""
+     from app import app
+     job = app.state.jobs.get(job_id)
+     if not job:
+         raise HTTPException(404, detail="Job not found")
+
+     progress = (job["completed"] / job["total"]) * 100 if job["total"] > 0 else 0
+
+     return JobStatusResponse(
+         job_id=job_id,
+         status=job["status"],
+         total=job["total"],
+         completed=job["completed"],
+         progress=progress,
+         last_error=job.get("last_error"),
+         created_at=job["created_at"]
+     )
+
+ @router.get("/files", response_model=FilesListResponse)
+ async def list_files(user_id: str, project_id: str):
+     """List files for a project (compatible with main system)"""
+     if not rag:
+         raise HTTPException(500, detail="MongoDB connection not available")
+
+     files = rag.list_files(user_id, project_id)
+     return FilesListResponse(
+         files=[{"filename": f["filename"], "summary": f["summary"]} for f in files],
+         filenames=[f["filename"] for f in files]
+     )
+
+ @router.get("/files/chunks", response_model=ChunksResponse)
+ async def get_file_chunks(user_id: str, project_id: str, filename: str, limit: int = 20):
+     """Get chunks for a specific file (compatible with main system)"""
+     if not rag:
+         raise HTTPException(500, detail="MongoDB connection not available")
+
+     chunks = rag.get_file_chunks(user_id, project_id, filename, limit)
+     return ChunksResponse(chunks=chunks)
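Note that `replace_filenames` and `rename_map` arrive as JSON strings inside the multipart form and are decoded with `json.loads` in the route above. A sketch with illustrative filenames: re-upload `notes.pdf` in place (its old chunks and summary are deleted first) and store `draft_v2.pdf` under a new name:

```bash
BASE="https://binkhoale1812-studdybuddy-ingestion1.hf.space"
USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

curl -X POST "$BASE/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F 'replace_filenames=["notes.pdf"]' \
  -F 'rename_map={"draft_v2.pdf": "notes_appendix.pdf"}' \
  -F "files=@notes.pdf" \
  -F "files=@draft_v2.pdf"
```

Because renaming happens during preloading (`rename_dict.get(uf.filename, uf.filename)`), the replace set is matched against the effective name, and the second file is stored as `notes_appendix.pdf`.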
ingestion_python/app.py ADDED
@@ -0,0 +1,58 @@
+ """
+ Ingestion Pipeline Service
+
+ A dedicated service for processing file uploads and storing them in MongoDB Atlas.
+ This service mirrors the main system's file processing functionality while
+ running as a separate service to share the processing load.
+ """
+
+ import os
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+
+ # Import shared utilities (now local)
+ from utils.logger import get_logger
+ from utils.rag.rag import RAGStore, ensure_indexes
+ from utils.embedding import RemoteEmbeddingClient
+ from services.maverick_captioner import NvidiaMaverickCaptioner
+ from api.routes import router, initialize_services
+
+ logger = get_logger("INGESTION_PIPELINE", __name__)
+
+ # FastAPI app
+ app = FastAPI(title="Ingestion Pipeline", version="1.0.0")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # In-memory job tracker (same as main system)
+ app.state.jobs = {}
+
+ # Global clients (same as main system)
+ try:
+     rag = RAGStore(mongo_uri=os.getenv("MONGO_URI"), db_name=os.getenv("MONGO_DB", "studybuddy"))
+     rag.client.admin.command('ping')
+     logger.info("[INGESTION_PIPELINE] MongoDB connection successful")
+     ensure_indexes(rag)
+     logger.info("[INGESTION_PIPELINE] MongoDB indexes ensured")
+ except Exception as e:
+     logger.error(f"[INGESTION_PIPELINE] Failed to initialize MongoDB: {e}")
+     rag = None
+
+ embedder = RemoteEmbeddingClient()
+ captioner = NvidiaMaverickCaptioner()
+
+ # Initialize services
+ initialize_services(rag, embedder, captioner)
+
+ # Include API routes
+ app.include_router(router)
+
+ if __name__ == "__main__":
+     import uvicorn
+     port = int(os.getenv("INGESTION_PORT", "7860"))
+     uvicorn.run(app, host="0.0.0.0", port=port)
ingestion_python/helpers/pages.py ADDED
@@ -0,0 +1,24 @@
+ import os
+ from typing import List, Dict, Any
+ from fastapi import HTTPException
+ from utils.ingestion.parser import parse_pdf_bytes, parse_docx_bytes
+
+ # ────────────────────────────── Helpers ──────────────────────────────
+ def _infer_mime(filename: str) -> str:
+     lower = filename.lower()
+     if lower.endswith(".pdf"):
+         return "application/pdf"
+     if lower.endswith(".docx"):
+         return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+     return "application/octet-stream"
+
+ def _extract_pages(filename: str, file_bytes: bytes) -> List[Dict[str, Any]]:
+     mime = _infer_mime(filename)
+     if mime == "application/pdf":
+         return parse_pdf_bytes(file_bytes)
+     elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+         return parse_docx_bytes(file_bytes)
+     else:
+         raise HTTPException(status_code=400, detail=f"Unsupported file type: {filename}")
ingestion_python/requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # Ingestion Pipeline Requirements
+ # Same as main system but focused on processing
+
+ fastapi==0.114.2
+ uvicorn[standard]==0.30.6
+ python-multipart==0.0.9
+ pymongo==4.8.0
+ httpx==0.27.2
+ requests==2.32.3
+ python-docx==1.1.2
+ PyMuPDF==1.24.10
+ pillow==10.4.0
+ sumy==0.11.0
+ numpy==1.26.4
+ reportlab==4.0.9
+ markdown==3.6
+ python-dotenv==1.0.0
ingestion_python/services/ingestion_service.py ADDED
@@ -0,0 +1,119 @@
+ """
+ Ingestion service for processing files and storing them in MongoDB
+ """
+
+ import asyncio
+ import uuid
+ import time
+ import json
+ from typing import List, Dict, Any, Optional
+ from utils.logger import get_logger
+ from utils.rag.rag import RAGStore
+ from utils.embedding import RemoteEmbeddingClient
+ from services.maverick_captioner import NvidiaMaverickCaptioner, _normalize_caption
+ from utils.ingestion.chunker import build_cards_from_pages
+ from utils.service.summarizer import cheap_summarize
+ from helpers.pages import _extract_pages
+
+ logger = get_logger("INGESTION_SERVICE", __name__)
+
+ class IngestionService:
+     """Service for processing file uploads and storing them in MongoDB"""
+
+     def __init__(self, rag_store: RAGStore, embedder: RemoteEmbeddingClient, captioner: NvidiaMaverickCaptioner):
+         self.rag = rag_store
+         self.embedder = embedder
+         self.captioner = captioner
+
+     async def process_files(
+         self,
+         user_id: str,
+         project_id: str,
+         files: List[tuple],  # (filename, raw_bytes)
+         replace_filenames: Optional[List[str]] = None,
+         rename_map: Optional[Dict[str, str]] = None,
+         job_id: Optional[str] = None
+     ) -> str:
+         """
+         Process files and store them in MongoDB
+
+         Args:
+             user_id: User identifier
+             project_id: Project identifier
+             files: List of (filename, raw_bytes) tuples
+             replace_filenames: Optional list of filenames to replace
+             rename_map: Optional mapping of old names to new names
+             job_id: Optional job ID for tracking
+
+         Returns:
+             Job ID for tracking progress
+         """
+         if not job_id:
+             job_id = str(uuid.uuid4())
+
+         replace_set = set(replace_filenames or [])
+
+         for idx, (fname, raw) in enumerate(files, start=1):
+             try:
+                 # Handle file replacement
+                 if fname in replace_set:
+                     try:
+                         self.rag.db["chunks"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
+                         self.rag.db["files"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
+                         logger.info(f"[{job_id}] Replaced prior data for {fname}")
+                     except Exception as de:
+                         logger.warning(f"[{job_id}] Replace delete failed for {fname}: {de}")
+
+                 logger.info(f"[{job_id}] ({idx}/{len(files)}) Parsing {fname} ({len(raw)} bytes)")
+
+                 # Extract pages
+                 pages = _extract_pages(fname, raw)
+
+                 # Process images with captions
+                 num_imgs = sum(len(p.get("images", [])) for p in pages)
+                 captions = []
+                 if num_imgs > 0:
+                     for p in pages:
+                         caps = []
+                         for im in p.get("images", []):
+                             try:
+                                 cap = self.captioner.caption_image(im)
+                                 caps.append(cap)
+                             except Exception as e:
+                                 logger.warning(f"[{job_id}] Caption error in {fname}: {e}")
+                         captions.append(caps)
+                 else:
+                     captions = [[] for _ in pages]
+
+                 # Merge captions into text
+                 for p, caps in zip(pages, captions):
+                     if caps:
+                         normalized = [_normalize_caption(c) for c in caps if c]
+                         if normalized:
+                             p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in normalized])).strip()
+
+                 # Build cards
+                 cards = await build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
+                 logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
+
+                 # Generate embeddings
+                 embeddings = self.embedder.embed([c["content"] for c in cards])
+                 for c, vec in zip(cards, embeddings):
+                     c["embedding"] = vec
+
+                 # Store in MongoDB
+                 self.rag.store_cards(cards)
+
+                 # Create file summary
+                 full_text = "\n\n".join(p.get("text", "") for p in pages)
+                 file_summary = await cheap_summarize(full_text, max_sentences=6)
+                 self.rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)
+
+                 logger.info(f"[{job_id}] Completed {fname}")
+
+             except Exception as e:
+                 logger.error(f"[{job_id}] Failed processing {fname}: {e}")
+                 raise
+
+         logger.info(f"[{job_id}] Ingestion complete for {len(files)} files")
+         return job_id
ingestion_python/services/maverick_captioner.py ADDED
@@ -0,0 +1,141 @@
+ import base64
+ import io
+ import os
+ from typing import Optional
+
+ import requests
+ from PIL import Image
+
+ from utils.logger import get_logger
+ try:
+     from utils.api.rotator import APIKeyRotator  # available in full repo
+ except Exception:  # standalone fallback
+     class APIKeyRotator:  # type: ignore
+         def __init__(self, prefix: str = "NVIDIA_API_", max_slots: int = 5):
+             self.keys = []
+             for i in range(1, max_slots + 1):
+                 k = os.getenv(f"{prefix}{i}")
+                 if k:
+                     self.keys.append(k)
+             if not self.keys:
+                 single = os.getenv(prefix.rstrip("_"))
+                 if single:
+                     self.keys.append(single)
+             self._idx = 0
+
+         def get_key(self) -> Optional[str]:
+             if not self.keys:
+                 return None
+             k = self.keys[self._idx % len(self.keys)]
+             self._idx += 1
+             return k
+
+
+ logger = get_logger("MAVERICK_CAPTIONER", __name__)
+
+
+ def _normalize_caption(text: str) -> str:
+     if not text:
+         return ""
+     t = text.strip()
+     # Remove common conversational openers and meta phrases
+     banned_prefixes = [
+         "sure,", "sure.", "sure", "here is", "here are", "this image", "the image", "image shows",
+         "the picture", "the photo", "the text describes", "the text describe", "it shows", "it depicts",
+         "caption:", "description:", "output:", "result:", "answer:", "analysis:", "observation:",
+     ]
+     t_lower = t.lower()
+     for p in banned_prefixes:
+         if t_lower.startswith(p):
+             t = t[len(p):].lstrip(" :-\u2014\u2013")
+             t_lower = t.lower()
+
+     # Strip surrounding quotes and markdown artifacts
+     t = t.strip().strip('"').strip("'").strip()
+     # Collapse whitespace
+     t = " ".join(t.split())
+     return t
+
+
+ class NvidiaMaverickCaptioner:
+     """Caption images using NVIDIA Integrate API (meta/llama-4-maverick-17b-128e-instruct)."""
+
+     def __init__(self, rotator: Optional[APIKeyRotator] = None, model: Optional[str] = None):
+         self.rotator = rotator or APIKeyRotator(prefix="NVIDIA_API_", max_slots=5)
+         self.model = model or os.getenv("NVIDIA_MAVERICK_MODEL", "meta/llama-4-maverick-17b-128e-instruct")
+         self.invoke_url = "https://integrate.api.nvidia.com/v1/chat/completions"
+
+     def _encode_image_jpeg_b64(self, image: Image.Image) -> str:
+         buf = io.BytesIO()
+         # Convert to RGB to ensure JPEG compatibility
+         image.convert("RGB").save(buf, format="JPEG", quality=90)
+         return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+     def caption_image(self, image: Image.Image) -> str:
+         try:
+             key = self.rotator.get_key()
+             if not key:
+                 logger.warning("NVIDIA API key not available; skipping image caption.")
+                 return ""
+
+             img_b64 = self._encode_image_jpeg_b64(image)
+
+             # Strict, non-conversational system prompt
+             system_prompt = (
+                 "You are an expert vision captioner. Produce a precise, information-dense caption of the image. "
+                 "Do not include conversational phrases, prefaces, meta commentary, or apologies. "
+                 "Avoid starting with phrases like 'The image/picture/photo shows' or 'Here is'. "
+                 "Write a single concise paragraph with concrete entities, text in the image, and notable details."
+             )
+
+             user_prompt = (
+                 "Caption this image at the finest level of detail. Include any visible text verbatim. "
+                 "Return only the caption text."
+             )
+
+             # Multimodal content format for NVIDIA Integrate API
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": user_prompt},
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": f"data:image/jpeg;base64,{img_b64}"
+                             }
+                         },
+                     ]
+                 },
+             ]
+
+             payload = {
+                 "model": self.model,
+                 "messages": messages,
+                 "max_tokens": 512,
+                 "temperature": 0.2,
+                 "top_p": 0.9,
+                 "frequency_penalty": 0.0,
+                 "presence_penalty": 0.0,
+                 "stream": False,
+             }
+
+             headers = {
+                 "Authorization": f"Bearer {key}",
+                 "Accept": "application/json",
+                 "Content-Type": "application/json",
+             }
+
+             resp = requests.post(self.invoke_url, headers=headers, json=payload, timeout=60)
+             if resp.status_code >= 400:
+                 logger.warning(f"Maverick caption API error {resp.status_code}: {resp.text[:200]}")
+                 return ""
+             data = resp.json()
+             text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+             return _normalize_caption(text)
+         except Exception as e:
+             logger.warning(f"Maverick caption failed: {e}")
+             return ""
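For debugging caption quality, the request `caption_image` builds can be replayed outside Python. A sketch of the equivalent raw call, assuming an API key in `NVIDIA_API_1` and a base64-encoded JPEG saved to `img.b64` (both placeholders; the payload fields mirror the code above):

```bash
# Replay the captioner's request with curl and print just the caption text.
curl -sS "https://integrate.api.nvidia.com/v1/chat/completions" \
  -H "Authorization: Bearer $NVIDIA_API_1" \
  -H "Accept: application/json" \
  -H "Content-Type: application/json" \
  -d @- <<EOF | jq -r '.choices[0].message.content'
{
  "model": "meta/llama-4-maverick-17b-128e-instruct",
  "messages": [
    {"role": "system", "content": "You are an expert vision captioner. Write a single concise, information-dense paragraph with no conversational preface."},
    {"role": "user", "content": [
      {"type": "text", "text": "Caption this image at the finest level of detail. Return only the caption text."},
      {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,$(cat img.b64)"}}
    ]}
  ],
  "max_tokens": 512,
  "temperature": 0.2,
  "top_p": 0.9,
  "stream": false
}
EOF
```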
ingestion_python/test_upload1.sh ADDED
@@ -0,0 +1,241 @@
+ #!/bin/bash
+
+ set -euo pipefail
+
+ echo "🚀 Testing Ingestion Pipeline Upload"
+ echo "======================================"
+
+ # Configuration
+ BACKEND_URL="https://binkhoale1812-studdybuddy-ingestion1.hf.space"
+ USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
+ PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"
+
+ # Test files (resolve relative to this script's directory)
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ FILE1="$SCRIPT_DIR/../exefiles/Lecture5_ML.pdf"
+ FILE2="$SCRIPT_DIR/../exefiles/Lecture6_ANN_DL.pdf"
+
+ # Debug toggles
+ DEBUG=${DEBUG:-0}
+ TRACE=${TRACE:-0}
+
+ echo "📋 Configuration:"
+ echo "  Backend URL: $BACKEND_URL"
+ echo "  User ID: $USER_ID"
+ echo "  Project ID: $PROJECT_ID"
+ echo "  Files: $FILE1, $FILE2"
+ echo ""
+
+ # Validate files and resolve absolute paths
+ if [ ! -f "$FILE1" ]; then
+   echo "❌ Missing file: $FILE1"; exit 26
+ fi
+ if [ ! -f "$FILE2" ]; then
+   echo "❌ Missing file: $FILE2"; exit 26
+ fi
+ FILE1_DIR="$(cd "$(dirname "$FILE1")" && pwd)"; FILE1_BASENAME="$(basename "$FILE1")"; FILE1="$FILE1_DIR/$FILE1_BASENAME"
+ FILE2_DIR="$(cd "$(dirname "$FILE2")" && pwd)"; FILE2_BASENAME="$(basename "$FILE2")"; FILE2="$FILE2_DIR/$FILE2_BASENAME"
+
+ curl_base() {
+   local method="$1"; shift
+   local url="$1"; shift
+   local extra=("$@")
+   local common=(
+     -L --http1.1 --fail-with-body -sS
+     --connect-timeout 60
+     --retry 5 --retry-delay 4 --retry-connrefused
+   )
+   if [ "$DEBUG" = "1" ]; then
+     common+=( -v )
+   fi
+   if [ "$TRACE" = "1" ]; then
+     common+=( --trace-time --trace-ascii - )
+   fi
+   curl -X "$method" "$url" "${common[@]}" "${extra[@]}"
+ }
+
+ json_with_status() {
+   local method="$1"; shift
+   local url="$1"; shift
+   local extra=("$@")
+   curl_base "$method" "$url" "${extra[@]}" \
+     -w "\nHTTP Status: %{http_code}\n"
+ }
+
+ # Step 0: Preflight (for browser parity)
+ echo "🛰️ Step 0: OPTIONS /upload (preflight parity)"
+ echo "---------------------------------------------"
+ json_with_status OPTIONS "$BACKEND_URL/upload" -H "Origin: https://example.com" -H "Access-Control-Request-Method: POST" || true
+ echo ""; echo ""
+
+ # Step 1: Health Check
+ echo "🏥 Step 1: Health Check"
+ echo "------------------------"
+ json_with_status GET "$BACKEND_URL/health" -H "Accept: application/json" || true
+ echo ""; echo ""
+
+ # Step 2: Upload first file
+ echo "📁 Step 2: Upload Files (sequential)"
+ echo "------------------------------------"
+ echo "Uploading $(basename "$FILE1")..."
+
+ UPLOAD_HEADERS=$(mktemp)
+ UPLOAD_BODY=$(mktemp)
+
+ set +e
+ HTTP_CODE=$(curl -L --http1.1 --fail-with-body -sS \
+   --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
+   -H "Expect:" \
+   -X POST "$BACKEND_URL/upload" \
+   -F "user_id=$USER_ID" \
+   -F "project_id=$PROJECT_ID" \
+   -F "files=@$FILE1" \
+   -D "$UPLOAD_HEADERS" -o "$UPLOAD_BODY" \
+   -w "%{http_code}")
+ RET=$?
+ set -e
+
+ echo "HTTP Status: $HTTP_CODE"
+ echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS" | sed 's/^/  /'
+ echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY"
+
+ if [ "$RET" -ne 0 ] || [ "$HTTP_CODE" = "000" ]; then
+   echo "❌ Upload failed (curl exit=$RET, http=$HTTP_CODE)"; exit 1
+ fi
+
+ # Extract job_id (prefer jq; fall back to python3 reading the saved body)
+ if command -v jq >/dev/null 2>&1; then
+   JOB_ID=$(jq -r '.job_id // empty' < "$UPLOAD_BODY")
+ else
+   JOB_ID=$(python3 -c "
+ import sys, json
+ try:
+     data = json.load(sys.stdin)
+     print(data.get('job_id', ''))
+ except Exception:
+     print('')
+ " < "$UPLOAD_BODY")
+ fi
+
+ if [ -z "${JOB_ID:-}" ]; then
+   echo "❌ Failed to extract job_id from upload response"; exit 1
+ fi
+
+ echo ""
+ echo "✅ Upload 1 initiated successfully!"
+ echo "  Job ID: $JOB_ID"
+ echo ""
+
+ # Step 3: Monitor Upload Progress
+ echo "📊 Step 3: Monitor Upload Progress"
+ echo "----------------------------------"
+
+ for i in {1..48}; do
+   echo "Checking progress (attempt $i/48)..."
+   json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json" | sed 's/^/  /'
+   STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json")
+   if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
+     echo "✅ Upload completed successfully!"; break
+   elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
+     echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+   else
+     echo "❌ Upload failed or unknown status"; break
+   fi
+   echo ""
+ done
+
+ echo ""
+
+ # Step 4: Upload second file after the first completes
+ echo "📁 Step 4: Upload second file"
+ echo "------------------------------"
+ echo "Uploading $(basename "$FILE2")..."
+
+ UPLOAD_HEADERS2=$(mktemp)
+ UPLOAD_BODY2=$(mktemp)
+
+ set +e
+ HTTP_CODE2=$(curl -L --http1.1 --fail-with-body -sS \
+   --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
+   -H "Expect:" \
+   -X POST "$BACKEND_URL/upload" \
+   -F "user_id=$USER_ID" \
+   -F "project_id=$PROJECT_ID" \
+   -F "files=@$FILE2" \
+   -D "$UPLOAD_HEADERS2" -o "$UPLOAD_BODY2" \
+   -w "%{http_code}")
+ RET2=$?
+ set -e
+
+ echo "HTTP Status: $HTTP_CODE2"
+ echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS2" | sed 's/^/  /'
+ echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY2"
+
+ if [ "$RET2" -ne 0 ] || [ "$HTTP_CODE2" = "000" ]; then
+   echo "❌ Upload 2 failed (curl exit=$RET2, http=$HTTP_CODE2)"; exit 1
+ fi
+
+ # Extract job_id2
+ if command -v jq >/dev/null 2>&1; then
+   JOB_ID2=$(jq -r '.job_id // empty' < "$UPLOAD_BODY2")
+ else
+   JOB_ID2=$(python3 -c "
+ import sys, json
+ try:
+     data = json.load(sys.stdin)
+     print(data.get('job_id', ''))
+ except Exception:
+     print('')
+ " < "$UPLOAD_BODY2")
+ fi
+
+ if [ -z "${JOB_ID2:-}" ]; then
+   echo "❌ Failed to extract job_id from second upload response"; exit 1
+ fi
+
+ echo ""
+ echo "✅ Upload 2 initiated successfully!"
+ echo "  Job ID: $JOB_ID2"
+ echo ""
+
+ # Step 5: Monitor Upload 2 Progress
+ echo "📊 Step 5: Monitor Upload 2 Progress"
+ echo "-------------------------------------"
+ for i in {1..48}; do
+   echo "Checking progress (attempt $i/48)..."
+   json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json" | sed 's/^/  /'
+   STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json")
+   if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
+     echo "✅ Upload 2 completed successfully!"; break
+   elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
+     echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+   else
+     echo "❌ Upload 2 failed or unknown status"; break
+   fi
+   echo ""
+ done
+
+ echo ""
+
+ # Step 6: List Uploaded Files
+ echo "📋 Step 6: List Uploaded Files"
+ echo "-------------------------------"
+ json_with_status GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | sed 's/^/  /'
+ echo ""; echo ""
+
+ # Step 7: Get File Chunks (for Lecture5_ML.pdf)
+ echo "🔍 Step 7: Get File Chunks for Lecture5_ML.pdf"
+ echo "----------------------------------------------"
+ json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Lecture5_ML.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'
+ echo ""; echo ""
+
+ # Step 8: Get File Chunks (for Lecture6_ANN_DL.pdf)
+ echo "🔍 Step 8: Get File Chunks for Lecture6_ANN_DL.pdf"
+ echo "------------------------------------------------"
+ json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Lecture6_ANN_DL.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'
+
+ echo ""
+ echo "🎉 Test completed!"
+ echo "=================="
ingestion_python/test_upload2.sh ADDED
@@ -0,0 +1,238 @@
+ #!/bin/bash
+
+ set -euo pipefail
+
+ echo "🚀 Testing Ingestion Pipeline Upload"
+ echo "======================================"
+
+ # Configuration
+ BACKEND_URL="https://binkhoale1812-studdybuddy-ingestion2.hf.space"
+ USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
+ PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"
+
+ # Test files (resolve relative to this script's directory)
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ FILE1="$SCRIPT_DIR/../exefiles/Lecture7_GA_EC.pdf"
+ FILE2="$SCRIPT_DIR/../exefiles/Tut7.pdf"
+
+ # Debug toggles
+ DEBUG=${DEBUG:-0}
+ TRACE=${TRACE:-0}
+
+ echo "📋 Configuration:"
+ echo "  Backend URL: $BACKEND_URL"
+ echo "  User ID: $USER_ID"
+ echo "  Project ID: $PROJECT_ID"
+ echo "  Files: $FILE1, $FILE2"
+ echo ""
+
+ # Validate files and resolve absolute paths
+ if [ ! -f "$FILE1" ]; then
+   echo "❌ Missing file: $FILE1"; exit 26
+ fi
+ if [ ! -f "$FILE2" ]; then
+   echo "❌ Missing file: $FILE2"; exit 26
+ fi
+ FILE1_DIR="$(cd "$(dirname "$FILE1")" && pwd)"; FILE1_BASENAME="$(basename "$FILE1")"; FILE1="$FILE1_DIR/$FILE1_BASENAME"
+ FILE2_DIR="$(cd "$(dirname "$FILE2")" && pwd)"; FILE2_BASENAME="$(basename "$FILE2")"; FILE2="$FILE2_DIR/$FILE2_BASENAME"
+
+ curl_base() {
+   local method="$1"; shift
+   local url="$1"; shift
+   local extra=("$@")
+   local common=(
+     -L --http1.1 --fail-with-body -sS
+     --connect-timeout 60
+     --retry 5 --retry-delay 4 --retry-connrefused
+   )
+   if [ "$DEBUG" = "1" ]; then
+     common+=( -v )
+   fi
+   if [ "$TRACE" = "1" ]; then
+     common+=( --trace-time --trace-ascii - )
+   fi
+   curl -X "$method" "$url" "${common[@]}" "${extra[@]}"
+ }
+
+ json_with_status() {
+   local method="$1"; shift
+   local url="$1"; shift
+   local extra=("$@")
+   curl_base "$method" "$url" "${extra[@]}" \
+     -w "\nHTTP Status: %{http_code}\n"
+ }
+
+ # Step 0: Preflight (for browser parity)
+ echo "🛰️ Step 0: OPTIONS /upload (preflight parity)"
+ echo "---------------------------------------------"
+ json_with_status OPTIONS "$BACKEND_URL/upload" -H "Origin: https://example.com" -H "Access-Control-Request-Method: POST" || true
+ echo ""; echo ""
+
+ # Step 1: Health Check
+ echo "🏥 Step 1: Health Check"
+ echo "------------------------"
+ json_with_status GET "$BACKEND_URL/health" -H "Accept: application/json" || true
+ echo ""; echo ""
+
+ # Step 2: Upload first file
+ echo "📁 Step 2: Upload Files (sequential)"
+ echo "------------------------------------"
+ echo "Uploading $(basename "$FILE1")..."
+
+ UPLOAD_HEADERS=$(mktemp)
+ UPLOAD_BODY=$(mktemp)
+
+ set +e
+ HTTP_CODE=$(curl -L --http1.1 --fail-with-body -sS \
+   --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
+   -H "Expect:" \
+   -X POST "$BACKEND_URL/upload" \
+   -F "user_id=$USER_ID" \
+   -F "project_id=$PROJECT_ID" \
+   -F "files=@$FILE1" \
+   -D "$UPLOAD_HEADERS" -o "$UPLOAD_BODY" \
+   -w "%{http_code}")
+ RET=$?
+ set -e
+
+ echo "HTTP Status: $HTTP_CODE"
+ echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS" | sed 's/^/  /'
+ echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY"
+
+ if [ "$RET" -ne 0 ] || [ "$HTTP_CODE" = "000" ]; then
+   echo "❌ Upload failed (curl exit=$RET, http=$HTTP_CODE)"; exit 1
+ fi
+
+ # Extract job_id (prefer jq; fall back to python3 reading the saved body)
+ if command -v jq >/dev/null 2>&1; then
+   JOB_ID=$(jq -r '.job_id // empty' < "$UPLOAD_BODY")
+ else
+   JOB_ID=$(python3 -c "
+ import sys, json
+ try:
+     data = json.load(sys.stdin)
+     print(data.get('job_id', ''))
+ except Exception:
+     print('')
+ " < "$UPLOAD_BODY")
+ fi
+
+ if [ -z "${JOB_ID:-}" ]; then
+   echo "❌ Failed to extract job_id from upload response"; exit 1
+ fi
+
+ echo ""
+ echo "✅ Upload 1 initiated successfully!"
+ echo "  Job ID: $JOB_ID"
+ echo ""
+
+ # Step 3: Monitor Upload Progress
+ echo "📊 Step 3: Monitor Upload Progress"
+ echo "----------------------------------"
+
+ for i in {1..48}; do
+   echo "Checking progress (attempt $i/48)..."
+   json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json" | sed 's/^/  /'
+   STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json")
+   if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
+     echo "✅ Upload completed successfully!"; break
+   elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
+     echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+   else
+     echo "❌ Upload failed or unknown status"; break
+   fi
+   echo ""
+ done
+
+ echo ""
+
+ # Step 4: Upload second file after the first completes
+ echo "📁 Step 4: Upload second file"
+ echo "------------------------------"
+ echo "Uploading $(basename "$FILE2")..."
+
+ UPLOAD_HEADERS2=$(mktemp)
+ UPLOAD_BODY2=$(mktemp)
+
+ set +e
+ HTTP_CODE2=$(curl -L --http1.1 --fail-with-body -sS \
+   --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
+   -H "Expect:" \
+   -X POST "$BACKEND_URL/upload" \
+   -F "user_id=$USER_ID" \
+   -F "project_id=$PROJECT_ID" \
+   -F "files=@$FILE2" \
+   -D "$UPLOAD_HEADERS2" -o "$UPLOAD_BODY2" \
+   -w "%{http_code}")
+ RET2=$?
+ set -e
+
+ echo "HTTP Status: $HTTP_CODE2"
+ echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS2" | sed 's/^/  /'
+ echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY2"
+
+ if [ "$RET2" -ne 0 ] || [ "$HTTP_CODE2" = "000" ]; then
+   echo "❌ Upload 2 failed (curl exit=$RET2, http=$HTTP_CODE2)"; exit 1
+ fi
+
+ if command -v jq >/dev/null 2>&1; then
+   JOB_ID2=$(jq -r '.job_id // empty' < "$UPLOAD_BODY2")
+ else
+   JOB_ID2=$(python3 -c "
+ import sys, json
+ try:
+     data = json.load(sys.stdin)
+     print(data.get('job_id', ''))
+ except Exception:
+     print('')
+ " < "$UPLOAD_BODY2")
+ fi
+
+ if [ -z "${JOB_ID2:-}" ]; then
+   echo "❌ Failed to extract job_id from second upload response"; exit 1
+ fi
+
+ echo ""
+ echo "✅ Upload 2 initiated successfully!"
+ echo "  Job ID: $JOB_ID2"
+ echo ""
+
+ # Step 5: Monitor Upload 2 Progress
+ echo "📊 Step 5: Monitor Upload 2 Progress"
+ echo "-------------------------------------"
+ for i in {1..48}; do
+   echo "Checking progress (attempt $i/48)..."
+   json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json" | sed 's/^/  /'
+   STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json")
+   if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
+     echo "✅ Upload 2 completed successfully!"; break
+   elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
+     echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+   else
+     echo "❌ Upload 2 failed or unknown status"; break
+   fi
+   echo ""
+ done
+
+ echo ""
+
+ # Step 6: List Uploaded Files
+ echo "📋 Step 6: List Uploaded Files"
+ echo "-------------------------------"
+ json_with_status GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | sed 's/^/  /'
+ echo ""; echo ""
+
+ # Step 7: Get File Chunks (for Lecture7_GA_EC.pdf)
+ echo "🔍 Step 7: Get File Chunks for Lecture7_GA_EC.pdf"
+ echo "----------------------------------------------"
+ json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Lecture7_GA_EC.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'
+ echo ""; echo ""
+
+ # Step 8: Get File Chunks (for Tut7.pdf)
+ echo "🔍 Step 8: Get File Chunks for Tut7.pdf"
+ echo "------------------------------------------------"
+ json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Tut7.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'
+
+ echo ""
+ echo "🎉 Test completed!"
+ echo "=================="
ingestion_python/test_upload3.sh ADDED
@@ -0,0 +1,227 @@
1
+ #!/bin/bash
2
+
3
+ set -euo pipefail
4
+
5
+ echo "🚀 Testing Ingestion Pipeline Upload"
6
+ echo "======================================"
7
+
8
+ # Configuration
9
+ BACKEND_URL="https://binkhoale1812-studdybuddy-ingestion3.hf.space"
10
+ USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
11
+ PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"
12
+
13
+ # Test files
14
+ FILE1="../exefiles/Lecture8_PSO_ACO.pdf"
15
+ FILE2="../exefiles/Tut8.pdf"
16
+
17
+ # Debug toggles
18
+ DEBUG=${DEBUG:-0}
19
+ TRACE=${TRACE:-0}
20
+
21
+ echo "📋 Configuration:"
22
+ echo " Backend URL: $BACKEND_URL"
23
+ echo " User ID: $USER_ID"
24
+ echo " Project ID: $PROJECT_ID"
25
+ echo " Files: $FILE1, $FILE2"
26
+ echo ""
27
+
28
+ curl_base() {
29
+ local method="$1"; shift
30
+ local url="$1"; shift
31
+ local extra=("$@")
32
+ local common=(
33
+ -L --http1.1 --fail-with-body -sS
34
+ --connect-timeout 60
35
+ --retry 5 --retry-delay 4 --retry-connrefused
36
+ )
37
+ if [ "$DEBUG" = "1" ]; then
38
+ common+=( -v )
39
+ fi
40
+ if [ "$TRACE" = "1" ]; then
41
+ common+=( --trace-time --trace-ascii - )
42
+ fi
43
+ curl -X "$method" "$url" "${common[@]}" "${extra[@]}"
44
+ }
45
+
46
+ json_with_status() {
47
+ local method="$1"; shift
48
+ local url="$1"; shift
49
+ local extra=("$@")
50
+ curl_base "$method" "$url" "${extra[@]}" \
51
+ -w "\nHTTP Status: %{http_code}\n"
52
+ }
53
+
54
+ # Step 0: Preflight (for browser parity)
55
+ echo "🛰️ Step 0: OPTIONS /upload (preflight parity)"
56
+ echo "---------------------------------------------"
57
+ json_with_status OPTIONS "$BACKEND_URL/upload" -H "Origin: https://example.com" -H "Access-Control-Request-Method: POST" || true
58
+ echo ""; echo ""
59
+
60
+ # Step 1: Health Check
61
+ echo "🏥 Step 1: Health Check"
62
+ echo "------------------------"
63
+ json_with_status GET "$BACKEND_URL/health" -H "Accept: application/json" || true
64
+ echo ""; echo ""
65
+
66
+ # Step 2: Upload Files
67
+ echo "📁 Step 2: Upload Files (sequential)"
68
+ echo "------------------------------------"
69
+ echo "Uploading $(basename "$FILE1")..."
70
+
71
+ UPLOAD_HEADERS=$(mktemp)
72
+ UPLOAD_BODY=$(mktemp)
73
+
74
+ set +e
75
+ HTTP_CODE=$(curl -L --http1.1 --fail-with-body -sS \
76
+ --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
77
+ -H "Expect:" \
78
+ -X POST "$BACKEND_URL/upload" \
79
+ -F "user_id=$USER_ID" \
80
+ -F "project_id=$PROJECT_ID" \
81
+ -F "files=@$FILE1" \
82
+ -D "$UPLOAD_HEADERS" -o "$UPLOAD_BODY" \
83
+ -w "%{http_code}")
84
+ RET=$?
85
+ set -e
86
+
87
+ echo "HTTP Status: $HTTP_CODE"
88
+ echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS" | sed 's/^/ /'
89
+ echo "--- Response Body ---"; sed 's/^/ /' "$UPLOAD_BODY"
90
+
91
+ if [ "$RET" -ne 0 ] || [ "$HTTP_CODE" = "000" ]; then
92
+ echo "❌ Upload failed (curl exit=$RET, http=$HTTP_CODE)"; exit 1
93
+ fi
94
+
95
+ # Extract job_id (prefer jq)
96
+ if command -v jq >/dev/null 2>&1; then
97
+ JOB_ID=$(jq -r '.job_id // empty' < "$UPLOAD_BODY")
98
+ else
99
+ JOB_ID=$(python3 - "$UPLOAD_BODY" <<'PY'
100
+ import sys, json
101
+ try:
102
+     data = json.load(open(sys.argv[1]))
103
+     print(data.get('job_id',''))
104
+ except Exception:
105
+     print('')
106
+ PY
107
+ )
108
+ fi
109
+
110
+ if [ -z "${JOB_ID:-}" ]; then
111
+ echo "❌ Failed to extract job_id from upload response"; exit 1
112
+ fi
113
+
114
+ echo ""
115
+ echo "✅ Upload 1 initiated successfully!"
116
+ echo " Job ID: $JOB_ID"
117
+ echo ""
118
+
119
+ # Step 3: Monitor Upload Progress
120
+ echo "📊 Step 3: Monitor Upload Progress"
121
+ echo "----------------------------------"
122
+
123
+ for i in {1..48}; do
124
+ echo "Checking progress (attempt $i/12)..."
125
+ STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json")
126
+ echo "$STATUS_LINE" | sed 's/^/ /'
127
+ if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
128
+ echo "✅ Upload completed successfully!"; break
129
+ elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
130
+ echo "⏳ Still processing... waiting 20 seconds"; sleep 20
131
+ else
132
+ echo "❌ Upload failed or unknown status"; break
133
+ fi
134
+ echo ""
135
+ done
136
+
137
+ echo ""
138
+
139
+ echo "📁 Step 3: Upload second file"
140
+ echo "------------------------------"
141
+ echo "Uploading $(basename "$FILE2")..."
142
+
143
+ UPLOAD_HEADERS2=$(mktemp)
144
+ UPLOAD_BODY2=$(mktemp)
145
+
146
+ set +e
147
+ HTTP_CODE2=$(curl -L --http1.1 --fail-with-body -sS \
148
+ --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
149
+ -H "Expect:" \
150
+ -X POST "$BACKEND_URL/upload" \
151
+ -F "user_id=$USER_ID" \
152
+ -F "project_id=$PROJECT_ID" \
153
+ -F "files=@$FILE2" \
154
+ -D "$UPLOAD_HEADERS2" -o "$UPLOAD_BODY2" \
155
+ -w "%{http_code}")
156
+ RET2=$?
157
+ set -e
158
+
159
+ echo "HTTP Status: $HTTP_CODE2"
160
+ echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS2" | sed 's/^/ /'
161
+ echo "--- Response Body ---"; sed 's/^/ /' "$UPLOAD_BODY2"
162
+
163
+ if [ "$RET2" -ne 0 ] || [ "$HTTP_CODE2" = "000" ]; then
164
+ echo "❌ Upload 2 failed (curl exit=$RET2, http=$HTTP_CODE2)"; exit 1
165
+ fi
166
+
167
+ if command -v jq >/dev/null 2>&1; then
168
+ JOB_ID2=$(jq -r '.job_id // empty' < "$UPLOAD_BODY2")
169
+ else
170
+ JOB_ID2=$(python3 - "$UPLOAD_BODY2" <<'PY'
171
+ import sys, json
172
+ try:
173
+     data = json.load(open(sys.argv[1]))
174
+     print(data.get('job_id',''))
175
+ except Exception:
176
+     print('')
177
+ PY
178
+ )
179
+ fi
180
+
181
+ if [ -z "${JOB_ID2:-}" ]; then
182
+ echo "❌ Failed to extract job_id from second upload response"; exit 1
183
+ fi
184
+
185
+ echo ""
186
+ echo "✅ Upload 2 initiated successfully!"
187
+ echo " Job ID: $JOB_ID2"
188
+ echo ""
189
+
190
+ echo "📊 Step 4: Monitor Upload 2 Progress"
191
+ echo "-------------------------------------"
192
+ for i in {1..48}; do
193
+ echo "Checking progress (attempt $i/48)..."
194
+ STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json")
194
+ echo "$STATUS_LINE" | sed 's/^/ /'
196
+ if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
197
+ echo "✅ Upload 2 completed successfully!"; break
198
+ elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
199
+ echo "⏳ Still processing... waiting 20 seconds"; sleep 20
200
+ else
201
+ echo "❌ Upload 2 failed or unknown status"; break
202
+ fi
203
+ echo ""
204
+ done
205
+
206
+ echo ""
207
+
208
+ # Step 6: List Uploaded Files
209
+ echo "📋 Step 6: List Uploaded Files"
210
+ echo "-------------------------------"
211
+ json_with_status GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | sed 's/^/ /'
212
+ echo ""; echo ""
213
+
214
+ # Step 7: Get File Chunks (for Lecture8_PSO_ACO.pdf)
215
+ echo "🔍 Step 7: Get File Chunks for Lecture8_PSO_ACO.pdf"
216
+ echo "----------------------------------------------"
217
+ json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Lecture8_PSO_ACO.pdf&limit=5" -H "Accept: application/json" | sed 's/^/ /'
218
+ echo ""; echo ""
219
+
220
+ # Step 8: Get File Chunks (for Tut8.pdf)
221
+ echo "🔍 Step 8: Get File Chunks for Tut8.pdf"
222
+ echo "------------------------------------------------"
223
+ json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Tut8.pdf&limit=5" -H "Accept: application/json" | sed 's/^/ /'
224
+
225
+ echo ""
226
+ echo "🎉 Test completed!"
227
+ echo "=================="
ingestion_python/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+
2
+
ingestion_python/utils/api/rotator.py ADDED
@@ -0,0 +1,67 @@
1
+ # ────────────────────────────── utils/rotator.py ──────────────────────────────
2
+ import os
3
+ import itertools
4
+ from ..logger import get_logger
5
+ from typing import Optional
6
+
7
+ import httpx
8
+
9
+ logger = get_logger("ROTATOR", __name__)
10
+
11
+
12
+ class APIKeyRotator:
13
+ """
14
+ Round-robin API key rotator.
15
+ - Loads keys from env vars with given prefix (e.g., GEMINI_API_1..6)
16
+ - get_key() returns current key
17
+ - rotate() moves to next key
18
+ - on HTTP 401/429/5xx you should call rotate() and retry (bounded)
19
+ """
20
+ def __init__(self, prefix: str, max_slots: int = 6):
21
+ self.keys = []
22
+ for i in range(1, max_slots + 1):
23
+ v = os.getenv(f"{prefix}{i}")
24
+ if v:
25
+ self.keys.append(v.strip())
26
+ if not self.keys:
27
+ logger.warning(f"No API keys found for prefix {prefix}. Calls will likely fail.")
28
+ self._cycle = itertools.cycle([""])
29
+ else:
30
+ self._cycle = itertools.cycle(self.keys)
31
+ self.current = next(self._cycle)
32
+
33
+ def get_key(self) -> Optional[str]:
34
+ return self.current
35
+
36
+ def rotate(self) -> Optional[str]:
37
+ self.current = next(self._cycle)
38
+ logger.info("Rotated API key.")
39
+ return self.current
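+ # Usage sketch (hypothetical env with GEMINI_API_1..GEMINI_API_3 set):
+ # rotator = APIKeyRotator(prefix="GEMINI_API_", max_slots=6)
+ # key = rotator.get_key()  # current key
+ # rotator.rotate()         # advance after a 401/429/5xx response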
40
+
41
+
42
+ async def robust_post_json(url: str, headers: dict, payload: dict, rotator: APIKeyRotator, max_retries: int = 6):
43
+ """
44
+ POST JSON with simple retry+rotate on 401/403/429/5xx.
45
+ Returns json response.
46
+ """
47
+ for attempt in range(max_retries):
48
+ try:
49
+ async with httpx.AsyncClient(timeout=60) as client:
50
+ r = await client.post(url, headers=headers, json=payload)
51
+ logger.info(f"[ROTATOR] HTTP {r.status_code} response from {url}")
52
+
53
+ if r.status_code in (401, 403, 429) or (500 <= r.status_code < 600):
54
+ logger.warning(f"HTTP {r.status_code} from provider. Rotating key and retrying ({attempt+1}/{max_retries})")
55
+ logger.warning(f"Response body: {r.text}")
56
+ rotator.rotate()
57
+ continue
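+ # NOTE: url and headers were built by the caller with the previous key, so this
+ # in-loop retry reuses them as-is; the rotated key only takes effect once the
+ # caller re-reads rotator.get_key() for its next request.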
58
+ r.raise_for_status()
59
+
60
+ response_data = r.json()
61
+ logger.info(f"[ROTATOR] Successfully parsed JSON response with keys: {list(response_data.keys()) if isinstance(response_data, dict) else 'Not a dict'}")
62
+ return response_data
63
+ except Exception as e:
64
+ logger.warning(f"Request error: {e}. Rotating and retrying ({attempt+1}/{max_retries})")
65
+ logger.warning(f"Request details - URL: {url}, Headers: {headers}")
66
+ rotator.rotate()
67
+ raise RuntimeError("Provider request failed after retries.")
ingestion_python/utils/api/router.py ADDED
@@ -0,0 +1,359 @@
1
+ # ────────────────────────────── utils/router.py ──────────────────────────────
2
+ import os
3
+ from ..logger import get_logger
4
+ from typing import Dict, Any, Optional
5
+ from .rotator import robust_post_json, APIKeyRotator
6
+
7
+ logger = get_logger("ROUTER", __name__)
8
+
9
+ # Default model names (can be overridden via env)
10
+ GEMINI_SMALL = os.getenv("GEMINI_SMALL", "gemini-2.5-flash-lite")
11
+ GEMINI_MED = os.getenv("GEMINI_MED", "gemini-2.5-flash")
12
+ GEMINI_PRO = os.getenv("GEMINI_PRO", "gemini-2.5-pro")
13
+
14
+ # NVIDIA model hierarchy (can be overridden via env)
15
+ NVIDIA_SMALL = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct") # Llama model for easy complexity tasks
16
+ NVIDIA_MEDIUM = os.getenv("NVIDIA_MEDIUM", "qwen/qwen3-next-80b-a3b-thinking") # Qwen model for reasoning tasks
17
+ NVIDIA_LARGE = os.getenv("NVIDIA_LARGE", "openai/gpt-oss-120b") # GPT-OSS model for hard/long context tasks
18
+
19
+ def select_model(question: str, context: str) -> Dict[str, Any]:
20
+ """
21
+ Enhanced three-tier model selection system:
22
+ - Easy tasks (immediate execution, simple) -> Llama (NVIDIA small)
23
+ - Reasoning tasks (analysis, decision-making, JSON parsing) -> Qwen (NVIDIA medium)
24
+ - Hard/long context tasks (complex synthesis, long-form) -> GPT-OSS (NVIDIA large)
25
+ - Very complex tasks (research, comprehensive analysis) -> Gemini Pro
26
+ """
27
+ qlen = len(question.split())
28
+ clen = len(context.split())
29
+
30
+ # Very hard task keywords - require Gemini Pro (research, comprehensive analysis)
31
+ very_hard_keywords = ("prove", "derivation", "complexity", "algorithm", "optimize", "theorem", "rigorous", "step-by-step", "policy critique", "ambiguity", "counterfactual", "comprehensive", "detailed analysis", "synthesis", "evaluation", "research", "investigation", "comprehensive study")
32
+
33
+ # Hard/long context keywords - require NVIDIA Large (GPT-OSS)
34
+ hard_keywords = ("analyze", "explain", "compare", "evaluate", "summarize", "extract", "classify", "identify", "describe", "discuss", "synthesis", "consolidate", "process", "generate", "create", "develop", "build", "construct")
35
+
36
+ # Reasoning task keywords - require Qwen (thinking/reasoning)
37
+ reasoning_keywords = ("reasoning", "context", "enhance", "select", "decide", "choose", "determine", "assess", "judge", "consider", "think", "reason", "logic", "inference", "deduction", "analysis", "interpretation")
38
+
39
+ # Simple task keywords - immediate execution
40
+ simple_keywords = ("what", "how", "when", "where", "who", "yes", "no", "count", "list", "find", "search", "lookup")
41
+
42
+ # Determine complexity level
43
+ is_very_hard = (
44
+ any(k in question.lower() for k in very_hard_keywords) or
45
+ qlen > 120 or
46
+ clen > 4000 or
47
+ "comprehensive" in question.lower() or
48
+ "detailed" in question.lower() or
49
+ "research" in question.lower()
50
+ )
51
+
52
+ is_hard = (
53
+ any(k in question.lower() for k in hard_keywords) or
54
+ qlen > 50 or
55
+ clen > 1500 or
56
+ "synthesis" in question.lower() or
57
+ "generate" in question.lower() or
58
+ "create" in question.lower()
59
+ )
60
+
61
+ is_reasoning = (
62
+ any(k in question.lower() for k in reasoning_keywords) or
63
+ qlen > 20 or
64
+ clen > 800 or
65
+ "enhance" in question.lower() or
66
+ "context" in question.lower() or
67
+ "select" in question.lower() or
68
+ "decide" in question.lower()
69
+ )
70
+
71
+ is_simple = (
72
+ any(k in question.lower() for k in simple_keywords) or
73
+ qlen <= 10 or
74
+ clen <= 200
75
+ )
76
+
77
+ if is_very_hard:
78
+ # Use Gemini Pro for very complex tasks requiring advanced reasoning
79
+ return {"provider": "gemini", "model": GEMINI_PRO}
80
+ elif is_hard:
81
+ # Use NVIDIA Large (GPT-OSS) for hard/long context tasks
82
+ return {"provider": "nvidia_large", "model": NVIDIA_LARGE}
83
+ elif is_reasoning:
84
+ # Use Qwen for reasoning tasks requiring thinking
85
+ return {"provider": "qwen", "model": NVIDIA_MEDIUM}
86
+ else:
87
+ # Use NVIDIA small (Llama) for simple tasks requiring immediate execution
88
+ return {"provider": "nvidia", "model": NVIDIA_SMALL}
89
+
90
+
91
+ async def generate_answer_with_model(selection: Dict[str, Any], system_prompt: str, user_prompt: str,
92
+ gemini_rotator: APIKeyRotator, nvidia_rotator: APIKeyRotator,
93
+ user_id: Optional[str] = None, context: str = "") -> str:
94
+ provider = selection["provider"]
95
+ model = selection["model"]
96
+
97
+ if provider == "gemini":
98
+ # Try Gemini first
99
+ try:
100
+ key = gemini_rotator.get_key() or ""
101
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={key}"
102
+ payload = {
103
+ "contents": [
104
+ {"role": "user", "parts": [{"text": f"{system_prompt}\n\n{user_prompt}"}]}
105
+ ],
106
+ "generationConfig": {"temperature": 0.2}
107
+ }
108
+ headers = {"Content-Type": "application/json"}
109
+ data = await robust_post_json(url, headers, payload, gemini_rotator)
110
+
111
+ content = data["candidates"][0]["content"]["parts"][0]["text"]
112
+ if not content or content.strip() == "":
113
+ logger.warning(f"Empty content from Gemini model: {data}")
114
+ raise Exception("Empty content from Gemini")
115
+ return content
116
+ except Exception as e:
117
+ logger.warning(f"Gemini model {model} failed: {e}. Attempting fallback...")
118
+
119
+ # Fallback logic: GEMINI_PRO/MED → NVIDIA_LARGE, GEMINI_SMALL → NVIDIA_SMALL
120
+ if model in [GEMINI_PRO, GEMINI_MED]:
121
+ logger.info(f"Falling back from {model} to NVIDIA_LARGE")
122
+ fallback_selection = {"provider": "nvidia_large", "model": NVIDIA_LARGE}
123
+ return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
124
+ elif model == GEMINI_SMALL:
125
+ logger.info(f"Falling back from {model} to NVIDIA_SMALL")
126
+ fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
127
+ return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
128
+ else:
129
+ logger.error(f"No fallback defined for Gemini model: {model}")
130
+ return "I couldn't parse the model response."
131
+
132
+ elif provider == "nvidia":
133
+ # Try NVIDIA small model first
134
+ try:
135
+ key = nvidia_rotator.get_key() or ""
136
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
137
+ payload = {
138
+ "model": model,
139
+ "temperature": 0.2,
140
+ "messages": [
141
+ {"role": "system", "content": system_prompt},
142
+ {"role": "user", "content": user_prompt},
143
+ ]
144
+ }
145
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
146
+
147
+ logger.info(f"[ROUTER] NVIDIA API call - Model: {model}, Key present: {bool(key)}")
148
+ logger.info(f"[ROUTER] System prompt length: {len(system_prompt)}, User prompt length: {len(user_prompt)}")
149
+
150
+ data = await robust_post_json(url, headers, payload, nvidia_rotator)
151
+
152
+ logger.info(f"[ROUTER] NVIDIA API response type: {type(data)}, keys: {list(data.keys()) if isinstance(data, dict) else 'Not a dict'}")
153
+ content = data["choices"][0]["message"]["content"]
154
+ if not content or content.strip() == "":
155
+ logger.warning(f"Empty content from NVIDIA model: {data}")
156
+ raise Exception("Empty content from NVIDIA")
157
+ return content
158
+ except Exception as e:
159
+ logger.warning(f"NVIDIA model {model} failed: {e}. Attempting fallback...")
160
+
161
+ # Fallback: NVIDIA_SMALL → Try a different NVIDIA model or basic response
162
+ if model == NVIDIA_SMALL:
163
+ logger.info(f"Falling back from {model} to basic response")
164
+ return "I'm experiencing technical difficulties with the AI model. Please try again later."
165
+ else:
166
+ logger.error(f"No fallback defined for NVIDIA model: {model}")
167
+ return "I couldn't parse the model response."
168
+
169
+ elif provider == "qwen":
170
+ # Use Qwen for reasoning tasks with fallback
171
+ try:
172
+ return await qwen_chat_completion(system_prompt, user_prompt, nvidia_rotator, user_id, context)
173
+ except Exception as e:
174
+ logger.warning(f"Qwen model failed: {e}. Attempting fallback...")
175
+ # Fallback: Qwen → NVIDIA_SMALL
176
+ logger.info("Falling back from Qwen to NVIDIA_SMALL")
177
+ fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
178
+ return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
179
+ elif provider == "nvidia_large":
180
+ # Use NVIDIA Large (GPT-OSS) for hard/long context tasks with fallback
181
+ try:
182
+ return await nvidia_large_chat_completion(system_prompt, user_prompt, nvidia_rotator, user_id, context)
183
+ except Exception as e:
184
+ logger.warning(f"NVIDIA_LARGE model failed: {e}. Attempting fallback...")
185
+ # Fallback: NVIDIA_LARGE → NVIDIA_SMALL
186
+ logger.info("Falling back from NVIDIA_LARGE to NVIDIA_SMALL")
187
+ fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
188
+ return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
189
+ elif provider == "nvidia_coder":
190
+ # Use NVIDIA Coder for code generation tasks with fallback
191
+ try:
192
+ from helpers.coder import nvidia_coder_completion
193
+ return await nvidia_coder_completion(system_prompt, user_prompt, nvidia_rotator, user_id, context)
194
+ except Exception as e:
195
+ logger.warning(f"NVIDIA_CODER model failed: {e}. Attempting fallback...")
196
+ # Fallback: NVIDIA_CODER → NVIDIA_SMALL
197
+ logger.info("Falling back from NVIDIA_CODER to NVIDIA_SMALL")
198
+ fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
199
+ return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
200
+
201
+ return "Unsupported provider."
202
+
203
+
204
+ async def qwen_chat_completion(system_prompt: str, user_prompt: str, nvidia_rotator: APIKeyRotator, user_id: Optional[str] = None, context: str = "") -> str:
205
+ """
206
+ Qwen chat completion with thinking mode enabled.
207
+ Uses the NVIDIA API rotator for key management.
208
+ """
209
+
210
+ key = nvidia_rotator.get_key() or ""
211
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
212
+
213
+ payload = {
214
+ "model": NVIDIA_MEDIUM,
215
+ "messages": [
216
+ {"role": "system", "content": system_prompt},
217
+ {"role": "user", "content": user_prompt}
218
+ ],
219
+ "temperature": 0.6,
220
+ "top_p": 0.7,
221
+ "max_tokens": 8192,
222
+ "stream": True
223
+ }
224
+
225
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
226
+
227
+ logger.info(f"[QWEN] API call - Model: {NVIDIA_MEDIUM}, Key present: {bool(key)}")
228
+ logger.info(f"[QWEN] System prompt length: {len(system_prompt)}, User prompt length: {len(user_prompt)}")
229
+
230
+ try:
231
+ # stream=True is requested, but client.post() buffers the full response here; aiter_lines() below replays the buffered SSE lines
232
+ import httpx
233
+ async with httpx.AsyncClient(timeout=60) as client:
234
+ response = await client.post(url, headers=headers, json=payload)
235
+
236
+ if response.status_code in (401, 403, 429) or (500 <= response.status_code < 600):
237
+ logger.warning(f"HTTP {response.status_code} from Qwen provider. Rotating key and retrying")
238
+ nvidia_rotator.rotate()
239
+ # Retry once with new key
240
+ key = nvidia_rotator.get_key() or ""
241
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
242
+ response = await client.post(url, headers=headers, json=payload)
243
+
244
+ response.raise_for_status()
245
+
246
+ # Handle streaming response
247
+ content = ""
248
+ async for line in response.aiter_lines():
249
+ if line.startswith("data: "):
250
+ data = line[6:] # Remove "data: " prefix
251
+ if data.strip() == "[DONE]":
252
+ break
253
+
254
+ try:
255
+ import json
256
+ chunk_data = json.loads(data)
257
+ if "choices" in chunk_data and len(chunk_data["choices"]) > 0:
258
+ delta = chunk_data["choices"][0].get("delta", {})
259
+
260
+ # Handle reasoning content (thinking)
261
+ reasoning = delta.get("reasoning_content")
262
+ if reasoning:
263
+ logger.debug(f"[QWEN] Reasoning: {reasoning}")
264
+
265
+ # Handle regular content
266
+ chunk_content = delta.get("content")
267
+ if chunk_content:
268
+ content += chunk_content
269
+ except json.JSONDecodeError:
270
+ continue
271
+
272
+ if not content or content.strip() == "":
273
+ logger.warning(f"Empty content from Qwen model")
274
+ return "I received an empty response from the model."
275
+
276
+ return content.strip()
277
+
278
+ except Exception as e:
279
+ logger.warning(f"Qwen API error: {e}")
280
+ return "I couldn't process the request with Qwen model."
281
+
282
+
283
+ async def nvidia_large_chat_completion(system_prompt: str, user_prompt: str, nvidia_rotator: APIKeyRotator, user_id: Optional[str] = None, context: str = "") -> str:
284
+ """
285
+ NVIDIA Large (GPT-OSS) chat completion for hard/long context tasks.
286
+ Uses the NVIDIA API rotator for key management.
287
+ """
288
+
289
+ key = nvidia_rotator.get_key() or ""
290
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
291
+
292
+ payload = {
293
+ "model": NVIDIA_LARGE,
294
+ "messages": [
295
+ {"role": "system", "content": system_prompt},
296
+ {"role": "user", "content": user_prompt}
297
+ ],
298
+ "temperature": 1.0,
299
+ "top_p": 1.0,
300
+ "max_tokens": 4096,
301
+ "stream": True
302
+ }
303
+
304
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
305
+
306
+ logger.info(f"[NVIDIA_LARGE] API call - Model: {NVIDIA_LARGE}, Key present: {bool(key)}")
307
+ logger.info(f"[NVIDIA_LARGE] System prompt length: {len(system_prompt)}, User prompt length: {len(user_prompt)}")
308
+
309
+ try:
310
+ # stream=True is requested, but client.post() buffers the full response here; aiter_lines() below replays the buffered SSE lines
311
+ import httpx
312
+ async with httpx.AsyncClient(timeout=60) as client:
313
+ response = await client.post(url, headers=headers, json=payload)
314
+
315
+ if response.status_code in (401, 403, 429) or (500 <= response.status_code < 600):
316
+ logger.warning(f"HTTP {response.status_code} from NVIDIA Large provider. Rotating key and retrying")
317
+ nvidia_rotator.rotate()
318
+ # Retry once with new key
319
+ key = nvidia_rotator.get_key() or ""
320
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
321
+ response = await client.post(url, headers=headers, json=payload)
322
+
323
+ response.raise_for_status()
324
+
325
+ # Handle streaming response
326
+ content = ""
327
+ async for line in response.aiter_lines():
328
+ if line.startswith("data: "):
329
+ data = line[6:] # Remove "data: " prefix
330
+ if data.strip() == "[DONE]":
331
+ break
332
+
333
+ try:
334
+ import json
335
+ chunk_data = json.loads(data)
336
+ if "choices" in chunk_data and len(chunk_data["choices"]) > 0:
337
+ delta = chunk_data["choices"][0].get("delta", {})
338
+
339
+ # Handle reasoning content (thinking)
340
+ reasoning = delta.get("reasoning_content")
341
+ if reasoning:
342
+ logger.debug(f"[NVIDIA_LARGE] Reasoning: {reasoning}")
343
+
344
+ # Handle regular content
345
+ chunk_content = delta.get("content")
346
+ if chunk_content:
347
+ content += chunk_content
348
+ except json.JSONDecodeError:
349
+ continue
350
+
351
+ if not content or content.strip() == "":
352
+ logger.warning(f"Empty content from NVIDIA Large model")
353
+ return "I received an empty response from the model."
354
+
355
+ return content.strip()
356
+
357
+ except Exception as e:
358
+ logger.warning(f"NVIDIA Large API error: {e}")
359
+ return "I couldn't process the request with NVIDIA Large model."
ingestion_python/utils/embedding.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ from typing import List
3
+
4
+ import requests
5
+
6
+ from utils.logger import get_logger
7
+
8
+
9
+ logger = get_logger("REMOTE_EMBED", __name__)
10
+
11
+
12
+ class RemoteEmbeddingClient:
13
+ """Client to call external embedding service /embed endpoint.
14
+
15
+ Reads env EMBED_BASE_URL if set (e.g. https://<space>.hf.space); otherwise falls back to the default hosted Space.
16
+ """
17
+
18
+ def __init__(self, base_url: str | None = None, timeout: int = 60):
19
+ self.base_url = (base_url or os.getenv("EMBED_BASE_URL", "https://binkhoale1812-embedding.hf.space")).rstrip("/")
20
+ if not self.base_url:
21
+ raise RuntimeError("EMBED_BASE_URL is required for RemoteEmbeddingClient")
22
+ self.timeout = timeout
23
+
24
+ def embed(self, texts: List[str]) -> List[list]:
25
+ if not texts:
26
+ return []
27
+ url = f"{self.base_url}/embed"
28
+ payload = {"texts": texts}
29
+ headers = {"Content-Type": "application/json"}
30
+ try:
31
+ resp = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
32
+ resp.raise_for_status()
33
+ data = resp.json()
34
+ vectors = data.get("vectors", [])
35
+ # Basic validation
36
+ if not isinstance(vectors, list):
37
+ raise ValueError("Invalid vectors format from remote embedder")
38
+ return vectors
39
+ except Exception as e:
40
+ logger.warning(f"Remote embedding failed: {e}")
41
+ # Fail closed with zero vectors to avoid crashes
42
+ return [[0.0] * 384 for _ in texts]
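+ # Usage sketch: RemoteEmbeddingClient().embed(["hello world"]) returns one
+ # 384-dim float list per input text; all-zero vectors indicate the remote call failed.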
43
+
44
+
ingestion_python/utils/ingestion/chunker.py ADDED
@@ -0,0 +1,130 @@
1
+ # ────────────────────────────── utils/chunker.py ──────────────────────────────
2
+ import re
3
+ from typing import List, Dict, Any
4
+ from utils.service.summarizer import cheap_summarize, clean_chunk_text
5
+ from utils.service.common import slugify
6
+ from ..logger import get_logger
7
+
8
+ # Enhanced semantic chunker with overlap and better structure:
9
+ # - Split by headings / numbered sections if present
10
+ # - Ensure each chunk ~ 300-600 words (configurable)
11
+ # - Add overlap between chunks for better context preservation
12
+ # - Generate a short summary + topic name
13
+ # - Better handling of semantic boundaries
14
+
15
+ MAX_WORDS = 500
16
+ MIN_WORDS = 150
17
+ OVERLAP_WORDS = 50 # Overlap between chunks for better context
18
+ logger = get_logger("CHUNKER", __name__)
19
+
20
+
21
+ def _by_headings(text: str):
22
+ # Enhanced split on markdown-like or outline headings with better patterns
23
+ patterns = [
24
+ r"(?m)^(#{1,6}\s.*)\s*$", # Markdown headers
25
+ r"(?m)^([0-9]+\.\s+[^\n]+)\s*$", # Numbered sections
26
+ r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$", # Underlined headers
27
+ r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$", # Chapter/Section headers
28
+ r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$", # Common academic sections
29
+ ]
30
+
31
+ parts = []
32
+ last = 0
33
+ all_matches = []
34
+
35
+ # Find all matches from all patterns
36
+ for pattern in patterns:
37
+ for m in re.finditer(pattern, text):
38
+ all_matches.append((m.start(), m.end(), m.group(1).strip()))
39
+
40
+ # Sort matches by position
41
+ all_matches.sort(key=lambda x: x[0])
42
+
43
+ # Split text based on matches
44
+ for start, end, header in all_matches:
45
+ if start > last:
46
+ parts.append(text[last:start])
47
+ parts.append(text[start:end])
48
+ last = end
49
+
50
+ if last < len(text):
51
+ parts.append(text[last:])
52
+
53
+ if not parts:
54
+ parts = [text]
55
+
56
+ return parts
57
+
58
+
59
+ def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]:
60
+ """Create overlapping chunks from text blocks for better context preservation"""
61
+ chunks = []
62
+
63
+ for i, block in enumerate(text_blocks):
64
+ words = block.split()
65
+ if not words:
66
+ continue
67
+
68
+ # If block is small enough, use as-is
69
+ if len(words) <= MAX_WORDS:
70
+ chunks.append(block)
71
+ continue
72
+
73
+ # Split large blocks with overlap
74
+ start = 0
75
+ while start < len(words):
76
+ end = min(start + MAX_WORDS, len(words))
77
+ chunk_words = words[start:end]
78
+
79
+ # Add overlap from previous chunk if available
80
+ if start > 0 and len(chunks) > 0:
81
+ prev_words = chunks[-1].split()
82
+ overlap_start = max(0, len(prev_words) - OVERLAP_WORDS)
83
+ overlap_words = prev_words[overlap_start:]
84
+ chunk_words = overlap_words + chunk_words
85
+
86
+ chunks.append(" ".join(chunk_words))
87
+ start = end  # advance past this chunk; overlap comes from the prepend above, and this guarantees the loop terminates
88
+
89
+ return chunks
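+ # e.g. a 900-word block with MAX_WORDS=500 / OVERLAP_WORDS=50 becomes two chunks:
+ # words[0:500], then (the last 50 words of chunk 1) + words[500:900]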
90
+
91
+
92
+ async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
93
+ # Concatenate pages but keep page spans for metadata
94
+ full = ""
95
+ page_markers = []
96
+ for p in pages:
97
+ start = len(full)
98
+ full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text','').strip()}\n"
99
+ page_markers.append((p['page_num'], start, len(full)))
100
+
101
+ # First split by headings
102
+ coarse = _by_headings(full)
103
+
104
+ # Create overlapping chunks for better context preservation
105
+ cards = _create_overlapping_chunks(coarse)
106
+
107
+ # Build card dicts
108
+ out = []
109
+ for i, raw_content in enumerate(cards, 1):
110
+ # Clean with LLM to remove headers/footers and IDs
111
+ cleaned = await clean_chunk_text(raw_content)
112
+ topic = await cheap_summarize(cleaned, max_sentences=1)
113
+ if not topic:
114
+ topic = cleaned[:80] + "..."
115
+ summary = await cheap_summarize(cleaned, max_sentences=3)
116
+ # Coarse page span: the document's first/last page, since per-chunk page tracking is not wired up
117
+ first_page = pages[0]['page_num'] if pages else 1
118
+ last_page = pages[-1]['page_num'] if pages else 1
119
+ out.append({
120
+ "user_id": user_id,
121
+ "project_id": project_id,
122
+ "filename": filename,
123
+ "topic_name": topic[:120],
124
+ "summary": summary,
125
+ "content": cleaned,
126
+ "page_span": [first_page, last_page],
127
+ "card_id": f"{slugify(filename)}-c{i:04d}"
128
+ })
129
+ logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
130
+ return out
ingestion_python/utils/ingestion/parser.py ADDED
@@ -0,0 +1,63 @@
1
+ import io
2
+ from typing import List, Dict, Any
3
+ import fitz # PyMuPDF
4
+ from docx import Document
5
+ from PIL import Image
7
+ from ..logger import get_logger
8
+
9
+ logger = get_logger("PARSER", __name__)
10
+
11
+
12
+ def parse_pdf_bytes(b: bytes) -> List[Dict[str, Any]]:
13
+ """
14
+ Returns list of pages, each {'page_num': i, 'text': str, 'images': [PIL.Image]}
15
+ """
16
+ pages = []
17
+ with fitz.open(stream=b, filetype="pdf") as doc:
18
+ for i, page in enumerate(doc):
19
+ text = page.get_text("text")
20
+ images = []
21
+ for img in page.get_images(full=True):
22
+ xref = img[0]
23
+ try:
24
+ pix = fitz.Pixmap(doc, xref)
25
+ # Convert CMYK/Alpha safely
26
+ if pix.n - pix.alpha >= 4:
27
+ pix = fitz.Pixmap(fitz.csRGB, pix)
28
+ # Use PNG bytes to avoid 'not enough image data'
29
+ png_bytes = pix.tobytes("png")
30
+ im = Image.open(io.BytesIO(png_bytes)).convert("RGB")
31
+ images.append(im)
32
+ except Exception as e:
33
+ logger.warning(f"Failed to extract image on page {i+1}: {e}")
34
+ finally:
35
+ pix = None  # release the Pixmap reference (assignment cannot raise, so no try/except needed)
39
+ pages.append({"page_num": i + 1, "text": text, "images": images})
40
+ logger.info(f"Parsed PDF with {len(pages)} pages")
41
+ return pages
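+ # Usage sketch (hypothetical file): parse_pdf_bytes(open("lecture.pdf", "rb").read())[0]
+ # -> {"page_num": 1, "text": "...", "images": [<PIL.Image>, ...]}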
42
+
43
+
44
+ def parse_docx_bytes(b: bytes) -> List[Dict[str, Any]]:
45
+ f = io.BytesIO(b)
46
+ doc = Document(f)
47
+ text = []
48
+ images = []
49
+ for rel in doc.part.rels.values():
50
+ if "image" in rel.reltype:
51
+ data = rel.target_part.blob
52
+ try:
53
+ im = Image.open(io.BytesIO(data)).convert("RGB")
54
+ images.append(im)
55
+ except Exception:
56
+ pass
57
+ for p in doc.paragraphs:
58
+ text.append(p.text)
59
+ pages = [{"page_num": 1, "text": "\n".join(text), "images": images}]
60
+ logger.info("Parsed DOCX into single concatenated page")
61
+ return pages
62
+
63
+
ingestion_python/utils/logger.py ADDED
@@ -0,0 +1,71 @@
1
+ import logging
2
+ import sys
3
+ from typing import Optional
4
+
5
+
6
+ _DEFAULT_FORMAT = "%(asctime)s %(levelname)s %(message)s"
7
+
8
+
9
+ def _ensure_root_handler() -> None:
10
+ root_logger = logging.getLogger()
11
+ if root_logger.handlers:
12
+ return
13
+ handler = logging.StreamHandler(stream=sys.stdout)
14
+ formatter = logging.Formatter(_DEFAULT_FORMAT)
15
+ handler.setFormatter(formatter)
16
+ root_logger.addHandler(handler)
17
+ root_logger.setLevel(logging.INFO)
18
+
19
+
20
+ class _TaggedAdapter(logging.LoggerAdapter):
21
+ def process(self, msg, kwargs):
22
+ tag = self.extra.get("tag", "")
23
+ if tag and not str(msg).startswith(tag):
24
+ msg = f"{tag} {msg}"
25
+ return msg, kwargs
26
+
27
+
28
+ def get_logger(tag: str, name: Optional[str] = None) -> logging.LoggerAdapter:
+ """
+ Return a logger that injects a [TAG] prefix into records.
+ Example: logger = get_logger("APP") → logs like: [APP] message
+ """
29
+ _ensure_root_handler()
30
+ logger_name = name or __name__
31
+ base = logging.getLogger(logger_name)
32
+ return _TaggedAdapter(base, {"tag": f"[{tag}]"})
33
+
ingestion_python/utils/rag/embeddings.py ADDED
@@ -0,0 +1,39 @@
1
+ # ────────────────────────────── utils/embeddings.py ──────────────────────────────
2
+ import os
3
+ from typing import List, Optional
4
+ import requests
5
+
6
+ from utils.logger import get_logger
7
+
8
+
9
+ logger = get_logger("EMBED", __name__)
10
+
11
+
12
+ class EmbeddingClient:
13
+ """Embedding client that calls external embedding service via HTTP.
14
+
15
+ Expects environment variable EMBEDDER_BASE_URL pointing at an API with:
16
+ POST /embed {"texts": [..]} -> {"vectors": [[..], ...], "model": "..."}
17
+ """
18
+
19
+ def __init__(self, base_url: Optional[str] = None):
20
+ self.base_url = (base_url or os.getenv("EMBEDDER_BASE_URL", "")).rstrip("/")
21
+ if not self.base_url:
22
+ logger.warning("EMBEDDER_BASE_URL not set; embedding calls will fail.")
23
+
24
+ def embed(self, texts: List[str]) -> List[list]:
25
+ if not texts:
26
+ return []
27
+ if not self.base_url:
28
+ raise RuntimeError("EMBEDDER_BASE_URL not configured")
29
+ url = f"{self.base_url}/embed"
30
+ try:
31
+ resp = requests.post(url, json={"texts": texts}, timeout=60)
32
+ if resp.status_code >= 400:
33
+ raise RuntimeError(f"Embedding API error {resp.status_code}: {resp.text[:200]}")
34
+ data = resp.json()
35
+ vectors = data.get("vectors") or []
36
+ return vectors
37
+ except Exception as e:
38
+ logger.warning(f"Embedding API failed: {e}")
39
+ raise
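+ # Usage sketch (assumes EMBEDDER_BASE_URL is set):
+ # EmbeddingClient().embed(["hello"]) -> one vector per input text (downstream code expects 384-dim)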
ingestion_python/utils/rag/rag.py ADDED
@@ -0,0 +1,278 @@
1
+ # ────────────────────────────── utils/rag.py ──────────────────────────────
2
+ import os
4
+ from typing import List, Dict, Any, Optional
5
+ from pymongo import MongoClient, ASCENDING, TEXT
6
+ from pymongo.collection import Collection
7
+ from pymongo.errors import PyMongoError
8
+ import numpy as np
10
+ from ..logger import get_logger
11
+
12
+ VECTOR_DIM = 384 # all-MiniLM-L6-v2
13
+ INDEX_NAME = os.getenv("MONGO_VECTOR_INDEX", "vector_index")
14
+ USE_ATLAS_VECTOR = os.getenv("ATLAS_VECTOR", "0") == "1"
15
+
16
+ logger = get_logger("RAG", __name__)
17
+
18
+
19
+ class RAGStore:
20
+ def __init__(self, mongo_uri: str, db_name: str = "studybuddy"):
21
+ self.client = MongoClient(mongo_uri)
22
+ self.db = self.client[db_name]
23
+ self.chunks: Collection = self.db["chunks"]
24
+ self.files: Collection = self.db["files"]
25
+
26
+ # ── Write ────────────────────────────────────────────────────────────────
27
+ def store_cards(self, cards: List[Dict[str, Any]]):
28
+ if not cards:
29
+ return
30
+ for c in cards:
31
+ # basic validation
32
+ emb = c.get("embedding")
33
+ if not emb or len(emb) != VECTOR_DIM:
34
+ raise ValueError("Invalid embedding length; expected %d" % VECTOR_DIM)
35
+ self.chunks.insert_many(cards, ordered=False)
36
+ logger.info(f"Inserted {len(cards)} cards into MongoDB")
37
+
38
+ def upsert_file_summary(self, user_id: str, project_id: str, filename: str, summary: str):
39
+ self.files.update_one(
40
+ {"user_id": user_id, "project_id": project_id, "filename": filename},
41
+ {"$set": {"summary": summary}},
42
+ upsert=True
43
+ )
44
+ logger.info(f"Upserted summary for {filename} (user {user_id}, project {project_id})")
45
+
46
+ # ── Read ────────────────────────────────────────────────────────────────
47
+ def list_cards(self, user_id: str, project_id: str, filename: Optional[str], limit: int, skip: int):
48
+ q = {"user_id": user_id, "project_id": project_id}
49
+ if filename:
50
+ q["filename"] = filename
51
+ cur = self.chunks.find(q, {"embedding": 0}).skip(skip).limit(limit).sort([("_id", ASCENDING)])
52
+ # Convert MongoDB documents to JSON-serializable format
53
+ cards = []
54
+ for card in cur:
55
+ serializable_card = {}
56
+ for key, value in card.items():
57
+ if key == '_id':
58
+ serializable_card[key] = str(value) # Convert ObjectId to string
59
+ elif hasattr(value, 'isoformat'): # Handle datetime objects
60
+ serializable_card[key] = value.isoformat()
61
+ else:
62
+ serializable_card[key] = value
63
+ cards.append(serializable_card)
64
+ return cards
65
+
66
+ def get_file_summary(self, user_id: str, project_id: str, filename: str):
67
+ doc = self.files.find_one({"user_id": user_id, "project_id": project_id, "filename": filename})
68
+ if doc:
69
+ # Convert MongoDB document to JSON-serializable format
70
+ serializable_doc = {}
71
+ for key, value in doc.items():
72
+ if key == '_id':
73
+ serializable_doc[key] = str(value) # Convert ObjectId to string
74
+ elif hasattr(value, 'isoformat'): # Handle datetime objects
75
+ serializable_doc[key] = value.isoformat()
76
+ else:
77
+ serializable_doc[key] = value
78
+ return serializable_doc
79
+ return None
80
+
81
+ def get_file_chunks(self, user_id: str, project_id: str, filename: str, limit: int = 20) -> List[Dict[str, Any]]:
82
+ """Get chunks for a specific file"""
83
+ cursor = self.chunks.find({
84
+ "user_id": user_id,
85
+ "project_id": project_id,
86
+ "filename": filename
87
+ }).limit(limit)
88
+
89
+ chunks = []
90
+ for doc in cursor:
91
+ # Convert MongoDB document to JSON-serializable format
92
+ serializable_doc = {}
93
+ for key, value in doc.items():
94
+ if key == '_id':
95
+ serializable_doc[key] = str(value)
96
+ elif hasattr(value, 'isoformat'):
97
+ serializable_doc[key] = value.isoformat()
98
+ else:
99
+ serializable_doc[key] = value
100
+ chunks.append(serializable_doc)
101
+
102
+ return chunks
103
+
104
+ def list_files(self, user_id: str, project_id: str):
105
+ """List all files for a project with their summaries"""
106
+ files_cursor = self.files.find(
107
+ {"user_id": user_id, "project_id": project_id},
108
+ {"_id": 0, "filename": 1, "summary": 1}
109
+ ).sort("filename", ASCENDING)
110
+
111
+ # Convert MongoDB documents to JSON-serializable format
112
+ files = []
113
+ for file_doc in files_cursor:
114
+ serializable_file = {}
115
+ for key, value in file_doc.items():
116
+ if hasattr(value, 'isoformat'): # Handle datetime objects
117
+ serializable_file[key] = value.isoformat()
118
+ else:
119
+ serializable_file[key] = value
120
+ files.append(serializable_file)
121
+ return files
122
+
123
+ def vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int = 6, filenames: Optional[List[str]] = None, search_type: str = "hybrid"):
124
+ """
125
+ Enhanced vector search with multiple strategies:
126
+ - hybrid: Combines Atlas and local search
127
+ - flat: Exhaustive search for maximum accuracy
128
+ - atlas: Uses Atlas Vector Search only
129
+ - local: Uses local cosine similarity only
130
+ """
131
+ if search_type == "flat" or (search_type == "hybrid" and not USE_ATLAS_VECTOR):
132
+ return self._flat_vector_search(user_id, project_id, query_vector, k, filenames)
133
+ elif search_type == "atlas" and USE_ATLAS_VECTOR:
134
+ return self._atlas_vector_search(user_id, project_id, query_vector, k, filenames)
135
+ elif search_type == "local":
136
+ return self._local_vector_search(user_id, project_id, query_vector, k, filenames)
137
+ else:
138
+ # Default hybrid approach
139
+ if USE_ATLAS_VECTOR:
140
+ atlas_results = self._atlas_vector_search(user_id, project_id, query_vector, k, filenames)
141
+ if atlas_results:
142
+ return atlas_results
143
+ return self._local_vector_search(user_id, project_id, query_vector, k, filenames)
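+ # Usage sketch: store.vector_search(user_id, project_id, qvec, k=6, search_type="flat")
+ # -> [{"doc": {...}, "score": 0.83}, ...] sorted by descending cosine similarity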
144
+
145
+ def _atlas_vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int, filenames: Optional[List[str]] = None):
146
+ """Atlas Vector Search implementation"""
147
+ match_stage = {"user_id": user_id, "project_id": project_id}
148
+ if filenames:
149
+ match_stage["filename"] = {"$in": filenames}
150
+ pipeline = [
151
+ {
152
+ "$search": {
153
+ "index": INDEX_NAME,
154
+ "knnBeta": {
155
+ "vector": query_vector,
156
+ "path": "embedding",
157
+ "k": k,
158
+ }
159
+ }
160
+ },
161
+ {"$match": match_stage},
162
+ {"$project": {"doc": "$$ROOT", "score": {"$meta": "searchScore"}}},
163
+ {"$limit": k},
164
+ ]
165
+ hits = list(self.chunks.aggregate(pipeline))
166
+ return self._serialize_hits(hits)
167
+
168
+ def _local_vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int, filenames: Optional[List[str]] = None):
169
+ """Local cosine similarity search with improved sampling"""
170
+ q = {"user_id": user_id, "project_id": project_id}
171
+ if filenames:
172
+ q["filename"] = {"$in": filenames}
173
+
174
+ # Increase sample size for better accuracy
175
+ sample_limit = max(5000, k * 50)
176
+ sample = list(self.chunks.find(q).sort([("_id", -1)]).limit(sample_limit))
177
+ if not sample:
178
+ return []
179
+
180
+ qv = np.array(query_vector, dtype="float32")
181
+ scores = []
182
+
183
+ for d in sample:
184
+ v = np.array(d.get("embedding", [0]*VECTOR_DIM), dtype="float32")
185
+ denom = (np.linalg.norm(qv) * np.linalg.norm(v)) or 1.0
186
+ sim = float(np.dot(qv, v) / denom)
187
+ scores.append((sim, d))
188
+
189
+ scores.sort(key=lambda x: x[0], reverse=True)
190
+ top = scores[:k]
191
+ logger.info(f"Local vector search: {len(sample)} docs sampled, {len(top)} results")
192
+
193
+ return self._serialize_results(top)
194
+
195
+ def _flat_vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int, filenames: Optional[List[str]] = None):
196
+ """Flat exhaustive search for maximum accuracy"""
197
+ q = {"user_id": user_id, "project_id": project_id}
198
+ if filenames:
199
+ q["filename"] = {"$in": filenames}
200
+
201
+ # Get ALL relevant documents for exhaustive search
202
+ all_docs = list(self.chunks.find(q))
203
+ if not all_docs:
204
+ return []
205
+
206
+ qv = np.array(query_vector, dtype="float32")
207
+ scores = []
208
+
209
+ for doc in all_docs:
210
+ v = np.array(doc.get("embedding", [0]*VECTOR_DIM), dtype="float32")
211
+ denom = (np.linalg.norm(qv) * np.linalg.norm(v)) or 1.0
212
+ sim = float(np.dot(qv, v) / denom)
213
+ scores.append((sim, doc))
214
+
215
+ scores.sort(key=lambda x: x[0], reverse=True)
216
+ top = scores[:k]
217
+ logger.info(f"Flat vector search: {len(all_docs)} docs searched, {len(top)} results")
218
+
219
+ return self._serialize_results(top)
220
+
221
+ def _serialize_hits(self, hits):
222
+ """Serialize Atlas search hits"""
223
+ serializable_hits = []
224
+ for hit in hits:
225
+ doc = hit["doc"]
226
+ serializable_doc = self._serialize_doc(doc)
227
+ serializable_hits.append({
228
+ "doc": serializable_doc,
229
+ "score": float(hit.get("score", 0.0))
230
+ })
231
+ return serializable_hits
232
+
233
+ def _serialize_results(self, results):
234
+ """Serialize local search results"""
235
+ serializable_results = []
236
+ for score, doc in results:
237
+ serializable_doc = self._serialize_doc(doc)
238
+ serializable_results.append({
239
+ "doc": serializable_doc,
240
+ "score": float(score)
241
+ })
242
+ return serializable_results
243
+
244
+ def _serialize_doc(self, doc):
245
+ """Convert MongoDB document to JSON-serializable format"""
246
+ serializable_doc = {}
247
+ for key, value in doc.items():
248
+ if key == '_id':
249
+ serializable_doc[key] = str(value)
250
+ elif hasattr(value, 'isoformat'):
251
+ serializable_doc[key] = value.isoformat()
252
+ else:
253
+ serializable_doc[key] = value
254
+ return serializable_doc
255
+
256
+
257
+ def ensure_indexes(store: RAGStore):
258
+ # Basic text index for fallback keyword search (optional)
259
+ try:
260
+ store.chunks.create_index([("user_id", ASCENDING), ("project_id", ASCENDING), ("filename", ASCENDING)])
261
+ store.chunks.create_index([("content", TEXT), ("topic_name", TEXT), ("summary", TEXT)], name="text_idx")
262
+ store.files.create_index([("user_id", ASCENDING), ("project_id", ASCENDING), ("filename", ASCENDING)], unique=True)
263
+ except PyMongoError as e:
264
+ logger.warning(f"Index creation warning: {e}")
265
+ # Note: For Atlas Vector, create an Atlas Search index named INDEX_NAME on field "embedding" with vector options.
266
+ # Example (in Atlas UI):
267
+ # {
268
+ # "mappings": {
269
+ # "dynamic": false,
270
+ # "fields": {
271
+ # "embedding": {
272
+ # "type": "knnVector",
273
+ # "dimensions": 384,
274
+ # "similarity": "cosine"
275
+ # }
276
+ # }
277
+ # }
278
+ # }
ingestion_python/utils/service/common.py ADDED
@@ -0,0 +1,20 @@
1
+ import re
2
+ import unicodedata
3
+ from utils.logger import get_logger
4
+
5
+ logger = get_logger("COMMON", __name__)
6
+
7
+ def split_sentences(text: str):
8
+ return re.split(r"(?<=[\.\!\?])\s+", text.strip())
9
+
10
+ def slugify(value: str):
11
+ value = str(value)
12
+ value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
13
+ value = re.sub(r"[^\w\s-]", "", value).strip().lower()
14
+ return re.sub(r"[-\s]+", "-", value)
15
+
16
+ def trim_text(s: str, n: int):
17
+ s = s or ""
18
+ if len(s) <= n:
19
+ return s
20
+ return s[:n] + "…"
ingestion_python/utils/service/summarizer.py ADDED
@@ -0,0 +1,48 @@
1
+ import re
3
+ from utils.logger import get_logger
4
+
5
+ logger = get_logger("SUM", __name__)
6
+
7
+
8
+ async def clean_chunk_text(text: str) -> str:
9
+ """Clean and normalize text for processing."""
10
+ if not text:
11
+ return ""
12
+
13
+ # Remove extra whitespace and normalize
14
+ text = " ".join(text.split())
15
+
16
+ # Remove common artifacts
17
+ text = text.replace("\\n", " ").replace("\\t", " ")
18
+
19
+ return text.strip()
20
+
21
+
22
+ async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
23
+ """Simple text-based summarization without external APIs."""
24
+ if not text or len(text.strip()) < 50:
25
+ return text.strip()
26
+
27
+ try:
28
+ # Simple extractive summarization: take first few sentences
29
+ sentences = re.split(r'[.!?]+', text)
30
+ sentences = [s.strip() for s in sentences if s.strip()]
31
+
32
+ if len(sentences) <= max_sentences:
33
+ return text.strip()
34
+
35
+ # Take first max_sentences sentences
36
+ summary_sentences = sentences[:max_sentences]
37
+ summary = '. '.join(summary_sentences)
38
+
39
+ # Add period if it doesn't end with punctuation
40
+ if not summary.endswith(('.', '!', '?')):
41
+ summary += '.'
42
+
43
+ return summary
44
+
45
+ except Exception as e:
46
+ logger.warning(f"[SUM] Summarization failed: {e}")
47
+ # Fallback: return first part of text
48
+ return text[:200] + "..." if len(text) > 200 else text
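+ # Usage sketch: for any paragraph over ~50 characters, cheap_summarize(text, max_sentences=2)
+ # returns (roughly) its first two sentences joined with ". "; shorter inputs are returned unchanged.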