Merge commit 'e34edc7cd55f292dd0b192dc00b782c22208fde6' as 'ingestion_python'
- ingestion_python/.dockerignore +46 -0
- ingestion_python/CURL.md +138 -0
- ingestion_python/Dockerfile +33 -0
- ingestion_python/README.md +246 -0
- ingestion_python/api/models.py +37 -0
- ingestion_python/api/routes.py +238 -0
- ingestion_python/app.py +58 -0
- ingestion_python/helpers/pages.py +24 -0
- ingestion_python/requirements.txt +17 -0
- ingestion_python/services/ingestion_service.py +119 -0
- ingestion_python/services/maverick_captioner.py +141 -0
- ingestion_python/test_upload1.sh +241 -0
- ingestion_python/test_upload2.sh +238 -0
- ingestion_python/test_upload3.sh +227 -0
- ingestion_python/utils/__init__.py +2 -0
- ingestion_python/utils/api/rotator.py +67 -0
- ingestion_python/utils/api/router.py +359 -0
- ingestion_python/utils/embedding.py +44 -0
- ingestion_python/utils/ingestion/chunker.py +130 -0
- ingestion_python/utils/ingestion/parser.py +63 -0
- ingestion_python/utils/logger.py +71 -0
- ingestion_python/utils/rag/embeddings.py +39 -0
- ingestion_python/utils/rag/rag.py +278 -0
- ingestion_python/utils/service/common.py +20 -0
- ingestion_python/utils/service/summarizer.py +48 -0
ingestion_python/.dockerignore
ADDED
@@ -0,0 +1,46 @@
# Ignore unnecessary files for Docker build
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
env/
venv/
.venv/
pip-log.txt
pip-delete-this-directory.txt
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.log
.git/
.gitignore
.dockerignore

# Ignore documentation files (keep only main README)
*.md
!README.md

# Ignore test files
test_*.py
*_test.py
*.sh
tests/

# Ignore IDE files
.vscode/
.idea/
*.swp
*.swo
*~

# Ignore OS files
.DS_Store
Thumbs.db

# Ignore unnecessary directories
# config/ and services/ are needed for the application

ingestion_python/CURL.md
ADDED
@@ -0,0 +1,138 @@
# CURL Test Commands for Ingestion Pipeline

## Backend Configuration
- **URL**: `https://binkhoale1812-studdybuddy-ingestion1.hf.space/`
- **User ID**: `44e65346-8eaa-4f95-b17a-f6219953e7a8`
- **Project ID**: `496e2fad-ec7e-4562-b06a-ea2491f2460`
- **Test Files**: `Lecture5_ML.pdf`, `Lecture6_ANN_DL.pdf`

## 1. Health Check

```bash
curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/health" \
  -H "Content-Type: application/json"
```

## 2. Upload Files

```bash
curl -X POST "https://binkhoale1812-studdybuddy-ingestion1.hf.space/upload" \
  -F "user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8" \
  -F "project_id=496e2fad-ec7e-4562-b06a-ea2491f2460" \
  -F "files=@../exefiles/Lecture5_ML.pdf" \
  -F "files=@../exefiles/Lecture6_ANN_DL.pdf"
```

## 3. Check Upload Status

Replace `{JOB_ID}` with the job_id from the upload response:

```bash
curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/upload/status?job_id={JOB_ID}" \
  -H "Content-Type: application/json"
```

## 4. List Uploaded Files

```bash
curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/files?user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8&project_id=496e2fad-ec7e-4562-b06a-ea2491f2460" \
  -H "Content-Type: application/json"
```

## 5. Get File Chunks (Lecture5_ML.pdf)

```bash
curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/files/chunks?user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8&project_id=496e2fad-ec7e-4562-b06a-ea2491f2460&filename=Lecture5_ML.pdf&limit=5" \
  -H "Content-Type: application/json"
```

## 6. Get File Chunks (Lecture6_ANN_DL.pdf)

```bash
curl -X GET "https://binkhoale1812-studdybuddy-ingestion1.hf.space/files/chunks?user_id=44e65346-8eaa-4f95-b17a-f6219953e7a8&project_id=496e2fad-ec7e-4562-b06a-ea2491f2460&filename=Lecture6_ANN_DL.pdf&limit=5" \
  -H "Content-Type: application/json"
```

## Expected Responses

### Health Check Response
```json
{
  "ok": true,
  "mongodb_connected": true,
  "service": "ingestion_pipeline"
}
```

### Upload Response
```json
{
  "job_id": "uuid-string",
  "status": "processing",
  "total_files": 2
}
```

### Status Response
```json
{
  "job_id": "uuid-string",
  "status": "completed",
  "total": 2,
  "completed": 2,
  "progress": 100.0,
  "last_error": null,
  "created_at": 1234567890.123
}
```

### Files List Response
```json
{
  "files": [
    {
      "filename": "Lecture5_ML.pdf",
      "summary": "Document summary..."
    },
    {
      "filename": "Lecture6_ANN_DL.pdf",
      "summary": "Document summary..."
    }
  ],
  "filenames": ["Lecture5_ML.pdf", "Lecture6_ANN_DL.pdf"]
}
```

### Chunks Response
```json
{
  "chunks": [
    {
      "user_id": "44e65346-8eaa-4f95-b17a-f6219953e7a8",
      "project_id": "496e2fad-ec7e-4562-b06a-ea2491f2460",
      "filename": "Lecture5_ML.pdf",
      "topic_name": "Machine Learning Introduction",
      "summary": "Chunk summary...",
      "content": "Chunk content...",
      "embedding": [0.1, 0.2, ...],
      "page_span": [1, 3],
      "card_id": "lecture5_ml-c0001"
    }
  ]
}
```

## Testing Steps

1. **Run Health Check**: Verify the service is running
2. **Upload Files**: Upload both PDF files
3. **Monitor Progress**: Check job status until completion
4. **Verify Files**: List uploaded files
5. **Inspect Chunks**: Get document chunks to verify processing

## Troubleshooting

- **Connection Issues**: Check if the backend URL is accessible
- **File Not Found**: Ensure PDF files exist in `../exefiles/` directory
- **Upload Fails**: Check file size limits and format support
- **Processing Stuck**: Monitor job status and check logs

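The same flow, scripted: a minimal Python sketch of the health-check, upload, and polling steps above, using the `requests` package pinned in this repo's requirements.txt. The endpoints and form fields are the documented ones; the 20-second poll interval mirrors the shell test scripts.

```python
# Minimal client sketch of the documented flow: health check, upload, poll.
import time
import requests

BACKEND = "https://binkhoale1812-studdybuddy-ingestion1.hf.space"

def upload_and_wait(user_id: str, project_id: str, paths: list, poll_s: int = 20) -> dict:
    # Step 1: verify the service is up before uploading.
    assert requests.get(f"{BACKEND}/health", timeout=30).json()["ok"]
    # Step 2: multipart upload; repeated "files" fields, as in the curl example.
    files = [("files", open(p, "rb")) for p in paths]
    try:
        resp = requests.post(
            f"{BACKEND}/upload",
            data={"user_id": user_id, "project_id": project_id},
            files=files,
            timeout=300,
        )
    finally:
        for _, fh in files:
            fh.close()
    resp.raise_for_status()
    job_id = resp.json()["job_id"]
    # Step 3: poll /upload/status until the job reports completion.
    while True:
        status = requests.get(
            f"{BACKEND}/upload/status", params={"job_id": job_id}, timeout=30
        ).json()
        if status["status"] == "completed":
            return status
        time.sleep(poll_s)
```
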
ingestion_python/Dockerfile
ADDED
@@ -0,0 +1,33 @@
# Hugging Face Spaces - Docker for Ingestion Pipeline
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# System deps (same as main system)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential curl git libglib2.0-0 libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Create and use a non-root user (same as main system)
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy ingestion pipeline files (includes utils and helpers)
COPY . .

# Install Python dependencies (same as main system)
RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt

# No local model caches or warmup needed (remote embedding service)

# Expose port for HF Spaces
ENV PORT=7860
EXPOSE 7860

# Start FastAPI (single worker so app.state.jobs remains consistent)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

ingestion_python/README.md
ADDED
@@ -0,0 +1,246 @@
---
title: StuddyBuddy Ingestion
emoji: ⚙️
colorFrom: blue
colorTo: pink
sdk: docker
pinned: false
license: mit
short_description: 'backend for data ingestion'
---

# Ingestion Pipeline

A dedicated service for processing file uploads and storing them in MongoDB Atlas. This service mirrors the main system's file processing functionality while running as a separate service to share the processing load.

[API docs](API.md) | [System docs](COMPATIBILITY.md)

## 🏗️ Architecture

```
┌─────────────────────────────────────────────────────────────────────────────────┐
│                                 USER INTERFACE                                  │
│  ┌─────────────────┐    ┌──────────────────┐    ┌──────────────────┐            │
│  │   Frontend UI   │    │  Load Balancer   │    │   Main System    │            │
│  │                 │◄──►│                  │◄──►│   (Port 7860)    │            │
│  │ - File Upload   │    │ - Route Requests │    │ - Chat & Reports │            │
│  │ - Chat Interface│    │ - Health Checks  │    │ - User Management│            │
│  │ - Project Mgmt  │    │ - Load Balancing │    │ - Analytics      │            │
│  └─────────────────┘    └──────────────────┘    └──────────────────┘            │
└─────────────────────────────────────────────────────────────────────────────────┘
                                        │
                                        ▼
┌─────────────────────────────────────────────────────────────────────────────────┐
│                               INGESTION PIPELINE                                │
│  ┌─────────────────┐    ┌──────────────────┐    ┌──────────────────┐            │
│  │ File Processing │    │   Data Storage   │    │    Monitoring    │            │
│  │ - PDF/DOCX Parse│    │ - MongoDB Atlas  │    │ - Job Status     │            │
│  │ - Image Caption │    │ - Vector Search  │    │ - Health Checks  │            │
│  │ - Text Chunking │    │ - Embeddings     │    │ - Error Handling │            │
│  │ - Embedding Gen │    │ - User/Project   │    │ - Logging        │            │
│  └─────────────────┘    └──────────────────┘    └──────────────────┘            │
└─────────────────────────────────────────────────────────────────────────────────┘
                                        │
                                        ▼
┌─────────────────────────────────────────────────────────────────────────────────┐
│                                 SHARED DATABASE                                 │
│  ┌─────────────────┐    ┌──────────────────┐    ┌──────────────────┐            │
│  │  MongoDB Atlas  │    │   Collections    │    │     Indexes      │            │
│  │                 │    │ - chunks         │    │ - Vector Search  │            │
│  │ - Same Cluster  │    │ - files          │    │ - Text Search    │            │
│  │ - Same Database │    │ - chat_sessions  │    │ - User/Project   │            │
│  │ - Same Schema   │    │ - chat_messages  │    │ - Performance    │            │
│  └─────────────────┘    └──────────────────┘    └──────────────────┘            │
└─────────────────────────────────────────────────────────────────────────────────┘
```

## 📁 Project Structure

```
ingestion_pipeline/
├── __init__.py
├── app.py               # Main FastAPI application
├── requirements.txt     # Python dependencies
├── Dockerfile           # HuggingFace deployment
├── deploy.sh            # Deployment script
├── test_pipeline.py     # Test script
├── README.md            # This file
├── config/              # Configuration
│   ├── __init__.py
│   └── settings.py
├── api/                 # API layer
│   ├── __init__.py
│   ├── models.py        # Pydantic models
│   └── routes.py        # API routes
└── services/            # Business logic
    ├── __init__.py
    └── ingestion_service.py
```

## 🚀 Quick Start

### Prerequisites
- Docker
- MongoDB Atlas cluster
- Python 3.11+

## 🔧 API Endpoints

### Health Check
```http
GET /health
```

### Upload Files
```http
POST /upload
Content-Type: multipart/form-data

user_id: string
project_id: string
files: File[]
replace_filenames: string (optional)
rename_map: string (optional)
```

### Job Status
```http
GET /upload/status?job_id={job_id}
```

### List Files
```http
GET /files?user_id={user_id}&project_id={project_id}
```

### Get File Chunks
```http
GET /files/chunks?user_id={user_id}&project_id={project_id}&filename={filename}&limit={limit}
```

## 🔄 Data Flow

### File Processing Pipeline
1. **File Upload**: User uploads files via frontend
2. **Load Balancing**: Request routed to ingestion pipeline
3. **File Processing**:
   - PDF/DOCX parsing with image extraction
   - BLIP image captioning
   - Semantic chunking with overlap
   - Embedding generation (all-MiniLM-L6-v2)
4. **Data Storage**:
   - Chunks stored in `chunks` collection
   - File summaries in `files` collection
   - Both scoped by `user_id` and `project_id`
5. **Response**: Job ID returned for progress tracking

### Data Consistency
- **Same Database**: Uses identical MongoDB Atlas cluster
- **Same Collections**: Stores in `chunks` and `files` collections
- **Same Schema**: Identical data structure and metadata
- **Same Scoping**: All data scoped by `user_id` and `project_id`
- **Same Indexes**: Uses identical database indexes

## 🐳 Docker Deployment

### HuggingFace Spaces
The service is designed for HuggingFace Spaces deployment with:
- Port 7860 (HuggingFace default)
- Non-root user for security
- HuggingFace cache directories
- Model preloading and warmup

### Logging
- Comprehensive logging for all operations
- Error tracking and debugging
- Performance monitoring

### Job Tracking
- Upload progress monitoring
- Error handling and reporting
- Status updates

## 🔧 Configuration

### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `MONGO_URI` | Required | MongoDB connection string |
| `MONGO_DB` | `studybuddy` | Database name |
| `EMBED_MODEL` | `sentence-transformers/all-MiniLM-L6-v2` | Embedding model |
| `ATLAS_VECTOR` | `0` | Enable Atlas Vector Search |
| `MAX_FILES_PER_UPLOAD` | `15` | Maximum files per upload |
| `MAX_FILE_MB` | `50` | Maximum file size in MB |
| `INGESTION_PORT` | `7860` | Service port |

### Processing Configuration
- **Vector Dimension**: 384 (all-MiniLM-L6-v2)
- **Chunk Max Words**: 500
- **Chunk Min Words**: 150
- **Chunk Overlap**: 50 words

## 🔒 Security

### Security Features
- Non-root user in Docker container
- Input validation and sanitization
- Error handling and logging
- Rate limiting (configurable)

### Best Practices
- Use environment variables for secrets
- Regular security updates
- Monitor logs for anomalies
- Implement proper access controls

## 🚀 Performance

### Optimization Features
- Lazy loading of ML models
- Efficient file processing
- Background task processing
- Memory management

### Scaling
- Horizontal scaling support
- Load balancing ready
- Resource optimization
- Performance monitoring

## 📚 Integration

### Main System Integration
The ingestion pipeline is designed to work seamlessly with the main system:
- Same API endpoints
- Same data structures
- Same processing pipeline
- Same storage format

### Load Balancer Integration
- Automatic request routing
- Health check integration
- Failover support
- Performance monitoring

## 🐛 Troubleshooting

### Common Issues
1. **MongoDB Connection**: Verify `MONGO_URI` is correct
2. **Port Conflicts**: Ensure port 7860 is available
3. **Model Loading**: Check HuggingFace cache permissions
4. **File Processing**: Verify file format support

## 📈 Future Enhancements

### Planned Features
- Multiple file format support
- Advanced chunking strategies
- Performance optimizations
- Enhanced monitoring

### Scalability
- Kubernetes deployment
- Auto-scaling support
- Load balancing improvements
- Resource optimization

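The chunking numbers in the README's "Processing Configuration" are easiest to see in code. Below is a minimal, illustrative sketch of a 500-word window with 50-word overlap; the service's real chunker is `build_cards_from_pages` in `utils/ingestion/chunker.py` (part of this commit but driven semantically), so treat this only as a picture of the parameters, not the actual algorithm.

```python
# Illustrative only: fixed-width word chunking with the documented
# parameters (max 500 words per chunk, 50-word overlap). The service's
# actual chunker (utils/ingestion/chunker.py) is semantic and may differ.
def chunk_words(text: str, max_words: int = 500, overlap: int = 50) -> list:
    words = text.split()
    chunks, start = [], 0
    while start < len(words):
        end = min(start + max_words, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            break
        start = end - overlap  # step back so adjacent chunks share context
    return chunks
```
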
ingestion_python/api/models.py
ADDED
@@ -0,0 +1,37 @@
"""
Pydantic models for the ingestion pipeline API
"""

from typing import List, Dict, Any, Optional
from pydantic import BaseModel

# Response models (same as main system)
class UploadResponse(BaseModel):
    job_id: str
    status: str
    total_files: Optional[int] = None

class JobStatusResponse(BaseModel):
    job_id: str
    status: str
    total: int
    completed: int
    progress: float
    last_error: Optional[str] = None
    created_at: float

class HealthResponse(BaseModel):
    ok: bool
    mongodb_connected: bool
    service: str = "ingestion_pipeline"

class FileResponse(BaseModel):
    filename: str
    summary: str

class FilesListResponse(BaseModel):
    files: List[FileResponse]
    filenames: List[str]

class ChunksResponse(BaseModel):
    chunks: List[Dict[str, Any]]

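A quick round-trip sketch against the models above, with field values taken from the sample responses in CURL.md. Note `model_dump()` assumes pydantic v2, which this repo's FastAPI pin pulls in by default; on pydantic v1 the method is `.dict()`.

```python
# Round-trip sketch: validate sample payloads against the models above.
# model_dump() assumes pydantic v2 (the default for fastapi==0.114.2).
from api.models import UploadResponse, JobStatusResponse

upload = UploadResponse(job_id="uuid-string", status="processing", total_files=2)
status = JobStatusResponse(
    job_id=upload.job_id, status="completed", total=2, completed=2,
    progress=100.0, last_error=None, created_at=1234567890.123,
)
print(upload.model_dump())
print(status.model_dump())
```
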
ingestion_python/api/routes.py
ADDED
@@ -0,0 +1,238 @@
"""
API routes for the ingestion pipeline
"""

import os
import asyncio
import uuid
import time
import json
from typing import List, Dict, Any, Optional
from fastapi import APIRouter, Form, File, UploadFile, HTTPException, BackgroundTasks, Request

from api.models import UploadResponse, JobStatusResponse, HealthResponse, FilesListResponse, ChunksResponse
from services.ingestion_service import IngestionService
from services.maverick_captioner import _normalize_caption
from utils.logger import get_logger

logger = get_logger("INGESTION_ROUTES", __name__)

# Create router
router = APIRouter()

# Global services (will be injected)
rag = None
embedder = None
captioner = None
ingestion_service = None

def initialize_services(rag_store, embedder_client, captioner_client):
    """Initialize services"""
    global rag, embedder, captioner, ingestion_service
    rag = rag_store
    embedder = embedder_client
    captioner = captioner_client
    ingestion_service = IngestionService(rag_store, embedder_client, captioner_client)

@router.get("/health", response_model=HealthResponse)
async def health():
    """Health check endpoint"""
    mongodb_connected = rag is not None
    return HealthResponse(
        ok=mongodb_connected,
        mongodb_connected=mongodb_connected
    )

@router.post("/upload", response_model=UploadResponse)
async def upload_files(
    request: Request,
    background_tasks: BackgroundTasks,
    user_id: str = Form(...),
    project_id: str = Form(...),
    files: List[UploadFile] = File(...),
    replace_filenames: Optional[str] = Form(None),
    rename_map: Optional[str] = Form(None),
):
    """
    Upload and process files

    This endpoint mirrors the main system's upload functionality exactly.
    """
    if not rag:
        raise HTTPException(500, detail="MongoDB connection not available")

    job_id = str(uuid.uuid4())

    # File limits (same as main system)
    max_files = int(os.getenv("MAX_FILES_PER_UPLOAD", "15"))
    max_mb = int(os.getenv("MAX_FILE_MB", "50"))

    if len(files) > max_files:
        raise HTTPException(400, detail=f"Too many files. Max {max_files} allowed per upload.")

    # Parse replace/rename directives (same as main system)
    replace_set = set()
    try:
        if replace_filenames:
            replace_set = set(json.loads(replace_filenames))
    except Exception:
        pass

    rename_dict: Dict[str, str] = {}
    try:
        if rename_map:
            rename_dict = json.loads(rename_map)
    except Exception:
        pass

    # Preload files (same as main system)
    preloaded_files = []
    for uf in files:
        raw = await uf.read()
        if len(raw) > max_mb * 1024 * 1024:
            raise HTTPException(400, detail=f"{uf.filename} exceeds {max_mb} MB limit")
        eff_name = rename_dict.get(uf.filename, uf.filename)
        preloaded_files.append((eff_name, raw))

    # Initialize job status (same as main system)
    from app import app
    app.state.jobs[job_id] = {
        "created_at": time.time(),
        "total": len(preloaded_files),
        "completed": 0,
        "status": "processing",
        "last_error": None,
    }

    # Background processing (mirrors main system exactly)
    async def _process_all():
        for idx, (fname, raw) in enumerate(preloaded_files, start=1):
            try:
                # Handle file replacement (same as main system)
                if fname in replace_set:
                    try:
                        rag.db["chunks"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
                        rag.db["files"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
                        logger.info(f"[{job_id}] Replaced prior data for {fname}")
                    except Exception as de:
                        logger.warning(f"[{job_id}] Replace delete failed for {fname}: {de}")

                logger.info(f"[{job_id}] ({idx}/{len(preloaded_files)}) Parsing {fname} ({len(raw)} bytes)")

                # Extract pages (same as main system)
                from helpers.pages import _extract_pages
                pages = _extract_pages(fname, raw)

                # Process images with captions (same as main system)
                num_imgs = sum(len(p.get("images", [])) for p in pages)
                captions = []
                if num_imgs > 0:
                    for p in pages:
                        caps = []
                        for im in p.get("images", []):
                            try:
                                cap = captioner.caption_image(im)
                                caps.append(cap)
                            except Exception as e:
                                logger.warning(f"[{job_id}] Caption error in {fname}: {e}")
                        captions.append(caps)
                else:
                    captions = [[] for _ in pages]

                # Merge captions into text (same as main system)
                for p, caps in zip(pages, captions):
                    if caps:
                        normalized = [_normalize_caption(c) for c in caps if c]
                        if normalized:
                            p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in normalized])).strip()

                # Build cards (same as main system)
                from utils.ingestion.chunker import build_cards_from_pages
                cards = await build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
                logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")

                # Generate embeddings (same as main system)
                embeddings = embedder.embed([c["content"] for c in cards])
                for c, vec in zip(cards, embeddings):
                    c["embedding"] = vec

                # Store in MongoDB (same as main system)
                rag.store_cards(cards)

                # Create file summary (same as main system)
                from utils.service.summarizer import cheap_summarize
                full_text = "\n\n".join(p.get("text", "") for p in pages)
                file_summary = await cheap_summarize(full_text, max_sentences=6)
                rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)

                logger.info(f"[{job_id}] Completed {fname}")

                # Update job progress (same as main system)
                job = app.state.jobs.get(job_id)
                if job:
                    job["completed"] = idx
                    job["status"] = "processing" if idx < job.get("total", 0) else "completed"

            except Exception as e:
                logger.error(f"[{job_id}] Failed processing {fname}: {e}")
                job = app.state.jobs.get(job_id)
                if job:
                    job["last_error"] = str(e)
                    job["completed"] = idx
            finally:
                await asyncio.sleep(0)

        # Finalize job (same as main system)
        logger.info(f"[{job_id}] Ingestion complete for {len(preloaded_files)} files")
        job = app.state.jobs.get(job_id)
        if job:
            job["status"] = "completed"

    background_tasks.add_task(_process_all)

    return UploadResponse(
        job_id=job_id,
        status="processing",
        total_files=len(preloaded_files)
    )

@router.get("/upload/status", response_model=JobStatusResponse)
async def upload_status(job_id: str):
    """Get upload job status"""
    from app import app
    job = app.state.jobs.get(job_id)
    if not job:
        raise HTTPException(404, detail="Job not found")

    progress = (job["completed"] / job["total"]) * 100 if job["total"] > 0 else 0

    return JobStatusResponse(
        job_id=job_id,
        status=job["status"],
        total=job["total"],
        completed=job["completed"],
        progress=progress,
        last_error=job.get("last_error"),
        created_at=job["created_at"]
    )

@router.get("/files", response_model=FilesListResponse)
async def list_files(user_id: str, project_id: str):
    """List files for a project (compatible with main system)"""
    if not rag:
        raise HTTPException(500, detail="MongoDB connection not available")

    files = rag.list_files(user_id, project_id)
    return FilesListResponse(
        files=[{"filename": f["filename"], "summary": f["summary"]} for f in files],
        filenames=[f["filename"] for f in files]
    )

@router.get("/files/chunks", response_model=ChunksResponse)
async def get_file_chunks(user_id: str, project_id: str, filename: str, limit: int = 20):
    """Get chunks for a specific file (compatible with main system)"""
    if not rag:
        raise HTTPException(500, detail="MongoDB connection not available")

    chunks = rag.get_file_chunks(user_id, project_id, filename, limit)
    return ChunksResponse(chunks=chunks)

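The `replace_filenames` and `rename_map` form fields above arrive as JSON-encoded strings (the route silently falls back to no-ops if they fail to parse), and CURL.md never exercises them. A hedged client sketch; the filenames here are illustrative:

```python
# Sketch of the JSON-encoded directive fields parsed by /upload above.
# Filenames are illustrative; the host is the test Space from CURL.md.
import json
import requests

form = {
    "user_id": "44e65346-8eaa-4f95-b17a-f6219953e7a8",
    "project_id": "496e2fad-ec7e-4562-b06a-ea2491f2460",
    # Delete existing chunks/summaries for this file before re-ingesting.
    "replace_filenames": json.dumps(["Lecture5_ML.pdf"]),
    # Store the upload under a different effective filename.
    "rename_map": json.dumps({"draft.pdf": "Lecture5_ML.pdf"}),
}
with open("Lecture5_ML.pdf", "rb") as fh:
    resp = requests.post(
        "https://binkhoale1812-studdybuddy-ingestion1.hf.space/upload",
        data=form,
        files=[("files", ("draft.pdf", fh, "application/pdf"))],
        timeout=300,
    )
print(resp.json())
```
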
ingestion_python/app.py
ADDED
@@ -0,0 +1,58 @@
"""
Ingestion Pipeline Service

A dedicated service for processing file uploads and storing them in MongoDB Atlas.
This service mirrors the main system's file processing functionality while
running as a separate service to share the processing load.
"""

import os
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# Import shared utilities (now local)
from utils.logger import get_logger
from utils.rag.rag import RAGStore, ensure_indexes
from utils.embedding import RemoteEmbeddingClient
from services.maverick_captioner import NvidiaMaverickCaptioner
from api.routes import router, initialize_services

logger = get_logger("INGESTION_PIPELINE", __name__)

# FastAPI app
app = FastAPI(title="Ingestion Pipeline", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# In-memory job tracker (same as main system)
app.state.jobs = {}

# Global clients (same as main system)
try:
    rag = RAGStore(mongo_uri=os.getenv("MONGO_URI"), db_name=os.getenv("MONGO_DB", "studybuddy"))
    rag.client.admin.command('ping')
    logger.info("[INGESTION_PIPELINE] MongoDB connection successful")
    ensure_indexes(rag)
    logger.info("[INGESTION_PIPELINE] MongoDB indexes ensured")
except Exception as e:
    logger.error(f"[INGESTION_PIPELINE] Failed to initialize MongoDB: {e}")
    rag = None

embedder = RemoteEmbeddingClient()
captioner = NvidiaMaverickCaptioner()

# Initialize services
initialize_services(rag, embedder, captioner)

# Include API routes
app.include_router(router)

if __name__ == "__main__":
    import uvicorn
    port = int(os.getenv("INGESTION_PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)

ingestion_python/helpers/pages.py
ADDED
@@ -0,0 +1,24 @@
import os
from typing import List, Dict, Any
from fastapi import HTTPException
from utils.ingestion.parser import parse_pdf_bytes, parse_docx_bytes

# ────────────────────────────── Helpers ──────────────────────────────
def _infer_mime(filename: str) -> str:
    lower = filename.lower()
    if lower.endswith(".pdf"):
        return "application/pdf"
    if lower.endswith(".docx"):
        return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    return "application/octet-stream"

def _extract_pages(filename: str, file_bytes: bytes) -> List[Dict[str, Any]]:
    mime = _infer_mime(filename)
    if mime == "application/pdf":
        return parse_pdf_bytes(file_bytes)
    elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return parse_docx_bytes(file_bytes)
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {filename}")

ingestion_python/requirements.txt
ADDED
@@ -0,0 +1,17 @@
# Ingestion Pipeline Requirements
# Same as main system but focused on processing

fastapi==0.114.2
uvicorn[standard]==0.30.6
python-multipart==0.0.9
pymongo==4.8.0
httpx==0.27.2
requests==2.32.3
python-docx==1.1.2
PyMuPDF==1.24.10
pillow==10.4.0
sumy==0.11.0
numpy==1.26.4
reportlab==4.0.9
markdown==3.6
python-dotenv==1.0.0

ingestion_python/services/ingestion_service.py
ADDED
@@ -0,0 +1,119 @@
"""
Ingestion service for processing files and storing them in MongoDB
"""

import asyncio
import uuid
import time
import json
from typing import List, Dict, Any, Optional
from utils.logger import get_logger
from utils.rag.rag import RAGStore
from utils.embedding import RemoteEmbeddingClient
from services.maverick_captioner import NvidiaMaverickCaptioner, _normalize_caption
from utils.ingestion.chunker import build_cards_from_pages
from utils.service.summarizer import cheap_summarize
from helpers.pages import _extract_pages

logger = get_logger("INGESTION_SERVICE", __name__)

class IngestionService:
    """Service for processing file uploads and storing them in MongoDB"""

    def __init__(self, rag_store: RAGStore, embedder: RemoteEmbeddingClient, captioner: NvidiaMaverickCaptioner):
        self.rag = rag_store
        self.embedder = embedder
        self.captioner = captioner

    async def process_files(
        self,
        user_id: str,
        project_id: str,
        files: List[tuple],  # (filename, raw_bytes)
        replace_filenames: Optional[List[str]] = None,
        rename_map: Optional[Dict[str, str]] = None,
        job_id: Optional[str] = None
    ) -> str:
        """
        Process files and store them in MongoDB

        Args:
            user_id: User identifier
            project_id: Project identifier
            files: List of (filename, raw_bytes) tuples
            replace_filenames: Optional list of filenames to replace
            rename_map: Optional mapping of old names to new names
            job_id: Optional job ID for tracking

        Returns:
            Job ID for tracking progress
        """
        if not job_id:
            job_id = str(uuid.uuid4())

        replace_set = set(replace_filenames or [])

        for idx, (fname, raw) in enumerate(files, start=1):
            try:
                # Handle file replacement
                if fname in replace_set:
                    try:
                        self.rag.db["chunks"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
                        self.rag.db["files"].delete_many({"user_id": user_id, "project_id": project_id, "filename": fname})
                        logger.info(f"[{job_id}] Replaced prior data for {fname}")
                    except Exception as de:
                        logger.warning(f"[{job_id}] Replace delete failed for {fname}: {de}")

                logger.info(f"[{job_id}] ({idx}/{len(files)}) Parsing {fname} ({len(raw)} bytes)")

                # Extract pages
                pages = _extract_pages(fname, raw)

                # Process images with captions
                num_imgs = sum(len(p.get("images", [])) for p in pages)
                captions = []
                if num_imgs > 0:
                    for p in pages:
                        caps = []
                        for im in p.get("images", []):
                            try:
                                cap = self.captioner.caption_image(im)
                                caps.append(cap)
                            except Exception as e:
                                logger.warning(f"[{job_id}] Caption error in {fname}: {e}")
                        captions.append(caps)
                else:
                    captions = [[] for _ in pages]

                # Merge captions into text
                for p, caps in zip(pages, captions):
                    if caps:
                        normalized = [_normalize_caption(c) for c in caps if c]
                        if normalized:
                            p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in normalized])).strip()

                # Build cards
                cards = await build_cards_from_pages(pages, filename=fname, user_id=user_id, project_id=project_id)
                logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")

                # Generate embeddings
                embeddings = self.embedder.embed([c["content"] for c in cards])
                for c, vec in zip(cards, embeddings):
                    c["embedding"] = vec

                # Store in MongoDB
                self.rag.store_cards(cards)

                # Create file summary
                full_text = "\n\n".join(p.get("text", "") for p in pages)
                file_summary = await cheap_summarize(full_text, max_sentences=6)
                self.rag.upsert_file_summary(user_id=user_id, project_id=project_id, filename=fname, summary=file_summary)

                logger.info(f"[{job_id}] Completed {fname}")

            except Exception as e:
                logger.error(f"[{job_id}] Failed processing {fname}: {e}")
                raise

        logger.info(f"[{job_id}] Ingestion complete for {len(files)} files")
        return job_id

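Unlike the route above, `process_files` has no job-state side effects, so it can be driven directly as a coroutine. A minimal driver sketch, assuming the same client wiring as app.py; the PDF path and IDs are illustrative:

```python
# Minimal driver sketch for IngestionService, assuming the same client
# wiring as app.py. The PDF path and IDs are illustrative.
import asyncio
import os

from utils.rag.rag import RAGStore
from utils.embedding import RemoteEmbeddingClient
from services.maverick_captioner import NvidiaMaverickCaptioner
from services.ingestion_service import IngestionService

async def main() -> None:
    rag = RAGStore(mongo_uri=os.getenv("MONGO_URI"), db_name=os.getenv("MONGO_DB", "studybuddy"))
    service = IngestionService(rag, RemoteEmbeddingClient(), NvidiaMaverickCaptioner())
    with open("Lecture5_ML.pdf", "rb") as fh:
        raw = fh.read()
    job_id = await service.process_files(
        user_id="44e65346-8eaa-4f95-b17a-f6219953e7a8",
        project_id="496e2fad-ec7e-4562-b06a-ea2491f2460",
        files=[("Lecture5_ML.pdf", raw)],
        replace_filenames=["Lecture5_ML.pdf"],  # drop prior data, then re-ingest
    )
    print(f"Ingestion finished, job {job_id}")

asyncio.run(main())
```
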
ingestion_python/services/maverick_captioner.py
ADDED
@@ -0,0 +1,141 @@
import base64
import io
import os
from typing import Optional

import requests
from PIL import Image

from utils.logger import get_logger
try:
    from utils.api.rotator import APIKeyRotator  # available in full repo
except Exception:  # standalone fallback
    class APIKeyRotator:  # type: ignore
        def __init__(self, prefix: str = "NVIDIA_API_", max_slots: int = 5):
            self.keys = []
            for i in range(1, max_slots + 1):
                k = os.getenv(f"{prefix}{i}")
                if k:
                    self.keys.append(k)
            if not self.keys:
                single = os.getenv(prefix.rstrip("_"))
                if single:
                    self.keys.append(single)
            self._idx = 0

        def get_key(self) -> Optional[str]:
            if not self.keys:
                return None
            k = self.keys[self._idx % len(self.keys)]
            self._idx += 1
            return k


logger = get_logger("MAVERICK_CAPTIONER", __name__)


def _normalize_caption(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    # Remove common conversational openers and meta phrases
    banned_prefixes = [
        "sure,", "sure.", "sure", "here is", "here are", "this image", "the image", "image shows",
        "the picture", "the photo", "the text describes", "the text describe", "it shows", "it depicts",
        "caption:", "description:", "output:", "result:", "answer:", "analysis:", "observation:",
    ]
    t_lower = t.lower()
    for p in banned_prefixes:
        if t_lower.startswith(p):
            t = t[len(p):].lstrip(" :-\u2014\u2013")
            t_lower = t.lower()

    # Strip surrounding quotes and markdown artifacts
    t = t.strip().strip('"').strip("'").strip()
    # Collapse whitespace
    t = " ".join(t.split())
    return t


class NvidiaMaverickCaptioner:
    """Caption images using NVIDIA Integrate API (meta/llama-4-maverick-17b-128e-instruct)."""

    def __init__(self, rotator: Optional[APIKeyRotator] = None, model: Optional[str] = None):
        self.rotator = rotator or APIKeyRotator(prefix="NVIDIA_API_", max_slots=5)
        self.model = model or os.getenv("NVIDIA_MAVERICK_MODEL", "meta/llama-4-maverick-17b-128e-instruct")
        self.invoke_url = "https://integrate.api.nvidia.com/v1/chat/completions"

    def _encode_image_jpeg_b64(self, image: Image.Image) -> str:
        buf = io.BytesIO()
        # Convert to RGB to ensure JPEG-compatible
        image.convert("RGB").save(buf, format="JPEG", quality=90)
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    def caption_image(self, image: Image.Image) -> str:
        try:
            key = self.rotator.get_key()
            if not key:
                logger.warning("NVIDIA API key not available; skipping image caption.")
                return ""

            img_b64 = self._encode_image_jpeg_b64(image)

            # Strict, non-conversational system prompt
            system_prompt = (
                "You are an expert vision captioner. Produce a precise, information-dense caption of the image. "
                "Do not include conversational phrases, prefaces, meta commentary, or apologies. "
                "Avoid starting with phrases like 'The image/picture/photo shows' or 'Here is'. "
                "Write a single concise paragraph with concrete entities, text in the image, and notable details."
            )

            user_prompt = (
                "Caption this image at the finest level of detail. Include any visible text verbatim. "
                "Return only the caption text."
            )

            # Multimodal content format for NVIDIA Integrate API
            messages = [
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_b64}"
                            }
                        },
                    ]
                },
            ]

            payload = {
                "model": self.model,
                "messages": messages,
                "max_tokens": 512,
                "temperature": 0.2,
                "top_p": 0.9,
                "frequency_penalty": 0.0,
                "presence_penalty": 0.0,
                "stream": False,
            }

            headers = {
                "Authorization": f"Bearer {key}",
                "Accept": "application/json",
                "Content-Type": "application/json",
            }

            resp = requests.post(self.invoke_url, headers=headers, json=payload, timeout=60)
            if resp.status_code >= 400:
                logger.warning(f"Maverick caption API error {resp.status_code}: {resp.text[:200]}")
                return ""
            data = resp.json()
            text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
            return _normalize_caption(text)
        except Exception as e:
            logger.warning(f"Maverick caption failed: {e}")
            return ""

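Usage is a single call per image. Keys are picked up from `NVIDIA_API_1`..`NVIDIA_API_5` (or a single `NVIDIA_API` via the standalone fallback rotator above); a short sketch with an illustrative image path:

```python
# Usage sketch for NvidiaMaverickCaptioner. Set NVIDIA_API_1..NVIDIA_API_5
# (or NVIDIA_API with the fallback rotator); the image path is illustrative.
from PIL import Image

from services.maverick_captioner import NvidiaMaverickCaptioner

captioner = NvidiaMaverickCaptioner()   # rotates across configured keys
img = Image.open("slide_figure.png")
caption = captioner.caption_image(img)  # returns "" on missing key or API error
print(caption or "(no caption produced)")
```
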
ingestion_python/test_upload1.sh
ADDED
@@ -0,0 +1,241 @@
#!/bin/bash

set -euo pipefail

echo "🚀 Testing Ingestion Pipeline Upload"
echo "======================================"

# Configuration
BACKEND_URL="https://binkhoale1812-studdybuddy-ingestion1.hf.space"
USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

# Test files
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FILE1="$SCRIPT_DIR/../exefiles/Lecture5_ML.pdf"
FILE2="$SCRIPT_DIR/../exefiles/Lecture6_ANN_DL.pdf"

# Debug toggles
DEBUG=${DEBUG:-0}
TRACE=${TRACE:-0}

echo "📋 Configuration:"
echo "  Backend URL: $BACKEND_URL"
echo "  User ID: $USER_ID"
echo "  Project ID: $PROJECT_ID"
echo "  Files: $FILE1, $FILE2"
echo ""

# Validate files and resolve absolute paths
if [ ! -f "$FILE1" ]; then
  echo "❌ Missing file: $FILE1"; exit 26
fi
if [ ! -f "$FILE2" ]; then
  echo "❌ Missing file: $FILE2"; exit 26
fi
FILE1_DIR="$(cd "$(dirname "$FILE1")" && pwd)"; FILE1_BASENAME="$(basename "$FILE1")"; FILE1="$FILE1_DIR/$FILE1_BASENAME"
FILE2_DIR="$(cd "$(dirname "$FILE2")" && pwd)"; FILE2_BASENAME="$(basename "$FILE2")"; FILE2="$FILE2_DIR/$FILE2_BASENAME"

curl_base() {
  local method="$1"; shift
  local url="$1"; shift
  local extra=("$@")
  local common=(
    -L --http1.1 --fail-with-body -sS
    --connect-timeout 60
    --retry 5 --retry-delay 4 --retry-connrefused
  )
  if [ "$DEBUG" = "1" ]; then
    common+=( -v )
  fi
  if [ "$TRACE" = "1" ]; then
    common+=( --trace-time --trace-ascii - )
  fi
  curl -X "$method" "$url" "${common[@]}" "${extra[@]}"
}

json_with_status() {
  local method="$1"; shift
  local url="$1"; shift
  local extra=("$@")
  curl_base "$method" "$url" "${extra[@]}" \
    -w "\nHTTP Status: %{http_code}\n"
}

# Step 0: Preflight (for browser parity)
echo "🛰️ Step 0: OPTIONS /upload (preflight parity)"
echo "---------------------------------------------"
json_with_status OPTIONS "$BACKEND_URL/upload" -H "Origin: https://example.com" -H "Access-Control-Request-Method: POST" || true
echo ""; echo ""

# Step 1: Health Check
echo "🏥 Step 1: Health Check"
echo "------------------------"
json_with_status GET "$BACKEND_URL/health" -H "Accept: application/json" || true
echo ""; echo ""

# Step 2: Upload Files
echo "📁 Step 2: Upload Files (sequential)"
echo "------------------------------------"
echo "Uploading $(basename "$FILE1")..."

UPLOAD_HEADERS=$(mktemp)
UPLOAD_BODY=$(mktemp)

set +e
HTTP_CODE=$(curl -L --http1.1 --fail-with-body -sS \
  --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
  -H "Expect:" \
  -X POST "$BACKEND_URL/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F "files=@$FILE1" \
  -D "$UPLOAD_HEADERS" -o "$UPLOAD_BODY" \
  -w "%{http_code}")
RET=$?
set -e

echo "HTTP Status: $HTTP_CODE"
echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS" | sed 's/^/  /'
echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY"

if [ "$RET" -ne 0 ] || [ "$HTTP_CODE" = "000" ]; then
  echo "❌ Upload failed (curl exit=$RET, http=$HTTP_CODE)"; exit 1
fi

# Extract job_id (prefer jq; fall back to python3 reading the body from stdin)
if command -v jq >/dev/null 2>&1; then
  JOB_ID=$(jq -r '.job_id // empty' < "$UPLOAD_BODY")
else
  JOB_ID=$(python3 -c '
import sys, json
try:
    data = json.load(sys.stdin)
    print(data.get("job_id", ""))
except Exception:
    print("")
' < "$UPLOAD_BODY")
fi

if [ -z "${JOB_ID:-}" ]; then
  echo "❌ Failed to extract job_id from upload response"; exit 1
fi

echo ""
echo "✅ Upload 1 initiated successfully!"
echo "  Job ID: $JOB_ID"
echo ""

# Step 3: Monitor Upload Progress
echo "📊 Step 3: Monitor Upload Progress"
echo "----------------------------------"

for i in {1..48}; do
  echo "Checking progress (attempt $i/48)..."
  json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json" | sed 's/^/  /'
  STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json" | tail -n +1)
  if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
    echo "✅ Upload completed successfully!"; break
  elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
    echo "⏳ Still processing... waiting 20 seconds"; sleep 20
  else
    echo "❌ Upload failed or unknown status"; break
  fi
  echo ""
done

echo ""

# Now upload second file after first completes
echo "📁 Step 4: Upload second file"
echo "------------------------------"
echo "Uploading $(basename "$FILE2")..."

UPLOAD_HEADERS2=$(mktemp)
UPLOAD_BODY2=$(mktemp)

set +e
HTTP_CODE2=$(curl -L --http1.1 --fail-with-body -sS \
  --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
  -H "Expect:" \
  -X POST "$BACKEND_URL/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F "files=@$FILE2" \
  -D "$UPLOAD_HEADERS2" -o "$UPLOAD_BODY2" \
  -w "%{http_code}")
RET2=$?
set -e

echo "HTTP Status: $HTTP_CODE2"
echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS2" | sed 's/^/  /'
echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY2"

if [ "$RET2" -ne 0 ] || [ "$HTTP_CODE2" = "000" ]; then
  echo "❌ Upload 2 failed (curl exit=$RET2, http=$HTTP_CODE2)"; exit 1
fi

# Extract job_id2
if command -v jq >/dev/null 2>&1; then
  JOB_ID2=$(jq -r '.job_id // empty' < "$UPLOAD_BODY2")
else
  JOB_ID2=$(python3 -c '
import sys, json
try:
    data = json.load(sys.stdin)
    print(data.get("job_id", ""))
except Exception:
    print("")
' < "$UPLOAD_BODY2")
fi

if [ -z "${JOB_ID2:-}" ]; then
  echo "❌ Failed to extract job_id from second upload response"; exit 1
fi

echo ""
echo "✅ Upload 2 initiated successfully!"
echo "  Job ID: $JOB_ID2"
echo ""

echo "📊 Step 5: Monitor Upload 2 Progress"
echo "-------------------------------------"
for i in {1..48}; do
  echo "Checking progress (attempt $i/48)..."
  json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json" | sed 's/^/  /'
|
| 209 |
+
STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json" | tail -n +1)
|
| 210 |
+
if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
|
| 211 |
+
echo "✅ Upload 2 completed successfully!"; break
|
| 212 |
+
elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
|
| 213 |
+
echo "⏳ Still processing... waiting 20 seconds"; sleep 20
|
| 214 |
+
else
|
| 215 |
+
echo "❌ Upload 2 failed or unknown status"; break
|
| 216 |
+
fi
|
| 217 |
+
echo ""
|
| 218 |
+
done
|
| 219 |
+
|
| 220 |
+
echo ""
|
| 221 |
+
|
| 222 |
+
# Step 5: List Uploaded Files
|
| 223 |
+
echo "📋 Step 4: List Uploaded Files"
|
| 224 |
+
echo "-------------------------------"
|
| 225 |
+
json_with_status GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | sed 's/^/ /'
|
| 226 |
+
echo ""; echo ""
|
| 227 |
+
|
| 228 |
+
# Step 5: Get File Chunks (for Lecture7_GA_EC.pdf)
|
| 229 |
+
echo "🔍 Step 5: Get File Chunks for Lecture7_GA_EC.pdf"
|
| 230 |
+
echo "----------------------------------------------"
|
| 231 |
+
json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Lecture7_GA_EC.pdf&limit=5" -H "Accept: application/json" | sed 's/^/ /'
|
| 232 |
+
echo ""; echo ""
|
| 233 |
+
|
| 234 |
+
# Step 6: Get File Chunks (for Tut7.pdf)
|
| 235 |
+
echo "🔍 Step 6: Get File Chunks for Tut7.pdf"
|
| 236 |
+
echo "------------------------------------------------"
|
| 237 |
+
json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Tut7.pdf&limit=5" -H "Accept: application/json" | sed 's/^/ /'
|
| 238 |
+
|
| 239 |
+
echo ""
|
| 240 |
+
echo "🎉 Test completed!"
|
| 241 |
+
echo "=================="
|
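All three test scripts exercise the same pattern: POST the file to `/upload`, read `job_id` from the JSON response, then poll `/upload/status` until the job leaves the `processing` state. For readers who want the same flow without curl, here is a minimal Python sketch; the `BACKEND_URL` value and the helper name `upload_and_wait` are illustrative, and only the `/upload` and `/upload/status` behavior shown above is assumed.

```python
# Minimal sketch of the upload-then-poll flow the shell scripts exercise.
# Assumptions: BACKEND_URL points at one of the ingestion Spaces, and the
# endpoints behave exactly as in the curl calls above. Not part of the repo.
import time
import requests

BACKEND_URL = "https://binkhoale1812-studdybuddy-ingestion2.hf.space"  # example Space


def upload_and_wait(path: str, user_id: str, project_id: str, attempts: int = 48) -> dict:
    with open(path, "rb") as fh:
        resp = requests.post(
            f"{BACKEND_URL}/upload",
            data={"user_id": user_id, "project_id": project_id},
            files={"files": fh},
            timeout=60,
        )
    resp.raise_for_status()
    job_id = resp.json().get("job_id")
    if not job_id:
        raise RuntimeError("no job_id in upload response")
    for _ in range(attempts):
        status = requests.get(
            f"{BACKEND_URL}/upload/status", params={"job_id": job_id}, timeout=60
        ).json()
        if status.get("status") != "processing":
            return status  # completed, failed, or unknown
        time.sleep(20)  # same cadence as the shell scripts
    return {"status": "timeout", "job_id": job_id}
```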
ingestion_python/test_upload2.sh
ADDED
@@ -0,0 +1,238 @@
#!/bin/bash

set -euo pipefail

echo "🚀 Testing Ingestion Pipeline Upload"
echo "======================================"

# Configuration
BACKEND_URL="https://binkhoale1812-studdybuddy-ingestion2.hf.space"
USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

# Test files (resolve relative to this script's directory)
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FILE1="$SCRIPT_DIR/../exefiles/Lecture7_GA_EC.pdf"
FILE2="$SCRIPT_DIR/../exefiles/Tut7.pdf"

# Debug toggles
DEBUG=${DEBUG:-0}
TRACE=${TRACE:-0}

echo "📋 Configuration:"
echo " Backend URL: $BACKEND_URL"
echo " User ID: $USER_ID"
echo " Project ID: $PROJECT_ID"
echo " Files: $FILE1, $FILE2"
echo ""

# Validate files and resolve absolute paths
if [ ! -f "$FILE1" ]; then
  echo "❌ Missing file: $FILE1"; exit 26
fi
if [ ! -f "$FILE2" ]; then
  echo "❌ Missing file: $FILE2"; exit 26
fi
FILE1_DIR="$(cd "$(dirname "$FILE1")" && pwd)"; FILE1_BASENAME="$(basename "$FILE1")"; FILE1="$FILE1_DIR/$FILE1_BASENAME"
FILE2_DIR="$(cd "$(dirname "$FILE2")" && pwd)"; FILE2_BASENAME="$(basename "$FILE2")"; FILE2="$FILE2_DIR/$FILE2_BASENAME"

curl_base() {
  local method="$1"; shift
  local url="$1"; shift
  local extra=("$@")
  local common=(
    -L --http1.1 --fail-with-body -sS
    --connect-timeout 60
    --retry 5 --retry-delay 4 --retry-connrefused
  )
  if [ "$DEBUG" = "1" ]; then
    common+=( -v )
  fi
  if [ "$TRACE" = "1" ]; then
    common+=( --trace-time --trace-ascii - )
  fi
  curl -X "$method" "$url" "${common[@]}" "${extra[@]}"
}

json_with_status() {
  local method="$1"; shift
  local url="$1"; shift
  local extra=("$@")
  curl_base "$method" "$url" "${extra[@]}" \
    -w "\nHTTP Status: %{http_code}\n"
}

# Step 0: Preflight (for browser parity)
echo "🛰️ Step 0: OPTIONS /upload (preflight parity)"
echo "---------------------------------------------"
json_with_status OPTIONS "$BACKEND_URL/upload" -H "Origin: https://example.com" -H "Access-Control-Request-Method: POST" || true
echo ""; echo ""

# Step 1: Health Check
echo "🏥 Step 1: Health Check"
echo "------------------------"
json_with_status GET "$BACKEND_URL/health" -H "Accept: application/json" || true
echo ""; echo ""

# Step 2: Upload first file
echo "📁 Step 2: Upload Files (sequential)"
echo "------------------------------------"
echo "Uploading $(basename "$FILE1")..."

UPLOAD_HEADERS=$(mktemp)
UPLOAD_BODY=$(mktemp)

set +e
HTTP_CODE=$(curl -L --http1.1 --fail-with-body -sS \
  --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
  -H "Expect:" \
  -X POST "$BACKEND_URL/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F "files=@$FILE1" \
  -D "$UPLOAD_HEADERS" -o "$UPLOAD_BODY" \
  -w "%{http_code}")
RET=$?
set -e

echo "HTTP Status: $HTTP_CODE"
echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS" | sed 's/^/  /'
echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY"

if [ "$RET" -ne 0 ] || [ "$HTTP_CODE" = "000" ]; then
  echo "❌ Upload failed (curl exit=$RET, http=$HTTP_CODE)"; exit 1
fi

# Extract job_id (prefer jq; the python3 fallback reads the body from stdin)
if command -v jq >/dev/null 2>&1; then
  JOB_ID=$(jq -r '.job_id // empty' < "$UPLOAD_BODY")
else
  JOB_ID=$(python3 -c 'import sys, json
try:
    print(json.load(sys.stdin).get("job_id", ""))
except Exception:
    print("")' < "$UPLOAD_BODY")
fi

if [ -z "${JOB_ID:-}" ]; then
  echo "❌ Failed to extract job_id from upload response"; exit 1
fi

echo ""
echo "✅ Upload 1 initiated successfully!"
echo " Job ID: $JOB_ID"
echo ""

# Step 3: Monitor upload 1 progress (fetch the status once per attempt and reuse it)
echo "📊 Step 3: Monitor Upload Progress"
echo "----------------------------------"

for i in {1..48}; do
  echo "Checking progress (attempt $i/48)..."
  STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json")
  echo "$STATUS_LINE" | sed 's/^/  /'
  if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
    echo "✅ Upload completed successfully!"; break
  elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
    echo "⏳ Still processing... waiting 20 seconds"; sleep 20
  else
    echo "❌ Upload failed or unknown status"; break
  fi
  echo ""
done

echo ""

# Step 4: Upload second file
echo "📁 Step 4: Upload second file"
echo "------------------------------"
echo "Uploading $(basename "$FILE2")..."

UPLOAD_HEADERS2=$(mktemp)
UPLOAD_BODY2=$(mktemp)

set +e
HTTP_CODE2=$(curl -L --http1.1 --fail-with-body -sS \
  --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
  -H "Expect:" \
  -X POST "$BACKEND_URL/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F "files=@$FILE2" \
  -D "$UPLOAD_HEADERS2" -o "$UPLOAD_BODY2" \
  -w "%{http_code}")
RET2=$?
set -e

echo "HTTP Status: $HTTP_CODE2"
echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS2" | sed 's/^/  /'
echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY2"

if [ "$RET2" -ne 0 ] || [ "$HTTP_CODE2" = "000" ]; then
  echo "❌ Upload 2 failed (curl exit=$RET2, http=$HTTP_CODE2)"; exit 1
fi

if command -v jq >/dev/null 2>&1; then
  JOB_ID2=$(jq -r '.job_id // empty' < "$UPLOAD_BODY2")
else
  JOB_ID2=$(python3 -c 'import sys, json
try:
    print(json.load(sys.stdin).get("job_id", ""))
except Exception:
    print("")' < "$UPLOAD_BODY2")
fi

if [ -z "${JOB_ID2:-}" ]; then
  echo "❌ Failed to extract job_id from second upload response"; exit 1
fi

echo ""
echo "✅ Upload 2 initiated successfully!"
echo " Job ID: $JOB_ID2"
echo ""

# Step 5: Monitor upload 2 progress
echo "📊 Step 5: Monitor Upload 2 Progress"
echo "-------------------------------------"
for i in {1..48}; do
  echo "Checking progress (attempt $i/48)..."
  STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json")
  echo "$STATUS_LINE" | sed 's/^/  /'
  if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
    echo "✅ Upload 2 completed successfully!"; break
  elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
    echo "⏳ Still processing... waiting 20 seconds"; sleep 20
  else
    echo "❌ Upload 2 failed or unknown status"; break
  fi
  echo ""
done

echo ""

# Step 6: List Uploaded Files
echo "📋 Step 6: List Uploaded Files"
echo "-------------------------------"
json_with_status GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | sed 's/^/  /'
echo ""; echo ""

# Step 7: Get File Chunks (for Lecture7_GA_EC.pdf)
echo "🔍 Step 7: Get File Chunks for Lecture7_GA_EC.pdf"
echo "----------------------------------------------"
json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Lecture7_GA_EC.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'
echo ""; echo ""

# Step 8: Get File Chunks (for Tut7.pdf)
echo "🔍 Step 8: Get File Chunks for Tut7.pdf"
echo "------------------------------------------------"
json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Tut7.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'

echo ""
echo "🎉 Test completed!"
echo "=================="
ingestion_python/test_upload3.sh
ADDED
@@ -0,0 +1,227 @@
#!/bin/bash

set -euo pipefail

echo "🚀 Testing Ingestion Pipeline Upload"
echo "======================================"

# Configuration
BACKEND_URL="https://binkhoale1812-studdybuddy-ingestion3.hf.space"
USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

# Test files
FILE1="../exefiles/Lecture8_PSO_ACO.pdf"
FILE2="../exefiles/Tut8.pdf"

# Debug toggles
DEBUG=${DEBUG:-0}
TRACE=${TRACE:-0}

echo "📋 Configuration:"
echo " Backend URL: $BACKEND_URL"
echo " User ID: $USER_ID"
echo " Project ID: $PROJECT_ID"
echo " Files: $FILE1, $FILE2"
echo ""

curl_base() {
  local method="$1"; shift
  local url="$1"; shift
  local extra=("$@")
  local common=(
    -L --http1.1 --fail-with-body -sS
    --connect-timeout 60
    --retry 5 --retry-delay 4 --retry-connrefused
  )
  if [ "$DEBUG" = "1" ]; then
    common+=( -v )
  fi
  if [ "$TRACE" = "1" ]; then
    common+=( --trace-time --trace-ascii - )
  fi
  curl -X "$method" "$url" "${common[@]}" "${extra[@]}"
}

json_with_status() {
  local method="$1"; shift
  local url="$1"; shift
  local extra=("$@")
  curl_base "$method" "$url" "${extra[@]}" \
    -w "\nHTTP Status: %{http_code}\n"
}

# Step 0: Preflight (for browser parity)
echo "🛰️ Step 0: OPTIONS /upload (preflight parity)"
echo "---------------------------------------------"
json_with_status OPTIONS "$BACKEND_URL/upload" -H "Origin: https://example.com" -H "Access-Control-Request-Method: POST" || true
echo ""; echo ""

# Step 1: Health Check
echo "🏥 Step 1: Health Check"
echo "------------------------"
json_with_status GET "$BACKEND_URL/health" -H "Accept: application/json" || true
echo ""; echo ""

# Step 2: Upload first file
echo "📁 Step 2: Upload Files (sequential)"
echo "------------------------------------"
echo "Uploading $(basename "$FILE1")..."

UPLOAD_HEADERS=$(mktemp)
UPLOAD_BODY=$(mktemp)

set +e
HTTP_CODE=$(curl -L --http1.1 --fail-with-body -sS \
  --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
  -H "Expect:" \
  -X POST "$BACKEND_URL/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F "files=@$FILE1" \
  -D "$UPLOAD_HEADERS" -o "$UPLOAD_BODY" \
  -w "%{http_code}")
RET=$?
set -e

echo "HTTP Status: $HTTP_CODE"
echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS" | sed 's/^/  /'
echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY"

if [ "$RET" -ne 0 ] || [ "$HTTP_CODE" = "000" ]; then
  echo "❌ Upload failed (curl exit=$RET, http=$HTTP_CODE)"; exit 1
fi

# Extract job_id (prefer jq; the python3 fallback reads the body from stdin)
if command -v jq >/dev/null 2>&1; then
  JOB_ID=$(jq -r '.job_id // empty' < "$UPLOAD_BODY")
else
  JOB_ID=$(python3 -c 'import sys, json
try:
    print(json.load(sys.stdin).get("job_id", ""))
except Exception:
    print("")' < "$UPLOAD_BODY")
fi

if [ -z "${JOB_ID:-}" ]; then
  echo "❌ Failed to extract job_id from upload response"; exit 1
fi

echo ""
echo "✅ Upload 1 initiated successfully!"
echo " Job ID: $JOB_ID"
echo ""

# Step 3: Monitor upload 1 progress (fetch the status once per attempt and reuse it)
echo "📊 Step 3: Monitor Upload Progress"
echo "----------------------------------"

for i in {1..48}; do
  echo "Checking progress (attempt $i/48)..."
  STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json")
  echo "$STATUS_LINE" | sed 's/^/  /'
  if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
    echo "✅ Upload completed successfully!"; break
  elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
    echo "⏳ Still processing... waiting 20 seconds"; sleep 20
  else
    echo "❌ Upload failed or unknown status"; break
  fi
  echo ""
done

echo ""

# Step 4: Upload second file
echo "📁 Step 4: Upload second file"
echo "------------------------------"
echo "Uploading $(basename "$FILE2")..."

UPLOAD_HEADERS2=$(mktemp)
UPLOAD_BODY2=$(mktemp)

set +e
HTTP_CODE2=$(curl -L --http1.1 --fail-with-body -sS \
  --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
  -H "Expect:" \
  -X POST "$BACKEND_URL/upload" \
  -F "user_id=$USER_ID" \
  -F "project_id=$PROJECT_ID" \
  -F "files=@$FILE2" \
  -D "$UPLOAD_HEADERS2" -o "$UPLOAD_BODY2" \
  -w "%{http_code}")
RET2=$?
set -e

echo "HTTP Status: $HTTP_CODE2"
echo "--- Response Headers ---"; sed -e 's/\r$//' "$UPLOAD_HEADERS2" | sed 's/^/  /'
echo "--- Response Body ---"; sed 's/^/  /' "$UPLOAD_BODY2"

if [ "$RET2" -ne 0 ] || [ "$HTTP_CODE2" = "000" ]; then
  echo "❌ Upload 2 failed (curl exit=$RET2, http=$HTTP_CODE2)"; exit 1
fi

if command -v jq >/dev/null 2>&1; then
  JOB_ID2=$(jq -r '.job_id // empty' < "$UPLOAD_BODY2")
else
  JOB_ID2=$(python3 -c 'import sys, json
try:
    print(json.load(sys.stdin).get("job_id", ""))
except Exception:
    print("")' < "$UPLOAD_BODY2")
fi

if [ -z "${JOB_ID2:-}" ]; then
  echo "❌ Failed to extract job_id from second upload response"; exit 1
fi

echo ""
echo "✅ Upload 2 initiated successfully!"
echo " Job ID: $JOB_ID2"
echo ""

# Step 5: Monitor upload 2 progress
echo "📊 Step 5: Monitor Upload 2 Progress"
echo "-------------------------------------"
for i in {1..48}; do
  echo "Checking progress (attempt $i/48)..."
  STATUS_LINE=$(json_with_status GET "$BACKEND_URL/upload/status?job_id=$JOB_ID2" -H "Accept: application/json")
  echo "$STATUS_LINE" | sed 's/^/  /'
  if echo "$STATUS_LINE" | grep -q '"status":"completed"'; then
    echo "✅ Upload 2 completed successfully!"; break
  elif echo "$STATUS_LINE" | grep -q '"status":"processing"'; then
    echo "⏳ Still processing... waiting 20 seconds"; sleep 20
  else
    echo "❌ Upload 2 failed or unknown status"; break
  fi
  echo ""
done

echo ""

# Step 6: List Uploaded Files
echo "📋 Step 6: List Uploaded Files"
echo "-------------------------------"
json_with_status GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | sed 's/^/  /'
echo ""; echo ""

# Step 7: Get File Chunks (for Lecture8_PSO_ACO.pdf)
echo "🔍 Step 7: Get File Chunks for Lecture8_PSO_ACO.pdf"
echo "----------------------------------------------"
json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Lecture8_PSO_ACO.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'
echo ""; echo ""

# Step 8: Get File Chunks (for Tut8.pdf)
echo "🔍 Step 8: Get File Chunks for Tut8.pdf"
echo "------------------------------------------------"
json_with_status GET "$BACKEND_URL/files/chunks?user_id=$USER_ID&project_id=$PROJECT_ID&filename=Tut8.pdf&limit=5" -H "Accept: application/json" | sed 's/^/  /'

echo ""
echo "🎉 Test completed!"
echo "=================="
ingestion_python/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
ingestion_python/utils/api/rotator.py
ADDED
@@ -0,0 +1,67 @@
# ────────────────────────────── utils/rotator.py ──────────────────────────────
import os
import itertools
from ..logger import get_logger
from typing import Optional

import httpx

logger = get_logger("ROTATOR", __name__)


class APIKeyRotator:
    """
    Round-robin API key rotator.
    - Loads keys from env vars with given prefix (e.g., GEMINI_API_1..6)
    - get_key() returns current key
    - rotate() moves to next key
    - on HTTP 401/429/5xx you should call rotate() and retry (bounded)
    """
    def __init__(self, prefix: str, max_slots: int = 6):
        self.keys = []
        for i in range(1, max_slots + 1):
            v = os.getenv(f"{prefix}{i}")
            if v:
                self.keys.append(v.strip())
        if not self.keys:
            logger.warning(f"No API keys found for prefix {prefix}. Calls will likely fail.")
            self._cycle = itertools.cycle([""])
        else:
            self._cycle = itertools.cycle(self.keys)
        self.current = next(self._cycle)

    def get_key(self) -> Optional[str]:
        return self.current

    def rotate(self) -> Optional[str]:
        self.current = next(self._cycle)
        logger.info("Rotated API key.")
        return self.current


async def robust_post_json(url: str, headers: dict, payload: dict, rotator: APIKeyRotator, max_retries: int = 6):
    """
    POST JSON with simple retry+rotate on 401/403/429/5xx.
    Returns the parsed JSON response.
    Note: `url` and `headers` are built by the caller and may embed the current
    key, so a rotation here takes effect on the caller's next request, not on
    the in-flight retry.
    """
    for attempt in range(max_retries):
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                r = await client.post(url, headers=headers, json=payload)
                logger.info(f"[ROTATOR] HTTP {r.status_code} response from {url}")

                if r.status_code in (401, 403, 429) or (500 <= r.status_code < 600):
                    logger.warning(f"HTTP {r.status_code} from provider. Rotating key and retrying ({attempt+1}/{max_retries})")
                    logger.warning(f"Response body: {r.text}")
                    rotator.rotate()
                    continue
                r.raise_for_status()

                response_data = r.json()
                logger.info(f"[ROTATOR] Successfully parsed JSON response with keys: {list(response_data.keys()) if isinstance(response_data, dict) else 'Not a dict'}")
                return response_data
        except Exception as e:
            logger.warning(f"Request error: {e}. Rotating and retrying ({attempt+1}/{max_retries})")
            logger.warning(f"Request URL: {url}")  # headers are not logged: they can carry API keys
            rotator.rotate()
    raise RuntimeError("Provider request failed after retries.")
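A minimal usage sketch for the rotator and `robust_post_json` follows. The env-slot prefix `NVIDIA_API_`, the endpoint, and the payload mirror how the router below wires things, but this snippet itself is illustrative, not repo code. Note that the key is baked into `headers` before the call, so a rotation triggered inside `robust_post_json` only affects the caller's next request.

```python
# Illustrative wiring of APIKeyRotator + robust_post_json (assumes env vars
# NVIDIA_API_1..NVIDIA_API_6 hold keys; endpoint/payload as used by the router).
import asyncio

from utils.api.rotator import APIKeyRotator, robust_post_json


async def main() -> None:
    rotator = APIKeyRotator(prefix="NVIDIA_API_", max_slots=6)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {rotator.get_key()}",  # key captured at call time
    }
    payload = {
        "model": "meta/llama-3.1-8b-instruct",
        "messages": [{"role": "user", "content": "ping"}],
    }
    data = await robust_post_json(
        "https://integrate.api.nvidia.com/v1/chat/completions", headers, payload, rotator
    )
    print(data["choices"][0]["message"]["content"])


asyncio.run(main())
```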
ingestion_python/utils/api/router.py
ADDED
@@ -0,0 +1,359 @@
# ────────────────────────────── utils/router.py ──────────────────────────────
import os
import json

import httpx

from ..logger import get_logger
from typing import Dict, Any, Optional
from .rotator import robust_post_json, APIKeyRotator

logger = get_logger("ROUTER", __name__)

# Default model names (can be overridden via env)
GEMINI_SMALL = os.getenv("GEMINI_SMALL", "gemini-2.5-flash-lite")
GEMINI_MED = os.getenv("GEMINI_MED", "gemini-2.5-flash")
GEMINI_PRO = os.getenv("GEMINI_PRO", "gemini-2.5-pro")

# NVIDIA model hierarchy (can be overridden via env)
NVIDIA_SMALL = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")          # Llama model for easy-complexity tasks
NVIDIA_MEDIUM = os.getenv("NVIDIA_MEDIUM", "qwen/qwen3-next-80b-a3b-thinking")  # Qwen model for reasoning tasks
NVIDIA_LARGE = os.getenv("NVIDIA_LARGE", "openai/gpt-oss-120b")                 # GPT-OSS model for hard/long-context tasks


def select_model(question: str, context: str) -> Dict[str, Any]:
    """
    Enhanced three-tier model selection system:
    - Easy tasks (immediate execution, simple) -> Llama (NVIDIA small)
    - Reasoning tasks (analysis, decision-making, JSON parsing) -> Qwen (NVIDIA medium)
    - Hard/long-context tasks (complex synthesis, long-form) -> GPT-OSS (NVIDIA large)
    - Very complex tasks (research, comprehensive analysis) -> Gemini Pro
    """
    qlen = len(question.split())
    clen = len(context.split())

    # Very hard task keywords - require Gemini Pro (research, comprehensive analysis)
    very_hard_keywords = ("prove", "derivation", "complexity", "algorithm", "optimize", "theorem", "rigorous", "step-by-step", "policy critique", "ambiguity", "counterfactual", "comprehensive", "detailed analysis", "synthesis", "evaluation", "research", "investigation", "comprehensive study")

    # Hard/long context keywords - require NVIDIA Large (GPT-OSS)
    hard_keywords = ("analyze", "explain", "compare", "evaluate", "summarize", "extract", "classify", "identify", "describe", "discuss", "synthesis", "consolidate", "process", "generate", "create", "develop", "build", "construct")

    # Reasoning task keywords - require Qwen (thinking/reasoning)
    reasoning_keywords = ("reasoning", "context", "enhance", "select", "decide", "choose", "determine", "assess", "judge", "consider", "think", "reason", "logic", "inference", "deduction", "analysis", "interpretation")

    # Simple task keywords - immediate execution
    simple_keywords = ("what", "how", "when", "where", "who", "yes", "no", "count", "list", "find", "search", "lookup")

    # Determine complexity level
    is_very_hard = (
        any(k in question.lower() for k in very_hard_keywords) or
        qlen > 120 or
        clen > 4000 or
        "comprehensive" in question.lower() or
        "detailed" in question.lower() or
        "research" in question.lower()
    )

    is_hard = (
        any(k in question.lower() for k in hard_keywords) or
        qlen > 50 or
        clen > 1500 or
        "synthesis" in question.lower() or
        "generate" in question.lower() or
        "create" in question.lower()
    )

    is_reasoning = (
        any(k in question.lower() for k in reasoning_keywords) or
        qlen > 20 or
        clen > 800 or
        "enhance" in question.lower() or
        "context" in question.lower() or
        "select" in question.lower() or
        "decide" in question.lower()
    )

    # Simple tasks are the default branch below; this flag is kept for readability.
    is_simple = (
        any(k in question.lower() for k in simple_keywords) or
        qlen <= 10 or
        clen <= 200
    )

    if is_very_hard:
        # Use Gemini Pro for very complex tasks requiring advanced reasoning
        return {"provider": "gemini", "model": GEMINI_PRO}
    elif is_hard:
        # Use NVIDIA Large (GPT-OSS) for hard/long-context tasks
        return {"provider": "nvidia_large", "model": NVIDIA_LARGE}
    elif is_reasoning:
        # Use Qwen for reasoning tasks requiring thinking
        return {"provider": "qwen", "model": NVIDIA_MEDIUM}
    else:
        # Use NVIDIA small (Llama) for simple tasks requiring immediate execution
        return {"provider": "nvidia", "model": NVIDIA_SMALL}


async def generate_answer_with_model(selection: Dict[str, Any], system_prompt: str, user_prompt: str,
                                     gemini_rotator: APIKeyRotator, nvidia_rotator: APIKeyRotator,
                                     user_id: Optional[str] = None, context: str = "") -> str:
    provider = selection["provider"]
    model = selection["model"]

    if provider == "gemini":
        # Try Gemini first
        try:
            key = gemini_rotator.get_key() or ""
            url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={key}"
            payload = {
                "contents": [
                    {"role": "user", "parts": [{"text": f"{system_prompt}\n\n{user_prompt}"}]}
                ],
                "generationConfig": {"temperature": 0.2}
            }
            headers = {"Content-Type": "application/json"}
            data = await robust_post_json(url, headers, payload, gemini_rotator)

            content = data["candidates"][0]["content"]["parts"][0]["text"]
            if not content or content.strip() == "":
                logger.warning(f"Empty content from Gemini model: {data}")
                raise Exception("Empty content from Gemini")
            return content
        except Exception as e:
            logger.warning(f"Gemini model {model} failed: {e}. Attempting fallback...")

            # Fallback logic: GEMINI_PRO/MED → NVIDIA_LARGE, GEMINI_SMALL → NVIDIA_SMALL
            if model in [GEMINI_PRO, GEMINI_MED]:
                logger.info(f"Falling back from {model} to NVIDIA_LARGE")
                fallback_selection = {"provider": "nvidia_large", "model": NVIDIA_LARGE}
                return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
            elif model == GEMINI_SMALL:
                logger.info(f"Falling back from {model} to NVIDIA_SMALL")
                fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
                return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
            else:
                logger.error(f"No fallback defined for Gemini model: {model}")
                return "I couldn't parse the model response."

    elif provider == "nvidia":
        # Try NVIDIA small model first
        try:
            key = nvidia_rotator.get_key() or ""
            url = "https://integrate.api.nvidia.com/v1/chat/completions"
            payload = {
                "model": model,
                "temperature": 0.2,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
            }
            headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

            logger.info(f"[ROUTER] NVIDIA API call - Model: {model}, Key present: {bool(key)}")
            logger.info(f"[ROUTER] System prompt length: {len(system_prompt)}, User prompt length: {len(user_prompt)}")

            data = await robust_post_json(url, headers, payload, nvidia_rotator)

            logger.info(f"[ROUTER] NVIDIA API response type: {type(data)}, keys: {list(data.keys()) if isinstance(data, dict) else 'Not a dict'}")
            content = data["choices"][0]["message"]["content"]
            if not content or content.strip() == "":
                logger.warning(f"Empty content from NVIDIA model: {data}")
                raise Exception("Empty content from NVIDIA")
            return content
        except Exception as e:
            logger.warning(f"NVIDIA model {model} failed: {e}. Attempting fallback...")

            # Fallback: NVIDIA_SMALL → basic response (no smaller model to fall back to)
            if model == NVIDIA_SMALL:
                logger.info(f"Falling back from {model} to basic response")
                return "I'm experiencing technical difficulties with the AI model. Please try again later."
            else:
                logger.error(f"No fallback defined for NVIDIA model: {model}")
                return "I couldn't parse the model response."

    elif provider == "qwen":
        # Use Qwen for reasoning tasks with fallback
        try:
            return await qwen_chat_completion(system_prompt, user_prompt, nvidia_rotator, user_id, context)
        except Exception as e:
            logger.warning(f"Qwen model failed: {e}. Attempting fallback...")
            # Fallback: Qwen → NVIDIA_SMALL
            logger.info("Falling back from Qwen to NVIDIA_SMALL")
            fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
            return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
    elif provider == "nvidia_large":
        # Use NVIDIA Large (GPT-OSS) for hard/long-context tasks with fallback
        try:
            return await nvidia_large_chat_completion(system_prompt, user_prompt, nvidia_rotator, user_id, context)
        except Exception as e:
            logger.warning(f"NVIDIA_LARGE model failed: {e}. Attempting fallback...")
            # Fallback: NVIDIA_LARGE → NVIDIA_SMALL
            logger.info("Falling back from NVIDIA_LARGE to NVIDIA_SMALL")
            fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
            return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)
    elif provider == "nvidia_coder":
        # Use NVIDIA Coder for code generation tasks with fallback
        try:
            from helpers.coder import nvidia_coder_completion
            return await nvidia_coder_completion(system_prompt, user_prompt, nvidia_rotator, user_id, context)
        except Exception as e:
            logger.warning(f"NVIDIA_CODER model failed: {e}. Attempting fallback...")
            # Fallback: NVIDIA_CODER → NVIDIA_SMALL
            logger.info("Falling back from NVIDIA_CODER to NVIDIA_SMALL")
            fallback_selection = {"provider": "nvidia", "model": NVIDIA_SMALL}
            return await generate_answer_with_model(fallback_selection, system_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id, context)

    return "Unsupported provider."


async def qwen_chat_completion(system_prompt: str, user_prompt: str, nvidia_rotator: APIKeyRotator, user_id: Optional[str] = None, context: str = "") -> str:
    """
    Qwen chat completion with thinking mode enabled.
    Uses the NVIDIA API rotator for key management.
    """
    key = nvidia_rotator.get_key() or ""
    url = "https://integrate.api.nvidia.com/v1/chat/completions"

    payload = {
        "model": NVIDIA_MEDIUM,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.6,
        "top_p": 0.7,
        "max_tokens": 8192,
        "stream": True
    }

    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

    logger.info(f"[QWEN] API call - Model: {NVIDIA_MEDIUM}, Key present: {bool(key)}")
    logger.info(f"[QWEN] System prompt length: {len(system_prompt)}, User prompt length: {len(user_prompt)}")

    try:
        # client.post() buffers the full SSE body; aiter_lines() below replays it from memory
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(url, headers=headers, json=payload)

            if response.status_code in (401, 403, 429) or (500 <= response.status_code < 600):
                logger.warning(f"HTTP {response.status_code} from Qwen provider. Rotating key and retrying")
                nvidia_rotator.rotate()
                # Retry once with new key
                key = nvidia_rotator.get_key() or ""
                headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
                response = await client.post(url, headers=headers, json=payload)

            response.raise_for_status()

            # Handle streaming response
            content = ""
            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    data = line[6:]  # Remove "data: " prefix
                    if data.strip() == "[DONE]":
                        break

                    try:
                        chunk_data = json.loads(data)
                        if "choices" in chunk_data and len(chunk_data["choices"]) > 0:
                            delta = chunk_data["choices"][0].get("delta", {})

                            # Handle reasoning content (thinking)
                            reasoning = delta.get("reasoning_content")
                            if reasoning:
                                logger.debug(f"[QWEN] Reasoning: {reasoning}")

                            # Handle regular content
                            chunk_content = delta.get("content")
                            if chunk_content:
                                content += chunk_content
                    except json.JSONDecodeError:
                        continue

            if not content or content.strip() == "":
                logger.warning("Empty content from Qwen model")
                return "I received an empty response from the model."

            return content.strip()

    except Exception as e:
        logger.warning(f"Qwen API error: {e}")
        return "I couldn't process the request with Qwen model."


async def nvidia_large_chat_completion(system_prompt: str, user_prompt: str, nvidia_rotator: APIKeyRotator, user_id: Optional[str] = None, context: str = "") -> str:
    """
    NVIDIA Large (GPT-OSS) chat completion for hard/long-context tasks.
    Uses the NVIDIA API rotator for key management.
    """
    key = nvidia_rotator.get_key() or ""
    url = "https://integrate.api.nvidia.com/v1/chat/completions"

    payload = {
        "model": NVIDIA_LARGE,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 1.0,
        "top_p": 1.0,
        "max_tokens": 4096,
        "stream": True
    }

    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

    logger.info(f"[NVIDIA_LARGE] API call - Model: {NVIDIA_LARGE}, Key present: {bool(key)}")
    logger.info(f"[NVIDIA_LARGE] System prompt length: {len(system_prompt)}, User prompt length: {len(user_prompt)}")

    try:
        # client.post() buffers the full SSE body; aiter_lines() below replays it from memory
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(url, headers=headers, json=payload)

            if response.status_code in (401, 403, 429) or (500 <= response.status_code < 600):
                logger.warning(f"HTTP {response.status_code} from NVIDIA Large provider. Rotating key and retrying")
                nvidia_rotator.rotate()
                # Retry once with new key
                key = nvidia_rotator.get_key() or ""
                headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
                response = await client.post(url, headers=headers, json=payload)

            response.raise_for_status()

            # Handle streaming response
            content = ""
            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    data = line[6:]  # Remove "data: " prefix
                    if data.strip() == "[DONE]":
                        break

                    try:
                        chunk_data = json.loads(data)
                        if "choices" in chunk_data and len(chunk_data["choices"]) > 0:
                            delta = chunk_data["choices"][0].get("delta", {})

                            # Handle reasoning content (thinking)
                            reasoning = delta.get("reasoning_content")
                            if reasoning:
                                logger.debug(f"[NVIDIA_LARGE] Reasoning: {reasoning}")

                            # Handle regular content
                            chunk_content = delta.get("content")
                            if chunk_content:
                                content += chunk_content
                    except json.JSONDecodeError:
                        continue

            if not content or content.strip() == "":
                logger.warning("Empty content from NVIDIA Large model")
                return "I received an empty response from the model."

            return content.strip()

    except Exception as e:
        logger.warning(f"NVIDIA Large API error: {e}")
        return "I couldn't process the request with NVIDIA Large model."
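Because keyword matching in `select_model` is by substring on the lowercased question, short probe questions make the tier boundaries easy to see (note, for example, that "optimizers" would also trip the very-hard keyword "optimize"). A quick illustrative check, with expected providers worked out by hand from the keyword tuples above:

```python
# Hand-checked probes of select_model's four tiers; expected outputs inferred
# from the keyword tuples and length thresholds above.
from utils.api.router import select_model

print(select_model("What is PSO?", "short context")["provider"])
# -> "nvidia"        (no tier keyword hits; falls through to the simple default)

print(select_model("Decide which context chunks to keep", "...")["provider"])
# -> "qwen"          ("decide" and "context" are reasoning keywords)

print(select_model("Compare the two update rules", "...")["provider"])
# -> "nvidia_large"  ("compare" is a hard keyword)

print(select_model("Prove the convergence theorem rigorously", "...")["provider"])
# -> "gemini"        ("prove"/"theorem"/"rigorous" are very-hard keywords)
```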
ingestion_python/utils/embedding.py
ADDED
@@ -0,0 +1,44 @@
import os
from typing import List

import requests

from utils.logger import get_logger


logger = get_logger("REMOTE_EMBED", __name__)


class RemoteEmbeddingClient:
    """Client to call external embedding service /embed endpoint.

    Expects env EMBED_BASE_URL, e.g. https://<space>.hf.space
    """

    def __init__(self, base_url: str | None = None, timeout: int = 60):
        self.base_url = (base_url or os.getenv("EMBED_BASE_URL", "https://binkhoale1812-embedding.hf.space")).rstrip("/")
        if not self.base_url:
            raise RuntimeError("EMBED_BASE_URL is required for RemoteEmbeddingClient")
        self.timeout = timeout

    def embed(self, texts: List[str]) -> List[list]:
        if not texts:
            return []
        url = f"{self.base_url}/embed"
        payload = {"texts": texts}
        headers = {"Content-Type": "application/json"}
        try:
            resp = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
            resp.raise_for_status()
            data = resp.json()
            vectors = data.get("vectors", [])
            # Basic validation
            if not isinstance(vectors, list):
                raise ValueError("Invalid vectors format from remote embedder")
            return vectors
        except Exception as e:
            logger.warning(f"Remote embedding failed: {e}")
            # Fail closed with zero vectors to avoid crashes
            return [[0.0] * 384 for _ in texts]
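Usage is a one-liner; a minimal sketch, assuming the remote service returns `{"vectors": [[...], ...]}` as the client expects:

```python
# Minimal sketch: embed two strings through the remote /embed endpoint.
from utils.embedding import RemoteEmbeddingClient

client = RemoteEmbeddingClient()  # or RemoteEmbeddingClient(base_url="https://<space>.hf.space")
vectors = client.embed(["particle swarm optimization", "ant colony optimization"])
print(len(vectors), len(vectors[0]))  # on failure the client falls back to 384-dim zero vectors
```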
ingestion_python/utils/ingestion/chunker.py
ADDED
@@ -0,0 +1,130 @@
| 1 |
+
# ────────────────────────────── utils/chunker.py ──────────────────────────────
|
| 2 |
+
import re
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
from utils.service.summarizer import cheap_summarize, clean_chunk_text
|
| 5 |
+
from utils.service.common import split_sentences, slugify
|
| 6 |
+
from ..logger import get_logger
|
| 7 |
+
|
| 8 |
+
# Enhanced semantic chunker with overlap and better structure:
|
| 9 |
+
# - Split by headings / numbered sections if present
|
| 10 |
+
# - Ensure each chunk ~ 300-600 words (configurable)
|
| 11 |
+
# - Add overlap between chunks for better context preservation
|
| 12 |
+
# - Generate a short summary + topic name
|
| 13 |
+
# - Better handling of semantic boundaries
|
| 14 |
+
|
| 15 |
+
MAX_WORDS = 500
|
| 16 |
+
MIN_WORDS = 150
|
| 17 |
+
OVERLAP_WORDS = 50 # Overlap between chunks for better context
|
| 18 |
+
logger = get_logger("CHUNKER", __name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _by_headings(text: str):
|
| 22 |
+
# Enhanced split on markdown-like or outline headings with better patterns
|
| 23 |
+
patterns = [
|
| 24 |
+
r"(?m)^(#{1,6}\s.*)\s*$", # Markdown headers
|
| 25 |
+
r"(?m)^([0-9]+\.\s+[^\n]+)\s*$", # Numbered sections
|
| 26 |
+
r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$", # Underlined headers
|
| 27 |
+
r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$", # Chapter/Section headers
|
| 28 |
+
r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$", # Common academic sections
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
parts = []
|
| 32 |
+
last = 0
|
| 33 |
+
all_matches = []
|
| 34 |
+
|
| 35 |
+
# Find all matches from all patterns
|
| 36 |
+
for pattern in patterns:
|
| 37 |
+
for m in re.finditer(pattern, text):
|
| 38 |
+
all_matches.append((m.start(), m.end(), m.group(1).strip()))
|
| 39 |
+
|
| 40 |
+
# Sort matches by position
|
| 41 |
+
all_matches.sort(key=lambda x: x[0])
|
| 42 |
+
|
| 43 |
+
# Split text based on matches
|
| 44 |
+
for start, end, header in all_matches:
|
| 45 |
+
if start > last:
|
| 46 |
+
parts.append(text[last:start])
|
| 47 |
+
parts.append(text[start:end])
|
| 48 |
+
last = end
|
| 49 |
+
|
| 50 |
+
if last < len(text):
|
| 51 |
+
parts.append(text[last:])
|
| 52 |
+
|
| 53 |
+
if not parts:
|
| 54 |
+
parts = [text]
|
| 55 |
+
|
| 56 |
+
return parts
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]:
|
| 60 |
+
"""Create overlapping chunks from text blocks for better context preservation"""
|
| 61 |
+
chunks = []
|
| 62 |
+
|
| 63 |
+
for i, block in enumerate(text_blocks):
|
| 64 |
+
words = block.split()
|
| 65 |
+
if not words:
|
| 66 |
+
continue
|
| 67 |
+
|
| 68 |
+
# If block is small enough, use as-is
|
| 69 |
+
if len(words) <= MAX_WORDS:
|
| 70 |
+
chunks.append(block)
|
| 71 |
+
continue
|
| 72 |
+
|
| 73 |
+
# Split large blocks with overlap
|
| 74 |
+
start = 0
|
| 75 |
+
while start < len(words):
|
| 76 |
+
end = min(start + MAX_WORDS, len(words))
|
| 77 |
+
chunk_words = words[start:end]
|
| 78 |
+
|
| 79 |
+
# Add overlap from previous chunk if available
|
| 80 |
+
if start > 0 and len(chunks) > 0:
|
| 81 |
+
prev_words = chunks[-1].split()
|
| 82 |
+
overlap_start = max(0, len(prev_words) - OVERLAP_WORDS)
|
| 83 |
+
overlap_words = prev_words[overlap_start:]
|
| 84 |
+
chunk_words = overlap_words + chunk_words
|
| 85 |
+
|
| 86 |
+
chunks.append(" ".join(chunk_words))
|
| 87 |
+
start = end - OVERLAP_WORDS # Overlap with next chunk
|
| 88 |
+
|
| 89 |
+
return chunks
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
|
| 93 |
+
# Concatenate pages but keep page spans for metadata
|
| 94 |
+
full = ""
|
| 95 |
+
page_markers = []
|
| 96 |
+
for p in pages:
|
| 97 |
+
start = len(full)
|
| 98 |
+
full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text','').strip()}\n"
|
| 99 |
+
page_markers.append((p['page_num'], start, len(full)))
|
| 100 |
+
|
| 101 |
+
# First split by headings
|
| 102 |
+
coarse = _by_headings(full)
|
| 103 |
+
|
| 104 |
+
# Create overlapping chunks for better context preservation
|
| 105 |
+
cards = _create_overlapping_chunks(coarse)
|
| 106 |
+
|
| 107 |
+
# Build card dicts
|
| 108 |
+
out = []
|
| 109 |
+
for i, raw_content in enumerate(cards, 1):
|
| 110 |
+
# Clean with LLM to remove headers/footers and IDs
|
| 111 |
+
cleaned = await clean_chunk_text(raw_content)
|
| 112 |
+
topic = await cheap_summarize(cleaned, max_sentences=1)
|
| 113 |
+
if not topic:
|
| 114 |
+
topic = cleaned[:80] + "..."
|
| 115 |
+
summary = await cheap_summarize(cleaned, max_sentences=3)
|
| 116 |
+
# Estimate page span
|
| 117 |
+
first_page = pages[0]['page_num'] if pages else 1
|
| 118 |
+
last_page = pages[-1]['page_num'] if pages else 1
|
| 119 |
+
out.append({
|
| 120 |
+
"user_id": user_id,
|
| 121 |
+
"project_id": project_id,
|
| 122 |
+
"filename": filename,
|
| 123 |
+
"topic_name": topic[:120],
|
| 124 |
+
"summary": summary,
|
| 125 |
+
"content": cleaned,
|
| 126 |
+
"page_span": [first_page, last_page],
|
| 127 |
+
"card_id": f"{slugify(filename)}-c{i:04d}"
|
| 128 |
+
})
|
| 129 |
+
logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
|
| 130 |
+
return out
|
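
For reference, a minimal sketch of driving the chunker directly; the page dicts mimic the shape produced by parser.py below, and the filename and IDs are purely illustrative:

# Illustrative driver for build_cards_from_pages (filename and IDs are made up)
import asyncio
from utils.ingestion.chunker import build_cards_from_pages

pages = [
    {"page_num": 1, "text": "# Intro\nNeural networks learn representations.", "images": []},
    {"page_num": 2, "text": "## Training\nGradient descent updates the weights.", "images": []},
]

cards = asyncio.run(build_cards_from_pages(pages, "notes.pdf", user_id="u1", project_id="p1"))
print(cards[0]["card_id"], cards[0]["page_span"])  # e.g. notespdf-c0001 [1, 2]
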
ingestion_python/utils/ingestion/parser.py
ADDED
@@ -0,0 +1,63 @@

import io
from typing import List, Dict, Any
import fitz  # PyMuPDF
from docx import Document
from PIL import Image
from ..logger import get_logger

logger = get_logger("PARSER", __name__)


def parse_pdf_bytes(b: bytes) -> List[Dict[str, Any]]:
    """
    Returns a list of pages, each {'page_num': 1-based int, 'text': str, 'images': [PIL.Image]}.
    """
    pages = []
    with fitz.open(stream=b, filetype="pdf") as doc:
        for i, page in enumerate(doc):
            text = page.get_text("text")
            images = []
            for img in page.get_images(full=True):
                xref = img[0]
                pix = None
                try:
                    pix = fitz.Pixmap(doc, xref)
                    # Convert CMYK/alpha pixmaps to RGB safely
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    # Round-trip through PNG bytes to avoid 'not enough image data'
                    png_bytes = pix.tobytes("png")
                    im = Image.open(io.BytesIO(png_bytes)).convert("RGB")
                    images.append(im)
                except Exception as e:
                    logger.warning(f"Failed to extract image on page {i+1}: {e}")
                finally:
                    pix = None  # Release the pixmap
            pages.append({"page_num": i + 1, "text": text, "images": images})
    logger.info(f"Parsed PDF with {len(pages)} pages")
    return pages


def parse_docx_bytes(b: bytes) -> List[Dict[str, Any]]:
    f = io.BytesIO(b)
    doc = Document(f)
    text = []
    images = []
    for rel in doc.part.rels.values():
        if "image" in rel.reltype:
            data = rel.target_part.blob
            try:
                im = Image.open(io.BytesIO(data)).convert("RGB")
                images.append(im)
            except Exception:
                pass
    for p in doc.paragraphs:
        text.append(p.text)
    pages = [{"page_num": 1, "text": "\n".join(text), "images": images}]
    logger.info("Parsed DOCX into single concatenated page")
    return pages
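
A quick sketch of exercising the parser on a local file; the path is a stand-in:

# Illustrative: parse a PDF from disk and inspect the page dicts
from utils.ingestion.parser import parse_pdf_bytes

with open("lecture.pdf", "rb") as fh:  # hypothetical input file
    pages = parse_pdf_bytes(fh.read())

for p in pages:
    print(p["page_num"], len(p["text"]), "chars,", len(p["images"]), "images")
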
ingestion_python/utils/logger.py
ADDED
@@ -0,0 +1,71 @@

import logging
import sys
from typing import Optional


_DEFAULT_FORMAT = "%(asctime)s %(levelname)s %(message)s"


def _ensure_root_handler() -> None:
    root_logger = logging.getLogger()
    if root_logger.handlers:
        return
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(logging.Formatter(_DEFAULT_FORMAT))
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.INFO)


class _TaggedAdapter(logging.LoggerAdapter):
    def process(self, msg, kwargs):
        tag = self.extra.get("tag", "")
        if tag and not str(msg).startswith(tag):
            msg = f"{tag} {msg}"
        return msg, kwargs


def get_logger(tag: str, name: Optional[str] = None) -> logging.LoggerAdapter:
    """
    Return a logger that injects a [TAG] prefix into records.
    Example: logger = get_logger("APP") → logs like: [APP] message
    """
    _ensure_root_handler()
    base = logging.getLogger(name or __name__)
    return _TaggedAdapter(base, {"tag": f"[{tag}]"})
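
The adapter behaves like a standard logger; a minimal check:

# Minimal check of the tagged logger
from utils.logger import get_logger

log = get_logger("DEMO", __name__)
log.info("service started")      # prints: <timestamp> INFO [DEMO] service started
log.warning("disk at %s%%", 91)  # printf-style args pass through the adapter unchanged
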
ingestion_python/utils/rag/embeddings.py
ADDED
@@ -0,0 +1,39 @@

# ────────────────────────────── utils/embeddings.py ──────────────────────────────
import os
from typing import List, Optional
import requests

from utils.logger import get_logger


logger = get_logger("EMBED", __name__)


class EmbeddingClient:
    """Embedding client that calls an external embedding service via HTTP.

    Expects the environment variable EMBEDDER_BASE_URL to point at an API with:
        POST /embed {"texts": [..]} -> {"vectors": [[..], ...], "model": "..."}
    """

    def __init__(self, base_url: Optional[str] = None):
        self.base_url = (base_url or os.getenv("EMBEDDER_BASE_URL", "")).rstrip("/")
        if not self.base_url:
            logger.warning("EMBEDDER_BASE_URL not set; embedding calls will fail.")

    def embed(self, texts: List[str]) -> List[list]:
        if not texts:
            return []
        if not self.base_url:
            raise RuntimeError("EMBEDDER_BASE_URL not configured")
        url = f"{self.base_url}/embed"
        try:
            resp = requests.post(url, json={"texts": texts}, timeout=60)
            if resp.status_code >= 400:
                raise RuntimeError(f"Embedding API error {resp.status_code}: {resp.text[:200]}")
            data = resp.json()
            return data.get("vectors") or []
        except Exception as e:
            logger.warning(f"Embedding API failed: {e}")
            raise
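
The client only assumes the POST /embed contract documented above. For local testing, a throwaway stand-in service could look like the sketch below; the hash-derived vectors and the model name are placeholders, not the real embedder:

# Stand-in /embed service for local testing (hash-based pseudo-vectors, NOT real embeddings)
import hashlib
from typing import List
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class EmbedRequest(BaseModel):
    texts: List[str]

@app.post("/embed")
def embed(req: EmbedRequest):
    vectors = []
    for t in req.texts:
        digest = hashlib.sha256(t.encode("utf-8")).digest()
        # 384 deterministic floats in [0, 1), matching VECTOR_DIM in utils/rag/rag.py
        vectors.append([digest[i % len(digest)] / 255.0 for i in range(384)])
    return {"vectors": vectors, "model": "stub-hash-384"}

Point EMBEDDER_BASE_URL at wherever this runs (e.g. http://localhost:8000) and EmbeddingClient().embed(["hello"]) returns one 384-dimensional vector.
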
ingestion_python/utils/rag/rag.py
ADDED
@@ -0,0 +1,278 @@

# ────────────────────────────── utils/rag.py ──────────────────────────────
import os
from typing import List, Dict, Any, Optional
from pymongo import MongoClient, ASCENDING, TEXT
from pymongo.collection import Collection
from pymongo.errors import PyMongoError
import numpy as np
from ..logger import get_logger

VECTOR_DIM = 384  # all-MiniLM-L6-v2
INDEX_NAME = os.getenv("MONGO_VECTOR_INDEX", "vector_index")
USE_ATLAS_VECTOR = os.getenv("ATLAS_VECTOR", "0") == "1"

logger = get_logger("RAG", __name__)


class RAGStore:
    def __init__(self, mongo_uri: str, db_name: str = "studybuddy"):
        self.client = MongoClient(mongo_uri)
        self.db = self.client[db_name]
        self.chunks: Collection = self.db["chunks"]
        self.files: Collection = self.db["files"]

    # ── Write ────────────────────────────────────────────────────────────────
    def store_cards(self, cards: List[Dict[str, Any]]):
        if not cards:
            return
        for c in cards:
            # Basic validation before the bulk insert
            emb = c.get("embedding")
            if not emb or len(emb) != VECTOR_DIM:
                raise ValueError(f"Invalid embedding length; expected {VECTOR_DIM}")
        self.chunks.insert_many(cards, ordered=False)
        logger.info(f"Inserted {len(cards)} cards into MongoDB")

    def upsert_file_summary(self, user_id: str, project_id: str, filename: str, summary: str):
        self.files.update_one(
            {"user_id": user_id, "project_id": project_id, "filename": filename},
            {"$set": {"summary": summary}},
            upsert=True
        )
        logger.info(f"Upserted summary for {filename} (user {user_id}, project {project_id})")

    # ── Read ────────────────────────────────────────────────────────────────
    def list_cards(self, user_id: str, project_id: str, filename: Optional[str], limit: int, skip: int):
        q = {"user_id": user_id, "project_id": project_id}
        if filename:
            q["filename"] = filename
        cur = self.chunks.find(q, {"embedding": 0}).skip(skip).limit(limit).sort([("_id", ASCENDING)])
        return [self._serialize_doc(card) for card in cur]

    def get_file_summary(self, user_id: str, project_id: str, filename: str):
        doc = self.files.find_one({"user_id": user_id, "project_id": project_id, "filename": filename})
        return self._serialize_doc(doc) if doc else None

    def get_file_chunks(self, user_id: str, project_id: str, filename: str, limit: int = 20) -> List[Dict[str, Any]]:
        """Get chunks for a specific file."""
        cursor = self.chunks.find({
            "user_id": user_id,
            "project_id": project_id,
            "filename": filename
        }).limit(limit)
        return [self._serialize_doc(doc) for doc in cursor]

    def list_files(self, user_id: str, project_id: str):
        """List all files for a project with their summaries."""
        cursor = self.files.find(
            {"user_id": user_id, "project_id": project_id},
            {"_id": 0, "filename": 1, "summary": 1}
        ).sort("filename", ASCENDING)
        return [self._serialize_doc(f) for f in cursor]

    def vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int = 6, filenames: Optional[List[str]] = None, search_type: str = "hybrid"):
        """
        Vector search with multiple strategies:
        - hybrid: Atlas first (when enabled), falling back to local cosine similarity
        - flat:   exhaustive search over all matching documents for maximum accuracy
        - atlas:  Atlas Vector Search only
        - local:  local cosine similarity over a bounded sample only
        """
        if search_type == "flat" or (search_type == "hybrid" and not USE_ATLAS_VECTOR):
            return self._flat_vector_search(user_id, project_id, query_vector, k, filenames)
        elif search_type == "atlas" and USE_ATLAS_VECTOR:
            return self._atlas_vector_search(user_id, project_id, query_vector, k, filenames)
        elif search_type == "local":
            return self._local_vector_search(user_id, project_id, query_vector, k, filenames)
        else:
            # Default hybrid approach
            if USE_ATLAS_VECTOR:
                atlas_results = self._atlas_vector_search(user_id, project_id, query_vector, k, filenames)
                if atlas_results:
                    return atlas_results
            return self._local_vector_search(user_id, project_id, query_vector, k, filenames)

    def _atlas_vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int, filenames: Optional[List[str]] = None):
        """Atlas Vector Search implementation."""
        match_stage = {"user_id": user_id, "project_id": project_id}
        if filenames:
            match_stage["filename"] = {"$in": filenames}
        pipeline = [
            {
                "$search": {
                    "index": INDEX_NAME,
                    "knnBeta": {
                        "vector": query_vector,
                        "path": "embedding",
                        "k": k,
                    }
                }
            },
            {"$match": match_stage},
            {"$project": {"doc": "$$ROOT", "score": {"$meta": "searchScore"}}},
            {"$limit": k},
        ]
        hits = list(self.chunks.aggregate(pipeline))
        return self._serialize_hits(hits)

    def _local_vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int, filenames: Optional[List[str]] = None):
        """Local cosine similarity over a bounded sample of recent chunks."""
        q = {"user_id": user_id, "project_id": project_id}
        if filenames:
            q["filename"] = {"$in": filenames}

        # A larger sample improves recall at the cost of latency
        sample_limit = max(5000, k * 50)
        sample = list(self.chunks.find(q).sort([("_id", -1)]).limit(sample_limit))
        if not sample:
            return []

        qv = np.array(query_vector, dtype="float32")
        scores = []
        for d in sample:
            v = np.array(d.get("embedding", [0] * VECTOR_DIM), dtype="float32")
            denom = (np.linalg.norm(qv) * np.linalg.norm(v)) or 1.0
            scores.append((float(np.dot(qv, v) / denom), d))

        scores.sort(key=lambda x: x[0], reverse=True)
        top = scores[:k]
        logger.info(f"Local vector search: {len(sample)} docs sampled, {len(top)} results")
        return self._serialize_results(top)

    def _flat_vector_search(self, user_id: str, project_id: str, query_vector: List[float], k: int, filenames: Optional[List[str]] = None):
        """Exhaustive cosine-similarity search over all matching documents."""
        q = {"user_id": user_id, "project_id": project_id}
        if filenames:
            q["filename"] = {"$in": filenames}

        # Score every relevant document for maximum accuracy
        all_docs = list(self.chunks.find(q))
        if not all_docs:
            return []

        qv = np.array(query_vector, dtype="float32")
        scores = []
        for doc in all_docs:
            v = np.array(doc.get("embedding", [0] * VECTOR_DIM), dtype="float32")
            denom = (np.linalg.norm(qv) * np.linalg.norm(v)) or 1.0
            scores.append((float(np.dot(qv, v) / denom), doc))

        scores.sort(key=lambda x: x[0], reverse=True)
        top = scores[:k]
        logger.info(f"Flat vector search: {len(all_docs)} docs searched, {len(top)} results")
        return self._serialize_results(top)

    def _serialize_hits(self, hits):
        """Serialize Atlas search hits."""
        return [{"doc": self._serialize_doc(h["doc"]), "score": float(h.get("score", 0.0))} for h in hits]

    def _serialize_results(self, results):
        """Serialize local/flat (score, doc) results."""
        return [{"doc": self._serialize_doc(doc), "score": float(score)} for score, doc in results]

    def _serialize_doc(self, doc):
        """Convert a MongoDB document to a JSON-serializable dict."""
        out = {}
        for key, value in doc.items():
            if key == "_id":
                out[key] = str(value)          # ObjectId -> string
            elif hasattr(value, "isoformat"):  # datetime -> ISO string
                out[key] = value.isoformat()
            else:
                out[key] = value
        return out


def ensure_indexes(store: RAGStore):
    # Basic compound index plus a text index for fallback keyword search (optional)
    try:
        store.chunks.create_index([("user_id", ASCENDING), ("project_id", ASCENDING), ("filename", ASCENDING)])
        store.chunks.create_index([("content", TEXT), ("topic_name", TEXT), ("summary", TEXT)], name="text_idx")
        store.files.create_index([("user_id", ASCENDING), ("project_id", ASCENDING), ("filename", ASCENDING)], unique=True)
    except PyMongoError as e:
        logger.warning(f"Index creation warning: {e}")
    # Note: for Atlas Vector, create an Atlas Search index named INDEX_NAME on the
    # "embedding" field with vector options. Example (in the Atlas UI):
    # {
    #   "mappings": {
    #     "dynamic": false,
    #     "fields": {
    #       "embedding": {
    #         "type": "knnVector",
    #         "dimensions": 384,
    #         "similarity": "cosine"
    #       }
    #     }
    #   }
    # }
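
A minimal end-to-end sketch of the store; the connection string, the IDs, and the zero query vector are placeholders (a real query vector would come from EmbeddingClient):

# Illustrative wiring of RAGStore (URI and IDs are placeholders)
from utils.rag.rag import RAGStore, ensure_indexes, VECTOR_DIM

store = RAGStore("mongodb://localhost:27017", db_name="studybuddy")
ensure_indexes(store)

# Exhaustive cosine search with a dummy query vector
hits = store.vector_search("u1", "p1", [0.0] * VECTOR_DIM, k=3, search_type="flat")
for h in hits:
    print(round(h["score"], 3), h["doc"].get("topic_name"))
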
ingestion_python/utils/service/common.py
ADDED
@@ -0,0 +1,20 @@

import re
import unicodedata
from utils.logger import get_logger

logger = get_logger("COMMON", __name__)

def split_sentences(text: str):
    return re.split(r"(?<=[\.\!\?])\s+", text.strip())

def slugify(value: str):
    value = str(value)
    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = re.sub(r"[^\w\s-]", "", value).strip().lower()
    return re.sub(r"[-\s]+", "-", value)

def trim_text(s: str, n: int):
    s = s or ""
    if len(s) <= n:
        return s
    return s[:n] + "…"
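
Expected behavior of these helpers, doctest-style:

# Expected behavior of the text helpers
from utils.service.common import split_sentences, slugify, trim_text

assert split_sentences("One. Two! Three?") == ["One.", "Two!", "Three?"]
assert slugify("My File (v2).pdf") == "my-file-v2pdf"  # punctuation drops before hyphenation
assert trim_text("abcdef", 4) == "abcd…"
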
ingestion_python/utils/service/summarizer.py
ADDED
@@ -0,0 +1,48 @@

import re
from utils.logger import get_logger

logger = get_logger("SUM", __name__)


async def clean_chunk_text(text: str) -> str:
    """Clean and normalize text for processing."""
    if not text:
        return ""

    # Collapse runs of whitespace
    text = " ".join(text.split())

    # Remove common escape-sequence artifacts
    text = text.replace("\\n", " ").replace("\\t", " ")

    return text.strip()


async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
    """Simple extractive summarization without external APIs."""
    if not text or len(text.strip()) < 50:
        return text.strip()

    try:
        # Extractive summarization: take the first few sentences
        sentences = re.split(r"[.!?]+", text)
        sentences = [s.strip() for s in sentences if s.strip()]

        if len(sentences) <= max_sentences:
            return text.strip()

        summary = ". ".join(sentences[:max_sentences])

        # Re-terminate with a period, since the split stripped the punctuation
        if not summary.endswith((".", "!", "?")):
            summary += "."

        return summary

    except Exception as e:
        logger.warning(f"Summarization failed: {e}")
        # Fallback: return the first part of the text
        return text[:200] + "..." if len(text) > 200 else text
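
Both helpers are async, so even a quick check needs an event loop:

# Quick async check of the summarizer helpers
import asyncio
from utils.service.summarizer import cheap_summarize, clean_chunk_text

async def main():
    raw = "First point.   Second point.\tThird point. Fourth point."
    cleaned = await clean_chunk_text(raw)
    print(await cheap_summarize(cleaned, max_sentences=2))  # First point. Second point.

asyncio.run(main())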