Spaces:
Running
Running
docker deployment
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +60 -0
- .env.example +3 -0
- .gitattributes +2 -34
- .gitignore +57 -0
- CODEBASE_DOCUMENTATION.md +673 -0
- College_Overview2.md +29 -0
- DOCUMENTATION_PLAN.md +277 -0
- Dockerfile +66 -0
- LOCAL_MODEL_TRUNCATION_FIX.md +136 -0
- MARKDOWN_FIX_SUMMARY.md +118 -0
- README.md +65 -10
- WHY_LOCAL_NOT_WORKING.md +112 -0
- app/__init__.py +0 -0
- app/api/__init__.py +5 -0
- app/api/dependencies.py +41 -0
- app/api/routes/__init__.py +5 -0
- app/api/routes/rag.py +186 -0
- app/api/routes/settings.py +186 -0
- app/api/routes/vector_store.py +311 -0
- app/api/schemas/__init__.py +1 -0
- app/api/schemas/requests.py +71 -0
- app/api/schemas/settings.py +54 -0
- app/api/schemas/tests.py +30 -0
- app/core/__init__.py +0 -0
- app/core/config.py +55 -0
- app/core/paths.py +10 -0
- app/main.py +21 -0
- app/models/__init__.py +0 -0
- app/prompts/__init__.py +1 -0
- app/prompts/system_prompts.py +112 -0
- app/services/__init__.py +2 -0
- app/services/classifier_service.py +337 -0
- app/services/document_loader.py +34 -0
- app/services/file_service.py +198 -0
- app/services/filter-demo +197 -0
- app/services/filter_classifier copy.py +334 -0
- app/services/filter_classifier.py +529 -0
- app/services/hybrid_retrieval.py +354 -0
- app/services/ingestion_service.py +95 -0
- app/services/rag_service.py +483 -0
- app/services/text_splitter.py +266 -0
- app/services/vector_store.py +67 -0
- app/utils/__init__.py +5 -0
- app/utils/constants.py +264 -0
- app/utils/document_helpers.py +231 -0
- app/utils/embeddings.py +11 -0
- app/utils/llm_models.py +20 -0
- app/utils/model_factory.py +164 -0
- app/utils/preprocessing.py +107 -0
- app/utils/tests.py +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ─── Python ───────────────────────────────────────────────────────────────────
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.egg-info/
|
| 6 |
+
.eggs/
|
| 7 |
+
|
| 8 |
+
# ─── Virtual environments ─────────────────────────────────────────────────────
|
| 9 |
+
.venv/
|
| 10 |
+
venv/
|
| 11 |
+
env/
|
| 12 |
+
|
| 13 |
+
# ─── Environment / secrets ────────────────────────────────────────────────────
|
| 14 |
+
.env
|
| 15 |
+
.env.*
|
| 16 |
+
|
| 17 |
+
# ─── Git ──────────────────────────────────────────────────────────────────────
|
| 18 |
+
.git/
|
| 19 |
+
.gitignore
|
| 20 |
+
|
| 21 |
+
# ─── Large LLM model files (not needed — Gemini-only mode) ───────────────────
|
| 22 |
+
ml_models/llm/
|
| 23 |
+
ml_models/embeddings/bge-small/
|
| 24 |
+
|
| 25 |
+
# ─── Dev/test files not needed in production ──────────────────────────────────
|
| 26 |
+
tests/
|
| 27 |
+
docs/
|
| 28 |
+
results/
|
| 29 |
+
temp/
|
| 30 |
+
old/
|
| 31 |
+
scripts/
|
| 32 |
+
dump/
|
| 33 |
+
|
| 34 |
+
# ─── Root-level scratch/demo scripts ─────────────────────────────────────────
|
| 35 |
+
bm25.py
|
| 36 |
+
cfs.py
|
| 37 |
+
classifier-demo.py
|
| 38 |
+
fileService.py
|
| 39 |
+
hybrid_search.py
|
| 40 |
+
rewrite_query.py
|
| 41 |
+
testSearch.py
|
| 42 |
+
test_json_spliting.py
|
| 43 |
+
test_markdown_splitter.py
|
| 44 |
+
|
| 45 |
+
# ─── Large PDF files ──────────────────────────────────────────────────────────
|
| 46 |
+
*.pdf
|
| 47 |
+
|
| 48 |
+
# ─── Documentation ────────────────────────────────────────────────────────────
|
| 49 |
+
*.md
|
| 50 |
+
!README.md
|
| 51 |
+
|
| 52 |
+
# ─── IDE / OS ─────────────────────────────────────────────────────────────────
|
| 53 |
+
.vscode/
|
| 54 |
+
.idea/
|
| 55 |
+
*.swp
|
| 56 |
+
.DS_Store
|
| 57 |
+
Thumbs.db
|
| 58 |
+
|
| 59 |
+
# ─── Second requirements file (unused) ───────────────────────────────────────
|
| 60 |
+
req.txt
|
.env.example
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GOOGLE_API_KEY=
|
| 2 |
+
LLM_PROVIDER=gemini # or "local"
|
| 3 |
+
ENABLE_FALLBACK=true
|
.gitattributes
CHANGED
|
@@ -1,38 +1,6 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
*.gguf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
*.gguf filter=lfs diff=lfs merge=lfs -text
|
| 5 |
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
*.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
./models
|
| 2 |
+
.venv
|
| 3 |
+
cache
|
| 4 |
+
__pycache__
|
| 5 |
+
.env
|
| 6 |
+
Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
|
| 7 |
+
|
| 8 |
+
# Python
|
| 9 |
+
__pycache__/
|
| 10 |
+
*.py[cod]
|
| 11 |
+
*$py.class
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
*.egg-info/
|
| 15 |
+
|
| 16 |
+
# Environment
|
| 17 |
+
.env
|
| 18 |
+
|
| 19 |
+
# Data (runtime files)
|
| 20 |
+
data/
|
| 21 |
+
ml_models/
|
| 22 |
+
|
| 23 |
+
# IDE
|
| 24 |
+
.vscode/
|
| 25 |
+
.idea/
|
| 26 |
+
*.swp
|
| 27 |
+
|
| 28 |
+
# Temporary files
|
| 29 |
+
temp/
|
| 30 |
+
*.tmp
|
| 31 |
+
|
| 32 |
+
# OS
|
| 33 |
+
.DS_Store
|
| 34 |
+
Thumbs.db
|
| 35 |
+
|
| 36 |
+
# ML model files (large binary files)
|
| 37 |
+
ml_models/**/*.gguf
|
| 38 |
+
ml_models/**/*.bin
|
| 39 |
+
ml_models/**/*.safetensors
|
| 40 |
+
|
| 41 |
+
# Keep directory structure
|
| 42 |
+
!ml_models/.gitkeep
|
| 43 |
+
!ml_models/llm/.gitkeep
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Exceptions — keep these paths tracked despite the ignore rules above:
|
| 47 |
+
!data/
|
| 48 |
+
!data/vector_stores/
|
| 49 |
+
!data/vector_stores/classifier_test_1/
|
| 50 |
+
!data/vector_stores/classifier_test_1/**
|
| 51 |
+
|
| 52 |
+
!ml_models/
|
| 53 |
+
!ml_models/classifier/
|
| 54 |
+
!ml_models/classifier/chatbot_classifier.pkl
|
| 55 |
+
!ml_models/embeddings/mdbr-leaf-mt/
|
| 56 |
+
!ml_models/embeddings/mdbr-leaf-mt/**
|
| 57 |
+
!data/classifier_test_1.json
|
CODEBASE_DOCUMENTATION.md
ADDED
|
@@ -0,0 +1,673 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VGEC RAG Chatbot — Codebase Documentation
|
| 2 |
+
|
| 3 |
+
> **Generated:** 2026-03-25
|
| 4 |
+
> **Version:** 1.0.0
|
| 5 |
+
> **Scope:** Full system — ingestion, retrieval, classification, API, evaluation
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Table of Contents
|
| 10 |
+
|
| 11 |
+
1. [Project Overview](#1-project-overview)
|
| 12 |
+
2. [System Architecture](#2-system-architecture)
|
| 13 |
+
3. [Schema & Data Model](#3-schema--data-model)
|
| 14 |
+
4. [Retrieval Pipeline](#4-retrieval-pipeline)
|
| 15 |
+
5. [Key Classes & Modules](#5-key-classes--modules)
|
| 16 |
+
6. [Evaluation & Metrics](#6-evaluation--metrics)
|
| 17 |
+
7. [Known Limitations](#7-known-limitations)
|
| 18 |
+
8. [File Structure](#8-file-structure)
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## 1. Project Overview
|
| 23 |
+
|
| 24 |
+
### Purpose
|
| 25 |
+
|
| 26 |
+
**VGEC RAG Chatbot** is a Retrieval-Augmented Generation (RAG) chatbot for **Vishwakarma Government Engineering College (VGEC), Chandkheda, Gujarat**. It allows students, faculty, and visitors to query structured information about the institution — departments, faculty, syllabus, labs, intake capacity, and more — through natural language.
|
| 27 |
+
|
| 28 |
+
### Domain
|
| 29 |
+
|
| 30 |
+
- **Institution:** VGEC (Government Engineering College, Gujarat)
|
| 31 |
+
- **Data Coverage:** Department-level information for multiple disciplines (Computer Engineering, Civil, Electrical, IT, ECE, etc.)
|
| 32 |
+
- **Topics:** Faculty lists, lab facilities, syllabus details, HOD info, research activities, intake capacity, achievements
|
| 33 |
+
|
| 34 |
+
### Tech Stack
|
| 35 |
+
|
| 36 |
+
| Layer | Technology |
|
| 37 |
+
|---|---|
|
| 38 |
+
| **API Framework** | FastAPI |
|
| 39 |
+
| **Vector Database** | ChromaDB (persistent, local) |
|
| 40 |
+
| **Embeddings** | Google `gemini-embedding-001` (via `langchain-google-genai`) |
|
| 41 |
+
| **LLM (Cloud)** | Google Gemini `gemini-2.5-flash-lite` |
|
| 42 |
+
| **LLM (Local)** | `EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf` via `llama-cpp-python` |
|
| 43 |
+
| **NLP / Preprocessing** | spaCy (`en_core_web_sm`), NLTK (PorterStemmer) |
|
| 44 |
+
| **Classifier** | Scikit-learn `LogisticRegression` + `SentenceTransformer` (`MongoDB/mdbr-leaf-mt`) |
|
| 45 |
+
| **BM25** | `langchain-community` `BM25Retriever` |
|
| 46 |
+
| **Chunking** | LangChain `RecursiveCharacterTextSplitter` |
|
| 47 |
+
| **Config** | Pydantic `BaseSettings` (`.env`-backed) |
|
| 48 |
+
|
| 49 |
+
### Key Features Implemented
|
| 50 |
+
|
| 51 |
+
- ✅ Structured JSON ingestion with intent-aware chunking
|
| 52 |
+
- ✅ Hybrid retrieval: BM25 + vector search fused via Reciprocal Rank Fusion (RRF)
|
| 53 |
+
- ✅ Intent/metadata classification with confidence-gated ChromaDB filters
|
| 54 |
+
- ✅ Abbreviation expansion (`CE` → `Computer Engineering`, etc.)
|
| 55 |
+
- ✅ Multi-turn conversation history support
|
| 56 |
+
- ✅ Dual LLM backend with automatic fallback (Gemini ↔ Local)
|
| 57 |
+
- ✅ Full CRUD REST API for vector store management
|
| 58 |
+
- ✅ Offline evaluation endpoint (MRR, hit rate, noise rate)
|
| 59 |
+
- ✅ Classifier accuracy evaluation endpoint
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
## 2. System Architecture
|
| 64 |
+
|
| 65 |
+
### Component Diagram
|
| 66 |
+
|
| 67 |
+
```
|
| 68 |
+
┌──────────────────────────┐
|
| 69 |
+
│ FastAPI App │
|
| 70 |
+
│ /api/v1/rag /vector │
|
| 71 |
+
└──────────┬───────────────┘
|
| 72 |
+
│ DI (lru_cache)
|
| 73 |
+
┌──────────▼───────────────┐
|
| 74 |
+
│ RAGService │
|
| 75 |
+
│ (core orchestrator) │
|
| 76 |
+
└──┬───────────┬────────────┘
|
| 77 |
+
│ │
|
| 78 |
+
┌─────────────▼──┐ ┌───▼──────────────────┐
|
| 79 |
+
│ IngestionService│ │ HybridRetrievalService│
|
| 80 |
+
│ (write path) │ │ (read path) │
|
| 81 |
+
└──────┬──────── ┘ └───┬──────────┬─────── ┘
|
| 82 |
+
│ │ │
|
| 83 |
+
┌──────────▼──┐ ┌──────────▼──┐ ┌────▼──────────┐
|
| 84 |
+
│ FileService │ │ ClassifierSvc│ │ VectorStore │
|
| 85 |
+
│ (file +meta) │ │(clf predict) │ │ (ChromaDB) │
|
| 86 |
+
└──────────────┘ └─────────────┘ └───────────────┘
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
### Data Flow
|
| 90 |
+
|
| 91 |
+
#### Ingestion Path
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
File Upload (PDF/MD/TXT/JSON)
|
| 95 |
+
│
|
| 96 |
+
▼
|
| 97 |
+
FileService.read_file() ← type-aware loading (PyMuPDF for PDF)
|
| 98 |
+
│ returns: Document + metadata
|
| 99 |
+
▼
|
| 100 |
+
FileService.write_file() ← persist copy to data/documents/
|
| 101 |
+
│
|
| 102 |
+
▼
|
| 103 |
+
IngestionService.handle_*_docs() ← route by file extension
|
| 104 |
+
│
|
| 105 |
+
├─ JSON → handle_json_docs() ← intent-aware chunks (list / detail / count)
|
| 106 |
+
└─ text → handle_text_docs() ← RecursiveCharacterTextSplitter + normalize()
|
| 107 |
+
│
|
| 108 |
+
▼
|
| 109 |
+
VectorStore.add_documents() ← embed + upsert into ChromaDB
|
| 110 |
+
│
|
| 111 |
+
▼
|
| 112 |
+
FileService.patch_metadata() ← update ingestion record JSON (chunk count, timing, size)
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
#### Query Path
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
User Question
|
| 119 |
+
│
|
| 120 |
+
▼
|
| 121 |
+
preprocess_query() ← tokenize + strip stopwords (spaCy) + normalize
|
| 122 |
+
│
|
| 123 |
+
▼
|
| 124 |
+
HybridRetrievalService.retrieve()
|
| 125 |
+
│
|
| 126 |
+
├─ clf.expand_abbreviations() ← CE → Computer Engineering
|
| 127 |
+
├─ clf.predict_with_filter() ← LogReg predict → Chroma $and/$or filter
|
| 128 |
+
├─ _vector_rank() ← ChromaDB similarity_search_with_score (k=15)
|
| 129 |
+
├─ _bm25_rank() ← BM25 over the vector candidate pool
|
| 130 |
+
├─ _reciprocal_rank_fusion() ← weighted RRF merge
|
| 131 |
+
├─ metadata score boosting ← multiply fused scores for confident matches
|
| 132 |
+
└─ _apply_title_boost() ← per-query-word title match bonus
|
| 133 |
+
│
|
| 134 |
+
▼
|
| 135 |
+
get_references_v2() ← filter by threshold, build context string
|
| 136 |
+
│
|
| 137 |
+
▼
|
| 138 |
+
LLM.invoke(prompt) ← Gemini or local LlamaCpp
|
| 139 |
+
│
|
| 140 |
+
▼
|
| 141 |
+
Return: { answer, references, context, threshold_used, k_used }
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
### External Dependencies
|
| 145 |
+
|
| 146 |
+
| Dependency | Role | Provider |
|
| 147 |
+
|---|---|---|
|
| 148 |
+
| ChromaDB | Persistent vector store | Local disk |
|
| 149 |
+
| Google Gemini API | Embeddings + LLM generation | Google Cloud |
|
| 150 |
+
| LlamaCpp (GGUF model) | Local LLM fallback | Local CPU |
|
| 151 |
+
| Sentence Transformers | Classifier feature extraction | HuggingFace Hub |
|
| 152 |
+
| spaCy `en_core_web_sm` | POS tagging / lemmatization | Local |
|
| 153 |
+
|
| 154 |
+
---
|
| 155 |
+
|
| 156 |
+
## 3. Schema & Data Model
|
| 157 |
+
|
| 158 |
+
### Source JSON Format
|
| 159 |
+
|
| 160 |
+
Source data files (e.g. `computer_eng.json`) follow this schema:
|
| 161 |
+
|
| 162 |
+
```json
|
| 163 |
+
{
|
| 164 |
+
"id": "computer-engineering-department",
|
| 165 |
+
"name": "Computer Engineering Department",
|
| 166 |
+
"source": "https://www.vgecg.ac.in/department.php?dept=3",
|
| 167 |
+
"category": "computer_eng",
|
| 168 |
+
"type": "department",
|
| 169 |
+
"created_date": "2026-02-19",
|
| 170 |
+
"content": {
|
| 171 |
+
"<topic_key>": {
|
| 172 |
+
"list": ["item 1", "item 2", "..."],
|
| 173 |
+
"details": "Paragraph describing the topic."
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
}
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
**Top-level fields:**
|
| 180 |
+
|
| 181 |
+
| Field | Type | Description |
|
| 182 |
+
|---|---|---|
|
| 183 |
+
| `id` | string | Unique document identifier |
|
| 184 |
+
| `name` | string | Human-readable institution/department name |
|
| 185 |
+
| `source` | string | Authoritative URL |
|
| 186 |
+
| `category` | string | Department slug (e.g. `computer_eng`) |
|
| 187 |
+
| `type` | string | Document type (e.g. `department`) |
|
| 188 |
+
| `created_date` | string (ISO) | Data creation date |
|
| 189 |
+
| `content` | object | Topic map; each key = a topic |
|
| 190 |
+
|
| 191 |
+
### Chunk Metadata Schema (stored in ChromaDB)
|
| 192 |
+
|
| 193 |
+
Every vector chunk stored in Chroma carries the following metadata:
|
| 194 |
+
|
| 195 |
+
| Field | Type | Source |
|
| 196 |
+
|---|---|---|
|
| 197 |
+
| `id` | string (UUID) | Auto-generated |
|
| 198 |
+
| `title` | string | Document name / topic key |
|
| 199 |
+
| `source` | string | Source URL |
|
| 200 |
+
| `source_file` | string | Filename (e.g. `computer_eng.json`) |
|
| 201 |
+
| `type` | string | Taxonomy level 1 (e.g. `department`) |
|
| 202 |
+
| `category` | string | Taxonomy level 2 (e.g. `computer_eng`) |
|
| 203 |
+
| `topic` | string | Taxonomy level 3 (e.g. `faculty`) |
|
| 204 |
+
| `intent` | string | Chunk intent: `list`, `detail`, or `count` |
|
| 205 |
+
| `chunk_index` | int | Sequential index within file |
|
| 206 |
+
| `created_date` | string (ISO) | Ingestion timestamp |
|
| 207 |
+
| `updated_at` | string (ISO) | Last modification timestamp |
|
| 208 |
+
| `ext` | string | Source file extension (`json`, `pdf`, `md`, `txt`) |
|
| 209 |
+
|
| 210 |
+
### Hierarchical Taxonomy
|
| 211 |
+
|
| 212 |
+
Both the classifier's predictions and the ChromaDB filters operate on a 3-level hierarchy (`type` → `category` → `topic`), with an `intent` attached to each topic:
|
| 213 |
+
|
| 214 |
+
```
|
| 215 |
+
type
|
| 216 |
+
└── category
|
| 217 |
+
└── topic
|
| 218 |
+
└── intent (list | detail | count)
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
**Example mapping (Computer Engineering):**
|
| 222 |
+
|
| 223 |
+
```
|
| 224 |
+
type: "department"
|
| 225 |
+
└── category: "computer_eng"
|
| 226 |
+
├── topic: "faculty" → intent: list | detail
|
| 227 |
+
├── topic: "lab" → intent: list | detail
|
| 228 |
+
├── topic: "syllabus" → intent: list | detail
|
| 229 |
+
├── topic: "hod" → intent: list | detail
|
| 230 |
+
├── topic: "intake" → intent: list | detail
|
| 231 |
+
├── topic: "research" → intent: list | detail
|
| 232 |
+
└── topic: "achievements"
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
### Document Chunking Strategy
|
| 236 |
+
|
| 237 |
+
**JSON documents** use a hand-crafted, intent-aware strategy in `IngestionService.handle_json_docs()`:
|
| 238 |
+
|
| 239 |
+
| Intent | Chunk Content | Metadata |
|
| 240 |
+
|---|---|---|
|
| 241 |
+
| `list` | Numbered list: `1. item\n2. item\n...` | `intent=list` |
|
| 242 |
+
| `count` | `"Total <topic>: N"` (auto-generated) | `intent=count` |
|
| 243 |
+
| `detail` | Raw paragraph text | `intent=detail` |
|
| 244 |
+
|
| 245 |
+
**Text/PDF/Markdown documents** use `RecursiveCharacterTextSplitter`:
|
| 246 |
+
- Default: `chunk_size=500`, `chunk_overlap=100`
|
| 247 |
+
- Separator priority: `\n\n` → `\n` → ` ` → (character)
|
| 248 |
+
- Markdown variant respects `---` section delimiters
|
| 249 |
+
- Content is passed through `normalize()` (tokenize + strip blanks) before storage
|
| 250 |
+
|
| 251 |
+
---
|
| 252 |
+
|
| 253 |
+
## 4. Retrieval Pipeline
|
| 254 |
+
|
| 255 |
+
### Query Processing Flow
|
| 256 |
+
|
| 257 |
+
```python
|
| 258 |
+
# Step 1: Normalize input
|
| 259 |
+
question = preprocess_query(question)
|
| 260 |
+
# → applies normalize(): tokenize + strip blanks (the spaCy POS-filter / lemmatize / stopword path is preprocess(), see preprocessing.py)
|
| 261 |
+
|
| 262 |
+
# Step 2: Expand abbreviations
|
| 263 |
+
processed_query = clf.expand_abbreviations(query)
|
| 264 |
+
# → "CE dept" → "computer engineering department"
|
| 265 |
+
|
| 266 |
+
# Step 3: Classify intent/metadata
|
| 267 |
+
filters = clf.predict_with_filter([processed_query])
|
| 268 |
+
# → {"$and": [{"type": "department"}, {"intent": "list"}, {"$or": [...]}]}
|
| 269 |
+
|
| 270 |
+
# Step 4: Vector search with optional filter
|
| 271 |
+
raw_results = chroma.similarity_search_with_score(query, k=15, filter=filters)
|
| 272 |
+
# Fallback: if filtered results empty, retry without filter
|
| 273 |
+
|
| 274 |
+
# Step 5: BM25 re-rank over vector candidates
|
| 275 |
+
bm25_results = BM25Retriever.from_documents(candidate_docs)
|
| 276 |
+
|
| 277 |
+
# Step 6: RRF fusion
|
| 278 |
+
fused_score(d) = bm25_weight * 1/(rrf_k + rank_bm25)
|
| 279 |
+
+ vector_weight * 1/(rrf_k + rank_vec)
|
| 280 |
+
|
| 281 |
+
# Step 7: Metadata confidence boosting
|
| 282 |
+
if doc.metadata[field] == predicted_val and conf > 0.90:
|
| 283 |
+
result.fused_score *= boost_factor # 1.10–1.20
|
| 284 |
+
|
| 285 |
+
# Step 8: Title word boost
|
| 286 |
+
for word in query_words:
|
| 287 |
+
if word in doc.title:
|
| 288 |
+
result.fused_score += title_boost_per_word # 0.004
|
| 289 |
+
|
| 290 |
+
# Step 9: Threshold filter + sort + top-k
|
| 291 |
+
results = [r for r in results if r.fused_score >= threshold]
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
### Classifier Thresholds
|
| 295 |
+
|
| 296 |
+
The `Classifier` uses two separate threshold tables:
|
| 297 |
+
|
| 298 |
+
**Prediction threshold** — below this, the field is set to `None` (not used at all):
|
| 299 |
+
|
| 300 |
+
| Field | Threshold |
|
| 301 |
+
|---|---|
|
| 302 |
+
| `type` | 0.40 |
|
| 303 |
+
| `category` | 0.40 |
|
| 304 |
+
| `topic` | 0.50 |
|
| 305 |
+
| `intent` | 0.60 |
|
| 306 |
+
|
| 307 |
+
**Filter threshold** — at or above this, the field is included in the ChromaDB filter (`type` as a hard `$and` anchor; `category` and `topic` as soft `$or` hints):
|
| 308 |
+
|
| 309 |
+
| Field | Threshold |
|
| 310 |
+
|---|---|
|
| 311 |
+
| `type` | 0.65 |
|
| 312 |
+
| `category` | 0.65 |
|
| 313 |
+
| `topic` | 0.70 |
|
| 314 |
+
|
| 315 |
+
### Filter Construction Logic (`_build_filter`)
|
| 316 |
+
|
| 317 |
+
```python
|
| 318 |
+
# Gate: if type confidence < 0.65 → return None (full scan)
|
| 319 |
+
# Hard anchors (always included if type passes):
|
| 320 |
+
# - type == predicted_type
|
| 321 |
+
# - intent == predicted_intent (special: "count" expands to count OR detail)
|
| 322 |
+
# Soft hints (combined as $or):
|
| 323 |
+
# - category == predicted_category (if conf >= 0.65, else "general")
|
| 324 |
+
# - topic == predicted_topic (if conf >= 0.70, else "general")
|
| 325 |
+
```
|
| 326 |
+
|
| 327 |
+
### Hybrid Retrieval Config (Defaults)
|
| 328 |
+
|
| 329 |
+
| Parameter | `hybrid_query` | `search_docs` |
|
| 330 |
+
|---|---|---|
|
| 331 |
+
| `candidate_k` | 15 | 15 |
|
| 332 |
+
| `top_k` (final) | `settings.similarity_top_k` (8) | k (param) |
|
| 333 |
+
| `bm25_weight` | 0.45 | 0.70 |
|
| 334 |
+
| `vector_weight` | 0.55 | 0.30 |
|
| 335 |
+
| `rrf_k` | 20 | 20 |
|
| 336 |
+
| `bm25_k1` | 1.2 | 1.5 |
|
| 337 |
+
| `bm25_b` | 0.9 | 0.75 |
|
| 338 |
+
| `title_boost_per_word` | 0.004 | 0.004 |
|
| 339 |
+
| `score_threshold` | 0.4 | 0.4 |
|
| 340 |
+
|
| 341 |
+
> **Note:** `search_docs` is BM25-heavy (BM25 0.70, vector 0.30) since it is used for keyword-oriented document browsing, while `hybrid_query` leans slightly toward vector search (vector 0.55, BM25 0.45) for semantic QA.
|
| 342 |
+
|
| 343 |
+
---
|
| 344 |
+
|
| 345 |
+
## 5. Key Classes & Modules
|
| 346 |
+
|
| 347 |
+
### Services (`app/services/`)
|
| 348 |
+
|
| 349 |
+
#### `RAGService`
|
| 350 |
+
|
| 351 |
+
Main orchestrator. Singleton via `lru_cache` in `dependencies.py`.
|
| 352 |
+
|
| 353 |
+
| Method | Description |
|
| 354 |
+
|---|---|
|
| 355 |
+
| `query()` | Semantic-only QA (vector search → LLM) |
|
| 356 |
+
| `hybrid_query()` | Hybrid QA (BM25 + vector → RRF → LLM) |
|
| 357 |
+
| `search_docs()` | BM25-heavy document search, no LLM |
|
| 358 |
+
| `ingest_documents()` | Ingest a file path into the vector store |
|
| 359 |
+
| `get_filenames()` | Return all tracked file metadata records |
|
| 360 |
+
| `test_queries()` | Batch retrieval evaluation (MRR, precision, noise) |
|
| 361 |
+
| `test_classifier()` | Batch classifier accuracy evaluation |
|
| 362 |
+
| `delete_database()` | Drop the entire ChromaDB collection |
|
| 363 |
+
|
| 364 |
+
#### `HybridRetrievalService`
|
| 365 |
+
|
| 366 |
+
Stateless per-request service created inline by `RAGService`.
|
| 367 |
+
|
| 368 |
+
| Method | Description |
|
| 369 |
+
|---|---|
|
| 370 |
+
| `retrieve(query)` | Full hybrid retrieval pipeline; returns `List[RetrievalResult]` |
|
| 371 |
+
| `_vector_rank()` | Chroma similarity search + classifier filter |
|
| 372 |
+
| `_bm25_rank()` | BM25 over candidate pool |
|
| 373 |
+
| `_reciprocal_rank_fusion()` | Merge both ranked lists via RRF |
|
| 374 |
+
| `_apply_title_boost()` | Word-level title match score bonus |
|
| 375 |
+
|
| 376 |
+
**`RetrievalResult` dataclass:**
|
| 377 |
+
|
| 378 |
+
```python
|
| 379 |
+
@dataclass
|
| 380 |
+
class RetrievalResult:
|
| 381 |
+
document: Document
|
| 382 |
+
fused_score: float
|
| 383 |
+
bm25_rank: Optional[int]
|
| 384 |
+
vector_rank: Optional[int]
|
| 385 |
+
title_boost: float
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
#### `Classifier`
|
| 389 |
+
|
| 390 |
+
Loaded at startup from a pickled pipeline (`chatbot_classifier.pkl`).
|
| 391 |
+
|
| 392 |
+
| Method | Description |
|
| 393 |
+
|---|---|
|
| 394 |
+
| `predict(queries)` | Returns list of `{type, category, topic, intent, *_conf}` dicts |
|
| 395 |
+
| `predict_with_filter(queries)` | Returns a ChromaDB-compatible filter dict or `None` |
|
| 396 |
+
| `expand_abbreviations(text)` | Regex-based abbreviation expansion |
|
| 397 |
+
| `get_features(queries)` | Build `[SentenceTransformer embedding \| TF-IDF]` feature matrix |
|
| 398 |
+
| `train_models(df)` | Train 4 LogisticRegression classifiers (offline use) |
|
| 399 |
+
|
| 400 |
+
#### `IngestionService`
|
| 401 |
+
|
| 402 |
+
| Method | Description |
|
| 403 |
+
|---|---|
|
| 404 |
+
| `ingest(file_path)` | Load + chunk a file; returns `List[Document]` |
|
| 405 |
+
| `handle_json_docs()` | Intent-aware chunking for structured JSON data |
|
| 406 |
+
| `handle_text_docs()` | Recursive character splitting for unstructured text |
|
| 407 |
+
| `get_records()` | Delegate to `FileService.get_records()` |
|
| 408 |
+
| `delete_record(filename)` | Remove a file's metadata record |
|
| 409 |
+
| `path_record(path, metadata)` | Patch ingestion stats after indexing |
|
| 410 |
+
|
| 411 |
+
#### `FileService`
|
| 412 |
+
|
| 413 |
+
| Method | Description |
|
| 414 |
+
|---|---|
|
| 415 |
+
| `read_file(path)` | Load file content; dispatches by extension |
|
| 416 |
+
| `write_file(path, content, metadata)` | Persist file to `data/documents/` |
|
| 417 |
+
| `patch_metadata(path, metadata)` | Merge new fields into existing record |
|
| 418 |
+
| `get_records()` | Return all ingestion records dict |
|
| 419 |
+
| `delete_record(filename)` | Remove a record from `<collection>.json` |
|
| 420 |
+
|
| 421 |
+
#### `VectorStore`
|
| 422 |
+
|
| 423 |
+
Thin wrapper around `langchain_chroma.Chroma`.
|
| 424 |
+
|
| 425 |
+
| Method | Description |
|
| 426 |
+
|---|---|
|
| 427 |
+
| `get()` | Retrieve all documents |
|
| 428 |
+
| `get_by_id(ids)` | Retrieve specific documents by ID |
|
| 429 |
+
| `add_documents(docs)` | Embed + insert, skipping empty chunks |
|
| 430 |
+
| `update_document(id, doc)` | Delete then re-insert with same ID |
|
| 431 |
+
| `delete(ids)` | Remove documents by ID list |
|
| 432 |
+
| `similarity_search_with_score()` | Wrapped Chroma search |
|
| 433 |
+
|
| 434 |
+
### Utilities (`app/utils/`)
|
| 435 |
+
|
| 436 |
+
#### `preprocessing.py`
|
| 437 |
+
|
| 438 |
+
| Function | Description |
|
| 439 |
+
|---|---|
|
| 440 |
+
| `preprocess(text)` | spaCy POS filter + lemmatize + stopword removal → joined string |
|
| 441 |
+
| `normalize(text)` | Tokenize + strip blanks (lightweight, no POS) |
|
| 442 |
+
| `preprocess_query(query)` | Applies `normalize()` to user queries |
|
| 443 |
+
| `preprocess_documents(docs)` | Applies `preprocess()` to a document list in-place |
|
| 444 |
+
| `preprocess_filename(path)` | Sanitize filename (remove special chars, lowercase) |
|
| 445 |
+
|
| 446 |
+
#### `document_helpers.py`
|
| 447 |
+
|
| 448 |
+
| Function | Description |
|
| 449 |
+
|---|---|
|
| 450 |
+
| `get_references_v2(docs, threshold)` | Convert `RetrievalResult` list → references dict + context string |
|
| 451 |
+
| `get_references(docs, threshold)` | Same for raw `(Document, distance)` tuples (used by `query()`) |
|
| 452 |
+
| `build_metadata(path)` | Parse YAML frontmatter from `.md`/`.txt` files |
|
| 453 |
+
| `create_documents(chunks, ...)` | Attach standard metadata (UUID, timestamps, indices) to chunks |
|
| 454 |
+
| `create_documents_from_text(text)` | Full pipeline: frontmatter parse → split → metadata attach |
|
| 455 |
+
| `clean_metadata(metadata)` | Serialize datetime, coerce non-allowed types to string |
|
| 456 |
+
|
| 457 |
+
#### `model_factory.py`
|
| 458 |
+
|
| 459 |
+
| Function | Description |
|
| 460 |
+
|---|---|
|
| 461 |
+
| `get_embedding_model()` | Returns `GoogleGenerativeAIEmbeddings` |
|
| 462 |
+
| `get_gemini_model()` | Returns `ChatGoogleGenerativeAI` |
|
| 463 |
+
| `get_local_model()` | Returns `ChatLlamaCpp` (GGUF, CPU inference) |
|
| 464 |
+
| `get_llm_model(provider)` | Dispatches to Gemini or Local with fallback logic |
|
| 465 |
+
|
| 466 |
+
### API Routes (`app/api/routes/`)
|
| 467 |
+
|
| 468 |
+
#### `rag.py` — prefix `/api/v1/rag`
|
| 469 |
+
|
| 470 |
+
| Method | Endpoint | Description |
|
| 471 |
+
|---|---|---|
|
| 472 |
+
| GET | `/` | Health check |
|
| 473 |
+
| POST | `/` | Semantic query |
|
| 474 |
+
| POST | `/hybrid_query` | Hybrid RAG query (primary endpoint) |
|
| 475 |
+
| POST | `/similarity_search` | Hybrid retrieval, no LLM response |
|
| 476 |
+
| POST | `/search` | BM25-heavy document search |
|
| 477 |
+
| POST | `/test` | Batch retrieval evaluation |
|
| 478 |
+
| POST | `/test_classifier` | Classifier accuracy evaluation |
|
| 479 |
+
| GET | `/test_classifier_dataset` | Run built-in test dataset, cache result |
|
| 480 |
+
|
| 481 |
+
#### `vector_store.py` — prefix `/api/v1/vector`
|
| 482 |
+
|
| 483 |
+
| Method | Endpoint | Description |
|
| 484 |
+
|---|---|---|
|
| 485 |
+
| GET | `/` | List all documents (paginated, filterable) |
|
| 486 |
+
| GET | `/filenames` | List ingested file records |
|
| 487 |
+
| GET | `/{id}` | Get single document by ChromaDB ID |
|
| 488 |
+
| POST | `/` | Upload + ingest file |
|
| 489 |
+
| PUT | `/{id}` | Update document content/metadata |
|
| 490 |
+
| DELETE | `/ids` | Bulk delete by ID list |
|
| 491 |
+
| DELETE | `/{id}` | Delete single document |
|
| 492 |
+
| DELETE | `/` | Filter-based delete (filename/source/contains) |
|
| 493 |
+
|
| 494 |
+
### Configuration (`app/core/config.py`)
|
| 495 |
+
|
| 496 |
+
All settings are read from `.env` via Pydantic `BaseSettings`:
|
| 497 |
+
|
| 498 |
+
```python
|
| 499 |
+
class Settings(BaseSettings):
|
| 500 |
+
# Paths
|
| 501 |
+
collection_name: str = "classifier_test_1"
|
| 502 |
+
persist_directory: str = "./data/vector_stores/classifier_test_1"
|
| 503 |
+
|
| 504 |
+
# Chunking
|
| 505 |
+
chunk_size: int = 500
|
| 506 |
+
chunk_overlap: int = 100
|
| 507 |
+
|
| 508 |
+
# Retrieval
|
| 509 |
+
similarity_top_k: int = 8
|
| 510 |
+
similarity_threshold: float = 0.4
|
| 511 |
+
|
| 512 |
+
# LLM Provider
|
| 513 |
+
llm_provider: Literal["gemini", "local"] = "local"
|
| 514 |
+
enable_fallback: bool = True
|
| 515 |
+
|
| 516 |
+
# Models
|
| 517 |
+
embedding_model_name: str = "models/gemini-embedding-001"
|
| 518 |
+
gemini_model_name: str = "gemini-2.5-flash-lite"
|
| 519 |
+
local_model_name: str = "EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf"
|
| 520 |
+
|
| 521 |
+
# Generation
|
| 522 |
+
max_output_tokens: int = 2048
|
| 523 |
+
local_max_tokens: int = 512
|
| 524 |
+
|
| 525 |
+
# Auth
|
| 526 |
+
google_api_key: str # required — must be in .env
|
| 527 |
+
```
|
| 528 |
+
|
| 529 |
+
---
|
| 530 |
+
|
| 531 |
+
## 6. Evaluation & Metrics
|
| 532 |
+
|
| 533 |
+
### Retrieval Evaluation (`test_queries` / `POST /api/v1/rag/test`)
|
| 534 |
+
|
| 535 |
+
Tests each (question, expected_document, expected_chunk_index) triple against `hybrid_query`:
|
| 536 |
+
|
| 537 |
+
| Metric | Formula | Interpretation |
|
| 538 |
+
|---|---|---|
|
| 539 |
+
| **Hit Rate** | `hits / total` | % of questions where the exact chunk was retrieved |
|
| 540 |
+
| **Top-1 Hit Rate** | `rank==1 hits / total` | % of questions where exact chunk was top result |
|
| 541 |
+
| **MRR** | `mean(1/rank)` | Mean Reciprocal Rank; higher = correct result ranked earlier |
|
| 542 |
+
| **Doc Precision** | `correct_source_chunks / all_chunks` | How many retrieved chunks came from the right document |
|
| 543 |
+
| **Doc Recall** | `1 if any correct_source_chunk else 0` | Did we retrieve at least one chunk from the right document? |
|
| 544 |
+
| **Doc Noise** | `wrong_source_chunks / all_chunks` | Proportion of off-topic chunks in the result set |
|
| 545 |
+
| **Error Rate** | `1 - hit_rate` | Miss rate for exact chunk retrieval |
|
| 546 |
+
|
| 547 |
+
**Test Input Schema:**
|
| 548 |
+
|
| 549 |
+
```python
|
| 550 |
+
class TestRequestSchema(BaseModel):
|
| 551 |
+
tests: List[Test] # question + document + chunk_index
|
| 552 |
+
k: int = 5
|
| 553 |
+
threshold: float = 0.4
|
| 554 |
+
```
|
| 555 |
+
|
| 556 |
+
### Classifier Evaluation (`test_classifier` / `POST /api/v1/rag/test_classifier`)
|
| 557 |
+
|
| 558 |
+
Evaluates predictions for all 4 classification fields (`type`, `category`, `topic`, `intent`):
|
| 559 |
+
|
| 560 |
+
| Metric | Notes |
|
| 561 |
+
|---|---|
|
| 562 |
+
| **Accuracy** | `sklearn.accuracy_score` |
|
| 563 |
+
| **Precision (macro)** | `zero_division=0` |
|
| 564 |
+
| **Recall (macro)** | `zero_division=0` |
|
| 565 |
+
| **F1 Macro** | Unweighted average across classes |
|
| 566 |
+
| **F1 Weighted** | Class-frequency weighted |
|
| 567 |
+
| **Classification Report** | Full per-class breakdown (`output_dict=True`) |
|
| 568 |
+
|
| 569 |
+
A bundled test dataset is stored in `app/utils/tests.py` as `classifier_test_dataset` and can be executed via `GET /api/v1/rag/test_classifier_dataset`. Results are **memoized** on the `RAGService.evaluation` dict for the lifetime of the server process.
|
| 570 |
+
|
| 571 |
+
---
|
| 572 |
+
|
| 573 |
+
## 7. Known Limitations
|
| 574 |
+
|
| 575 |
+
### Technical Debt
|
| 576 |
+
|
| 577 |
+
- **`preprocess_query` is incomplete.** The function body contains an LLM-powered query-rewriting block that is commented out. Currently it only calls `normalize()` (tokenization only), so no stopword removal or lemmatization is applied to user queries (these steps are applied only to stored documents).
|
| 578 |
+
- **`search_docs` does not honour `filename` as a metadata filter in Chroma.** The filter is applied in Python post-retrieval, which is inefficient for large collections.
|
| 579 |
+
- **Count intent is synthetic.** The `"Total <topic>: N"` chunk is an auto-generated chunk during ingestion, not from the source document. If source data changes, stale count chunks can remain indexed.
|
| 580 |
+
- **`VectorStore.get_dict()` has a `print(type(rows))`** debug statement left in production code.
|
| 581 |
+
- **`FileService.__init__` docstring** contains a stray extra backtick (near the words "class docstring").
|
| 582 |
+
|
| 583 |
+
### Planned but Unimplemented
|
| 584 |
+
|
| 585 |
+
- **Query rewriting via local LLM** — skeleton is commented out in `preprocess_query()`.
|
| 586 |
+
- **Semantic caching** — no query result memoization at the API layer.
|
| 587 |
+
- **Re-ranker** — no cross-encoder re-ranking step; relies only on RRF + boosting.
|
| 588 |
+
- **`topic` field is not included in the ChromaDB hard filter** — only `type` + `intent` are hard-anchored; `category` and `topic` are soft `$or` hints.
|
| 589 |
+
|
| 590 |
+
### Performance Bottlenecks
|
| 591 |
+
|
| 592 |
+
- **Local LLM (LlamaCpp)** is CPU-only with `n_ctx=8096` and `n_threads=4`. Response latency is high (~10–30s) on low-RAM systems.
|
| 593 |
+
- **Classifier uses `SentenceTransformer` + `TF-IDF` features** — inference runs on every request with no caching of query embeddings.
|
| 594 |
+
- **BM25 corpus is rebuilt from scratch per request** — `BM25Retriever.from_documents()` is called inside `_bm25_rank()` each time.
|
| 595 |
+
- **`classifier_test_dataset` in `app/utils/tests.py`** lives in a very large file (1.8MB) that is loaded at import time.
|
| 596 |
+
- **The memoized evaluation** in `rag_service.evaluation` is held in per-process memory: it is not shared between workers if the server runs with multiple workers, and access to it is not synchronized.
|
| 597 |
+
|
| 598 |
+
---
|
| 599 |
+
|
| 600 |
+
## 8. File Structure
|
| 601 |
+
|
| 602 |
+
```
|
| 603 |
+
VGEC-RAG-Chatbot/
|
| 604 |
+
│
|
| 605 |
+
├── app/ # Application package
|
| 606 |
+
│ ├── main.py # FastAPI app, router mounting, CORS middleware
|
| 607 |
+
│ ├── core/
|
| 608 |
+
│ │ ├── config.py # Pydantic Settings (all tuneable params)
|
| 609 |
+
│ │ └── paths.py # Path constants helper
|
| 610 |
+
│ │
|
| 611 |
+
│ ├── api/
|
| 612 |
+
│ │ ├── dependencies.py # lru_cache singleton for RAGService
|
| 613 |
+
│ │ ├── routes/
|
| 614 |
+
│ │ │ ├── rag.py # /rag endpoints (query, test, classifier)
|
| 615 |
+
│ │ │ ├── vector_store.py # /vector endpoints (CRUD for ChromaDB)
|
| 616 |
+
│ │ │ └── settings.py # /settings endpoints
|
| 617 |
+
│ │ └── schemas/
|
| 618 |
+
│ │ ├── requests.py # RAGRequest, PaginationParams, etc.
|
| 619 |
+
│ │ └── tests.py # TestRequestSchema, TestClassifierReqSchema
|
| 620 |
+
│ │
|
| 621 |
+
│ ├── services/
|
| 622 |
+
│ │ ├── rag_service.py # RAGService (main orchestrator)
|
| 623 |
+
│ │ ├── hybrid_retrieval.py # HybridRetrievalService + RRF logic
|
| 624 |
+
│ │ ├── classifier_service.py # Classifier class + singleton clf
|
| 625 |
+
│ │ ├── ingestion_service.py # IngestionService (chunking pipeline)
|
| 626 |
+
│ │ ├── file_service.py # FileService (file I/O + metadata JSON)
|
| 627 |
+
│ │ ├── vector_store.py # VectorStore (thin ChromaDB wrapper)
|
| 628 |
+
│ │ ├── text_splitter.py # TextSplitter (RecursiveCharacter + variants)
|
| 629 |
+
│ │ └── document_loader.py # (legacy loader, not in primary path)
|
| 630 |
+
│ │
|
| 631 |
+
│ ├── utils/
|
| 632 |
+
│ │ ├── preprocessing.py # preprocess(), normalize(), preprocess_query()
|
| 633 |
+
│ │ ├── document_helpers.py # get_references_v2(), build_metadata(), create_documents()
|
| 634 |
+
│ │ ├── model_factory.py # get_llm_model(), get_embedding_model()
|
| 635 |
+
│ │ ├── constants.py # stopwords list, short_words_mappings
|
| 636 |
+
│ │ ├── embeddings.py # (thin embedding util)
|
| 637 |
+
│ │ ├── llm_models.py # (thin LLM util)
|
| 638 |
+
│ │ └── tests.py # classifier_test_dataset (large, 1.8MB)
|
| 639 |
+
│ │
|
| 640 |
+
│ └── prompts/
|
| 641 |
+
│ └── __init__.py # SYSTEM_PROMPT, wrap_exaone()
|
| 642 |
+
│
|
| 643 |
+
├── ml_models/
|
| 644 |
+
│ ├── classifier/
|
| 645 |
+
│ │ └── chatbot_classifier.pkl # Pickled pipeline (models, tfidf, label encoders, etc.)
|
| 646 |
+
│ ├── embeddings/ # (Local embedding model weights, if any)
|
| 647 |
+
│ └── llm/
|
| 648 |
+
│ └── EXAONE-3.5-2.4B-*.gguf # Local LLM weights
|
| 649 |
+
│
|
| 650 |
+
├── data/
|
| 651 |
+
│ ├── department_data/ # Source JSON files per department
|
| 652 |
+
│ │ ├── computer_eng.json
|
| 653 |
+
│ │ ├── civil.json
|
| 654 |
+
│ │ └── ...
|
| 655 |
+
│ ├── documents/ # Persistent copies of ingested files
|
| 656 |
+
│ ├── vector_stores/
|
| 657 |
+
│ │ └── classifier_test_1/ # ChromaDB persist directory
|
| 658 |
+
│ ├── classifier_test_1.json # Ingestion metadata registry (FileService records)
|
| 659 |
+
│ └── other_data/ # Misc data files
|
| 660 |
+
│
|
| 661 |
+
├── temp/ # Staging area for uploaded files (auto-cleared)
|
| 662 |
+
├── scripts/ # Offline scripts (training, testing)
|
| 663 |
+
├── tests/ # Test files
|
| 664 |
+
│
|
| 665 |
+
├── requirements.txt # Pinned production dependencies
|
| 666 |
+
├── .env # Runtime secrets (google_api_key, etc.)
|
| 667 |
+
├── .env.example # Template for .env
|
| 668 |
+
└── CODEBASE_DOCUMENTATION.md # This file
|
| 669 |
+
```
|
| 670 |
+
|
| 671 |
+
---
|
| 672 |
+
|
| 673 |
+
*End of documentation.*
|
College_Overview2.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Vishwakarma Government Engineering College
|
| 3 |
+
source_url: https://www.vgecg.ac.in/index.php
|
| 4 |
+
domain: https://www.vgecg.ac.in
|
| 5 |
+
pathname: /index.php
|
| 6 |
+
visited: 2026-02-15T12:55:40.751Z
|
| 7 |
+
topic: College Overview
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# College Statistics
|
| 11 |
+
|
| 12 |
+
Description:
|
| 13 |
+
This page provides some key statistics about Vishwakarma Government Engineering College.
|
| 14 |
+
|
| 15 |
+
- **Publication:** 48046+
|
| 16 |
+
- **Research Labs:** 13+
|
| 17 |
+
- **Courses:** 12+
|
| 18 |
+
- **Highest Package (Lacs):** 23+
|
| 19 |
+
|
| 20 |
+
Source:
|
| 21 |
+
- https://www.vgecg.ac.in/index.php
|
| 22 |
+
|
| 23 |
+
Keywords:
|
| 24 |
+
- college statistics
|
| 25 |
+
- VGEC overview
|
| 26 |
+
- publications
|
| 27 |
+
- research labs
|
| 28 |
+
- courses
|
| 29 |
+
- placements
|
DOCUMENTATION_PLAN.md
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VGEC RAG Chatbot — Software Documentation Plan
|
| 2 |
+
> Based on IEEE/Industry Standard | Updated: 2026-03-25
|
| 3 |
+
> Reference: `CODEBASE_DOCUMENTATION.md` covers most of Phase 5 already — reuse it.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## DIAGRAMS FIRST — Priority Order
|
| 8 |
+
|
| 9 |
+
> Do all diagrams before writing any prose. Diagrams take the most time and are referenced throughout.
|
| 10 |
+
|
| 11 |
+
| # | Diagram | Phase Used In | Tool | Status |
|
| 12 |
+
|---|---|---|---|---|
|
| 13 |
+
| 1 | High-Level Architecture (Component Diagram) | Phase 5 | Draw.io / Mermaid | [ ] |
|
| 14 |
+
| 2 | Data Flow — Query Path | Phase 5 | Draw.io (DFD Level 2) | [ ] |
|
| 15 |
+
| 3 | Data Flow — Ingestion Path | Phase 5 | Draw.io (DFD Level 2) | [ ] |
|
| 16 |
+
| 4 | Hierarchical Taxonomy Tree (Type→Category→Topic) | Phase 5 | Tree diagram / Mermaid | [ ] |
|
| 17 |
+
| 5 | Filter Decision Flowchart (Strict→Partial→Fallback) | Phase 5 | Flowchart / Draw.io | [ ] |
|
| 18 |
+
| 6 | Hybrid Retrieval Sequence (Vector→BM25→RRF→Boost) | Phase 5 | Sequence diagram / Flow | [ ] |
|
| 19 |
+
| 7 | Use Case Diagram (Student, Faculty, Admin actors) | Phase 4 | Draw.io / PlantUML | [ ] |
|
| 20 |
+
| 8 | System Context Diagram / Level 0 DFD | Phase 2 | Draw.io | [ ] |
|
| 21 |
+
| 9 | Class Diagram (simplified — RAGService + helpers) | Phase 6 | Draw.io / UML | [ ] |
|
| 22 |
+
| 10 | Activity Diagram — Chunking Process | Phase 6 | Activity flow / Draw.io | [ ] |
|
| 23 |
+
| 11 | MRR Bar Chart — Your RAG vs Traditional | Phase 7 | matplotlib / Excel | [ ] |
|
| 24 |
+
| 12 | Noise Rate Bar Chart — Comparison | Phase 7 | matplotlib / Excel | [ ] |
|
| 25 |
+
| 13 | Classifier Confusion Matrix (per field) | Phase 7 | Seaborn heatmap | [ ] |
|
| 26 |
+
| 14 | Deployment Diagram (Express → FastAPI → ChromaDB) | Phase 8 | Draw.io | [ ] |
|
| 27 |
+
| 15 | Future Roadmap / Gantt-style Timeline | Phase 9 | Draw.io / simple table | [ ] |
|
| 28 |
+
|
| 29 |
+
---
|
| 30 |
+
|
| 31 |
+
## Phase 1 — Front Matter
|
| 32 |
+
**Est. time: 1–2 hrs | No diagrams needed**
|
| 33 |
+
|
| 34 |
+
- [ ] Title Page
|
| 35 |
+
- Project: VGEC RAG Chatbot
|
| 36 |
+
- Subtitle: Retrieval-Augmented Generation System for Academic Queries
|
| 37 |
+
- Name, Roll No., Department, Submission Date
|
| 38 |
+
- Guide name, College name
|
| 39 |
+
- [ ] Abstract (150–200 words)
|
| 40 |
+
- Problem: Students struggle to find accurate VGEC information, which is scattered across the college website
|
| 41 |
+
- Solution: RAG-based chatbot with hierarchical classification + hybrid retrieval
|
| 42 |
+
- Key results: MRR, noise reduction *(fill placeholders after deployment)*
|
| 43 |
+
- Tech: FastAPI, ChromaDB, Gemini, Logistic Regression classifier
|
| 44 |
+
- [ ] Table of Contents *(auto-generate at end — structure it now)*
|
| 45 |
+
- [ ] List of Figures *(auto-generate at end)*
|
| 46 |
+
- [ ] List of Abbreviations
|
| 47 |
+
- RAG, BM25, RRF, LLM, MRR, API, VGEC, HOD, etc.
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## Phase 2 — Introduction
|
| 52 |
+
**Est. time: 2–3 hrs | Diagrams needed: System Context Diagram (Diagram #8)**
|
| 53 |
+
|
| 54 |
+
- [ ] 2.1 Background
|
| 55 |
+
- Current state: Static website, PDFs, manual queries to admin office
|
| 56 |
+
- Pain points: Information scattered, no natural language interface
|
| 57 |
+
- [ ] 2.2 Problem Statement
|
| 58 |
+
- Lack of intelligent query system for institutional data
|
| 59 |
+
- Need for domain-specific (VGEC) accurate retrieval
|
| 60 |
+
- [ ] 2.3 Objectives
|
| 61 |
+
- Build RAG pipeline with >75% MRR
|
| 62 |
+
- Implement metadata classification for pre-filtering
|
| 63 |
+
- Provide REST API for frontend integration
|
| 64 |
+
- Deploy with a secure Express gateway
|
| 65 |
+
- [ ] 2.4 Scope
|
| 66 |
+
- **In scope:** Department data (faculty, labs, syllabus, HOD, intake), REST API, classification, evaluation
|
| 67 |
+
- **Out of scope:** Real-time website scraping, admissions processing, multimedia
|
| 68 |
+
|
| 69 |
+
> **Reuse from:** `CODEBASE_DOCUMENTATION.md` Section 1 (Project Overview)
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
+
|
| 73 |
+
## Phase 3 — Literature Review / Related Work
|
| 74 |
+
**Est. time: 2–3 hrs | Diagrams needed: Evolution timeline (simple horizontal flow)**
|
| 75 |
+
|
| 76 |
+
- [ ] 3.1 Traditional Chatbots
|
| 77 |
+
- Rule-based (ALICE, ELIZA) — rigid, no context
|
| 78 |
+
- Keyword matching chatbots — no semantic understanding
|
| 79 |
+
- [ ] 3.2 Modern RAG Systems
|
| 80 |
+
- OpenAI GPT-4 + vector DB (generic, not domain-specific)
|
| 81 |
+
- LlamaIndex / LangChain baseline RAG — no metadata filtering
|
| 82 |
+
- [ ] 3.3 Hybrid Search Systems
|
| 83 |
+
- Elasticsearch (BM25 only), Cohere (vector only)
|
| 84 |
+
- RRF as the standard fusion method (reference paper)
|
| 85 |
+
- [ ] 3.4 Your Differentiation
|
| 86 |
+
- Hierarchical classifier (Type→Category→Topic→Intent) for pre-filtering
|
| 87 |
+
- Hybrid retrieval (BM25 + Vector + RRF) vs pure semantic search
|
| 88 |
+
- Domain-specific ingestion strategy (intent-aware JSON chunking)
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## Phase 4 — System Analysis & Requirements
|
| 93 |
+
**Est. time: 3–4 hrs | Diagrams needed: Use Case Diagram (#7), Level 1 DFD**
|
| 94 |
+
|
| 95 |
+
- [ ] 4.1 Functional Requirements
|
| 96 |
+
- FR1: Ingest structured JSON and unstructured documents (PDF, MD, TXT)
|
| 97 |
+
- FR2: Classify queries into metadata filters (type, category, topic, intent)
|
| 98 |
+
- FR3: Retrieve relevant chunks with configurable similarity threshold
|
| 99 |
+
- FR4: Generate contextual answers using Gemini or local LLM
|
| 100 |
+
- FR5: Provide CRUD operations on vector store via REST API
|
| 101 |
+
- FR6: Rate-limit and authenticate requests via Express gateway
|
| 102 |
+
- [ ] 4.2 Non-Functional Requirements
|
| 103 |
+
- Performance: <5s response (cloud), <30s (local LLM)
|
| 104 |
+
- Accuracy: MRR >0.75
|
| 105 |
+
- Security: Admin routes protected by JWT, Python API never publicly exposed
|
| 106 |
+
- Scalability: Support 10,000+ chunks in ChromaDB
|
| 107 |
+
- [ ] 4.3 Use Case Diagram *(Diagram #7)*
|
| 108 |
+
- Actors: Student, Faculty, Admin
|
| 109 |
+
- Student use cases: Submit query, View answer, View references
|
| 110 |
+
- Admin use cases: Ingest document, Delete document, Run evaluation, Change settings
|
| 111 |
+
- [ ] 4.4 Level 1 DFD
|
| 112 |
+
- Major processes: Ingest, Classify, Retrieve, Generate, Evaluate
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
## Phase 5 — System Design
|
| 117 |
+
**Est. time: 4–6 hrs | MOST MARKS, MOST DIAGRAMS**
|
| 118 |
+
**Diagrams needed: #1, #2, #3, #4, #5, #6**
|
| 119 |
+
|
| 120 |
+
> **Reuse heavily from:** `CODEBASE_DOCUMENTATION.md` Sections 2, 3, 4
|
| 121 |
+
|
| 122 |
+
- [ ] 5.1 Architecture Design
|
| 123 |
+
- [ ] High-Level Component Diagram *(Diagram #1)*
|
| 124 |
+
- [ ] Data Flow — Ingestion Path *(Diagram #3)*
|
| 125 |
+
- [ ] Data Flow — Query Path *(Diagram #2)*
|
| 126 |
+
- [ ] Technology Stack Table (already in CODEBASE_DOCUMENTATION.md Section 1)
|
| 127 |
+
- [ ] 5.2 Database Design
|
| 128 |
+
- [ ] Vector DB Metadata Schema (field table — already in CODEBASE_DOCUMENTATION.md Section 3)
|
| 129 |
+
- [ ] Source JSON Schema (already documented)
|
| 130 |
+
- [ ] File Tracking Registry Schema (FileService JSON records)
|
| 131 |
+
- [ ] 5.3 Algorithm Design
|
| 132 |
+
- [ ] Hierarchical Taxonomy Tree *(Diagram #4)* (Type → Category → Topic → Intent)
|
| 133 |
+
- [ ] Filter Decision Flowchart *(Diagram #5)* (confidence thresholds → Strict/Partial/Fallback)
|
| 134 |
+
- [ ] Hybrid Retrieval Sequence *(Diagram #6)* (Vector → BM25 → RRF formula → Boost → Threshold)
|
| 135 |
+
- [ ] Chunking Strategy (JSON intent-aware vs RecursiveCharacterTextSplitter)
|
| 136 |
+
- [ ] RRF Formula — document with the actual equation:
|
| 137 |
+
```
|
| 138 |
+
score(d) = bm25_weight * 1/(rrf_k + rank_bm25)
|
| 139 |
+
+ vector_weight * 1/(rrf_k + rank_vec)
|
| 140 |
+
```
|
| 141 |
+
- [ ] 5.4 Interface Design
|
| 142 |
+
- [ ] API Endpoint Table — /rag and /vector routes (already in CODEBASE_DOCUMENTATION.md Section 5)
|
| 143 |
+
- [ ] Request/Response JSON examples (sample curl or Postman output)
|
| 144 |
+
- [ ] Express Gateway design (rate limit + auth + concurrency queue)
|
| 145 |
+
|
| 146 |
+
---
|
| 147 |
+
|
| 148 |
+
## Phase 6 — Implementation
|
| 149 |
+
**Est. time: 2–3 hrs | Diagrams needed: Directory tree, #9 (class diagram), #10 (activity diagram)**
|
| 150 |
+
|
| 151 |
+
> **Reuse heavily from:** `CODEBASE_DOCUMENTATION.md` Section 5 and Section 8
|
| 152 |
+
|
| 153 |
+
- [ ] 6.1 Directory Structure (already in CODEBASE_DOCUMENTATION.md Section 8)
|
| 154 |
+
- [ ] 6.2 Module Descriptions (already in CODEBASE_DOCUMENTATION.md Section 5)
|
| 155 |
+
- [ ] 6.3 Key Code Snippets *(do NOT paste full files — only algorithm excerpts)*
|
| 156 |
+
- [ ] Filter construction logic (`_build_filter` method)
|
| 157 |
+
- [ ] RRF scoring loop
|
| 158 |
+
- [ ] Intent-aware JSON chunking (`handle_json_docs`)
|
| 159 |
+
- [ ] Classifier prediction + threshold gating
|
| 160 |
+
- [ ] 6.4 Configuration
|
| 161 |
+
- [ ] `.env` variables table (already in CODEBASE_DOCUMENTATION.md Section 5)
|
| 162 |
+
- [ ] Hyperparameter table (BM25 weights, thresholds, chunk size)
|
| 163 |
+
- [ ] 6.5 Express Gateway Implementation
|
| 164 |
+
- [ ] Rate limiting configuration
|
| 165 |
+
- [ ] JWT auth middleware snippet
|
| 166 |
+
- [ ] Concurrency queue (`p-limit`) snippet
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
## Phase 7 — Testing & Evaluation
|
| 171 |
+
**Est. time: 3–4 hrs | Diagrams needed: #11 (MRR bar chart), #12 (noise chart), #13 (confusion matrix)**
|
| 172 |
+
> ⚠️ PLACEHOLDER — fill real numbers and screenshots AFTER deployment
|
| 173 |
+
|
| 174 |
+
- [ ] 7.1 Test Plan
|
| 175 |
+
- [ ] Unit tests: Classifier accuracy per field (run `/test_classifier_dataset`)
|
| 176 |
+
- [ ] Integration tests: End-to-end hybrid query
|
| 177 |
+
- [ ] Performance: Measure average latency (cloud vs local)
|
| 178 |
+
- [ ] 7.2 Results
|
| 179 |
+
- [ ] Comparison Table: Traditional pure-vector RAG vs Your Hybrid RAG
|
| 180 |
+
- Metrics: MRR, Hit Rate, Top-1 Hit Rate, Noise Rate, Latency
|
| 181 |
+
- [ ] MRR Bar Chart by query intent type *(Diagram #11)*
|
| 182 |
+
- [ ] Noise Rate comparison *(Diagram #12)*
|
| 183 |
+
- [ ] Classifier Confusion Matrix per field *(Diagram #13)*
|
| 184 |
+
- [ ] 7.3 Sample Query Demonstrations
|
| 185 |
+
- Choose 3–5 representative queries, show:
|
| 186 |
+
- Input question
|
| 187 |
+
- Classifier output (type, category, topic, intent + confidences)
|
| 188 |
+
- Retrieved chunks with scores
|
| 189 |
+
- Final LLM answer
|
| 190 |
+
|
| 191 |
+
---
|
| 192 |
+
|
| 193 |
+
## Phase 8 — Deployment
|
| 194 |
+
**Est. time: 1–2 hrs | Diagrams needed: Deployment diagram (#14)**
|
| 195 |
+
> ⚠️ PLACEHOLDER — fill AFTER actual deployment
|
| 196 |
+
|
| 197 |
+
- [ ] 8.1 System Requirements
|
| 198 |
+
- Hardware: 8GB RAM, 4-core CPU (local LLM) OR Google API key (Gemini)
|
| 199 |
+
- Software: Python 3.9+, Node.js 18+, ChromaDB
|
| 200 |
+
- [ ] 8.2 Deployment Architecture *(Diagram #14)*
|
| 201 |
+
- Frontend → Express Gateway → FastAPI → ChromaDB
|
| 202 |
+
- [ ] 8.3 Installation Steps
|
| 203 |
+
- Clone → `pip install -r requirements.txt` → Set `.env` → Run ingestion → Start API
|
| 204 |
+
- Express: `npm install` → Set `.env` → `node server.js`
|
| 205 |
+
- [ ] 8.4 Screenshots *(fill after deployment)*
|
| 206 |
+
- [ ] Swagger UI (`/docs`)
|
| 207 |
+
- [ ] Sample chatbot interaction
|
| 208 |
+
- [ ] Admin panel
|
| 209 |
+
- [ ] Classification test panel
|
| 210 |
+
|
| 211 |
+
---
|
| 212 |
+
|
| 213 |
+
## Phase 9 — Future Scope & Conclusion
|
| 214 |
+
**Est. time: 1–2 hrs | Diagrams needed: Roadmap (#15)**
|
| 215 |
+
|
| 216 |
+
- [ ] 9.1 Future Enhancements
|
| 217 |
+
- Dynamic LLM switching via admin UI (ModelManager architecture)
|
| 218 |
+
- Cross-encoder re-ranking step (once resources become available)
|
| 219 |
+
- Query result caching layer
|
| 220 |
+
- Automated metadata prediction during ingestion (classifier-assisted)
|
| 221 |
+
- Website scraping for real-time data updates
|
| 222 |
+
- [ ] 9.2 Known Limitations (already in CODEBASE_DOCUMENTATION.md Section 7)
|
| 223 |
+
- Local LLM latency (CPU-bound, no GPU)
|
| 224 |
+
- BM25 corpus rebuilt per request
|
| 225 |
+
- No real-time data — static knowledge base
|
| 226 |
+
- [ ] 9.3 Conclusion
|
| 227 |
+
- Successfully built domain-specific RAG with hybrid retrieval
|
| 228 |
+
- Hierarchical classification reduces noise and improves precision
|
| 229 |
+
- Secure deployment with Express gateway protects the inference server
|
| 230 |
+
|
| 231 |
+
---
|
| 232 |
+
|
| 233 |
+
## Phase 10 — References & Appendices
|
| 234 |
+
**Est. time: 1–2 hrs | No diagrams needed**
|
| 235 |
+
|
| 236 |
+
- [ ] 10.1 References
|
| 237 |
+
- LangChain documentation
|
| 238 |
+
- ChromaDB documentation
|
| 239 |
+
- Original RRF paper (Cormack et al., 2009)
|
| 240 |
+
- Gemini API documentation
|
| 241 |
+
- VGEC official website (data source)
|
| 242 |
+
- BM25 (Robertson & Zaragoza, 2009)
|
| 243 |
+
- Sentence Transformers (Reimers & Gurevych, 2019)
|
| 244 |
+
- [ ] 10.2 Appendix A — MASTER_INDEX full taxonomy
|
| 245 |
+
- [ ] 10.3 Appendix B — Full API documentation (export from Swagger `/docs`)
|
| 246 |
+
- [ ] 10.4 Appendix C — Sample classifier training data
|
| 247 |
+
- [ ] 10.5 Appendix D — Sample department JSON format
|
| 248 |
+
|
| 249 |
+
---
|
| 250 |
+
|
| 251 |
+
## Execution Timeline
|
| 252 |
+
|
| 253 |
+
| Phase | When | Priority |
|
| 254 |
+
|---|---|---|
|
| 255 |
+
| **All Diagrams** | Start NOW (before writing prose) | 🔴 Critical |
|
| 256 |
+
| Phase 1–3 (Intro, Lit Review) | Day 1 | Must have |
|
| 257 |
+
| Phase 4–5 (Design) | Day 2–3 | 🔴 Critical — most marks |
|
| 258 |
+
| Phase 6 (Implementation) | Day 4 | Must have |
|
| 259 |
+
| Phase 7 (Testing) | After deployment — Day 5 | 🔴 Critical — proof |
|
| 260 |
+
| Phase 8 (Deployment) | After deployment | Must have |
|
| 261 |
+
| Phase 9–10 (Future, Refs) | Day 6 | Finish strong |
|
| 262 |
+
| Final PDF export + proofread | Last | Required |
|
| 263 |
+
|
| 264 |
+
---
|
| 265 |
+
|
| 266 |
+
## Reuse Map — What's Already Written
|
| 267 |
+
|
| 268 |
+
| Documentation Section | Already in |
|
| 269 |
+
|---|---|
|
| 270 |
+
| System Architecture (components, data flow) | `CODEBASE_DOCUMENTATION.md` Section 2 |
|
| 271 |
+
| Tech Stack Table | `CODEBASE_DOCUMENTATION.md` Section 1 |
|
| 272 |
+
| Metadata Schema / Taxonomy | `CODEBASE_DOCUMENTATION.md` Section 3 |
|
| 273 |
+
| Retrieval Pipeline steps | `CODEBASE_DOCUMENTATION.md` Section 4 |
|
| 274 |
+
| All class/method descriptions | `CODEBASE_DOCUMENTATION.md` Section 5 |
|
| 275 |
+
| Metrics definitions | `CODEBASE_DOCUMENTATION.md` Section 6 |
|
| 276 |
+
| Known Limitations | `CODEBASE_DOCUMENTATION.md` Section 7 |
|
| 277 |
+
| File Structure Tree | `CODEBASE_DOCUMENTATION.md` Section 8 |
|
Dockerfile
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 2 |
+
# VGEC RAG Chatbot — Dockerfile for Hugging Face Spaces
|
| 3 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 4 |
+
# HF Spaces requirements:
|
| 5 |
+
# • Port MUST be 7860
|
| 6 |
+
# • GOOGLE_API_KEY must be set as a Space Secret in HF UI
|
| 7 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 8 |
+
|
| 9 |
+
FROM python:3.11-slim
|
| 10 |
+
|
| 11 |
+
# ── System dependencies ───────────────────────────────────────────────────────
|
| 12 |
+
# build-essential → needed by chromadb (hnswlib C extension)
|
| 13 |
+
# libgomp1 → needed by sentence-transformers / scikit-learn OpenMP
|
| 14 |
+
# git → needed by some pip packages that install from git
|
| 15 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 16 |
+
build-essential \
|
| 17 |
+
libgomp1 \
|
| 18 |
+
git \
|
| 19 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 20 |
+
|
| 21 |
+
# ── Working directory ─────────────────────────────────────────────────────────
|
| 22 |
+
WORKDIR /app
|
| 23 |
+
|
| 24 |
+
# ── Python dependencies ───────────────────────────────────────────────────────
|
| 25 |
+
# Copy requirements first so Docker caches this layer separately from source code.
|
| 26 |
+
# Any requirements change rebuilds from here; source code changes don't.
|
| 27 |
+
COPY requirements.txt .
|
| 28 |
+
|
| 29 |
+
# Install CPU-only PyTorch FIRST (prevents pip from pulling 2+ GB GPU wheels
|
| 30 |
+
# when sentence-transformers later requests torch as a dependency).
|
| 31 |
+
RUN pip install --no-cache-dir \
|
| 32 |
+
torch==2.2.2 \
|
| 33 |
+
--index-url https://download.pytorch.org/whl/cpu
|
| 34 |
+
|
| 35 |
+
# Install the rest of the requirements.
|
| 36 |
+
# llama-cpp-python is intentionally excluded — Gemini-only deployment.
|
| 37 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 38 |
+
|
| 39 |
+
# Download the spaCy English model at build time so it's baked into the image.
|
| 40 |
+
RUN python -m spacy download en_core_web_sm
|
| 41 |
+
|
| 42 |
+
# ── Application source ────────────────────────────────────────────────────────
|
| 43 |
+
COPY . .
|
| 44 |
+
|
| 45 |
+
# ── Environment variables ─────────────────────────────────────────────────────
|
| 46 |
+
# Tell Python not to buffer stdout/stderr (so logs appear in real time on HF).
|
| 47 |
+
ENV PYTHONUNBUFFERED=1
|
| 48 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 49 |
+
|
| 50 |
+
# LLM mode — overrides the config.py default; HF Spaces will use Gemini API.
|
| 51 |
+
# GOOGLE_API_KEY is NOT set here — it must be added as a HF Space Secret.
|
| 52 |
+
ENV LLM_PROVIDER=gemini
|
| 53 |
+
ENV ENABLE_FALLBACK=false
|
| 54 |
+
|
| 55 |
+
# Point sentence-transformers cache inside /app so it's predictable.
|
| 56 |
+
ENV SENTENCE_TRANSFORMERS_HOME=/app/ml_models/embeddings
|
| 57 |
+
ENV HF_HOME=/app/.cache/huggingface
|
| 58 |
+
|
| 59 |
+
# ── Port ──────────────────────────────────────────────────────────────────────
|
| 60 |
+
# HF Spaces requires exactly port 7860.
|
| 61 |
+
EXPOSE 7860
|
| 62 |
+
|
| 63 |
+
# ── Startup ───────────────────────────────────────────────────────────────────
|
| 64 |
+
# No --reload (dev-only flag).
|
| 65 |
+
# --workers 1 keeps RAM usage predictable on the free tier (2 vCPU, 16 GB RAM).
|
| 66 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
LOCAL_MODEL_TRUNCATION_FIX.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Local Model Truncation Fix
|
| 2 |
+
|
| 3 |
+
## 🐛 Problem
|
| 4 |
+
|
| 5 |
+
The local model was cutting off responses mid-sentence, like:
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
"...applications for various purposes such as"
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
## 🔍 Root Cause
|
| 12 |
+
|
| 13 |
+
The `ChatLlamaCpp` model configuration was **missing the `max_tokens` parameter**.
|
| 14 |
+
|
| 15 |
+
Without this parameter:
|
| 16 |
+
|
| 17 |
+
- The model defaults to a very low token generation limit
|
| 18 |
+
- Responses get truncated mid-sentence
|
| 19 |
+
- No warning or error is shown
|
| 20 |
+
|
| 21 |
+
## ✅ Solution Applied
|
| 22 |
+
|
| 23 |
+
### 1. Added `max_tokens` to Local Model Configuration
|
| 24 |
+
|
| 25 |
+
**File:** `app/utils/model_factory.py`
|
| 26 |
+
|
| 27 |
+
**Before:**
|
| 28 |
+
|
| 29 |
+
```python
|
| 30 |
+
model = ChatLlamaCpp(
|
| 31 |
+
model_path=str(model_file),
|
| 32 |
+
n_ctx=4096,
|
| 33 |
+
n_batch=512,
|
| 34 |
+
n_threads=4,
|
| 35 |
+
temperature=0.05,
|
| 36 |
+
# ❌ Missing max_tokens!
|
| 37 |
+
)
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
**After:**
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
model = ChatLlamaCpp(
|
| 44 |
+
model_path=str(model_file),
|
| 45 |
+
n_ctx=4096,
|
| 46 |
+
n_batch=512,
|
| 47 |
+
n_threads=4,
|
| 48 |
+
max_tokens=settings.local_max_tokens, # ✅ FIXED!
|
| 49 |
+
temperature=0.05,
|
| 50 |
+
)
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### 2. Increased Gemini Token Limit
|
| 54 |
+
|
| 55 |
+
**Before:** `max_output_tokens=512` (too low)
|
| 56 |
+
**After:** `max_output_tokens=settings.max_output_tokens` (2048)
|
| 57 |
+
|
| 58 |
+
### 3. Made Settings Configurable
|
| 59 |
+
|
| 60 |
+
**File:** `app/core/config.py`
|
| 61 |
+
|
| 62 |
+
Added:
|
| 63 |
+
|
| 64 |
+
```python
|
| 65 |
+
# Generation Settings
|
| 66 |
+
max_output_tokens: int = 2048 # Max tokens for Gemini responses
|
| 67 |
+
local_max_tokens: int = 2048 # Max tokens for local model responses
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## 📊 Impact
|
| 71 |
+
|
| 72 |
+
### Before:
|
| 73 |
+
|
| 74 |
+
- **Gemini**: 512 max tokens (~350-400 words)
|
| 75 |
+
- **Local**: Unknown (probably ~100-200 tokens)
|
| 76 |
+
- **Result**: Truncated responses
|
| 77 |
+
|
| 78 |
+
### After:
|
| 79 |
+
|
| 80 |
+
- **Gemini**: 2048 max tokens (~1400-1500 words)
|
| 81 |
+
- **Local**: 2048 max tokens (~1400-1500 words)
|
| 82 |
+
- **Result**: Complete, full responses ✅
|
| 83 |
+
|
| 84 |
+
## 🎯 Expected Behavior Now
|
| 85 |
+
|
| 86 |
+
1. **Local model should complete sentences** instead of cutting off
|
| 87 |
+
2. **Responses can be up to ~1500 words** before hitting the limit
|
| 88 |
+
3. **Both models have equal response length capacity**
|
| 89 |
+
|
| 90 |
+
## ⚙️ How to Adjust
|
| 91 |
+
|
| 92 |
+
If you want even longer responses, edit `app/core/config.py`:
|
| 93 |
+
|
| 94 |
+
```python
|
| 95 |
+
# For longer responses (up to ~3000 words)
|
| 96 |
+
max_output_tokens: int = 4096
|
| 97 |
+
local_max_tokens: int = 4096
|
| 98 |
+
|
| 99 |
+
# For shorter responses (to save processing time)
|
| 100 |
+
max_output_tokens: int = 1024
|
| 101 |
+
local_max_tokens: int = 1024
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
## 🧪 Test It
|
| 105 |
+
|
| 106 |
+
Try asking the same question again. The local model should now:
|
| 107 |
+
|
| 108 |
+
1. ✅ Complete full sentences
|
| 109 |
+
2. ✅ Provide detailed answers
|
| 110 |
+
3. ✅ Not cut off mid-word
|
| 111 |
+
|
| 112 |
+
## 📝 Additional Notes
|
| 113 |
+
|
| 114 |
+
### Why 2048 tokens?
|
| 115 |
+
|
| 116 |
+
- Good balance between completeness and speed
|
| 117 |
+
- Covers most Q&A scenarios
|
| 118 |
+
- Prevents overly long responses
|
| 119 |
+
|
| 120 |
+
### What is a "token"?
|
| 121 |
+
|
| 122 |
+
- A token ≈ 0.75 words on average
|
| 123 |
+
- 2048 tokens ≈ 1500 words
|
| 124 |
+
- 4096 tokens ≈ 3000 words
|
| 125 |
+
|
| 126 |
+
### Parameters Explained:
|
| 127 |
+
|
| 128 |
+
- `n_ctx=4096`: Total context window (input + output)
|
| 129 |
+
- `max_tokens=2048`: Maximum output only
|
| 130 |
+
- This means: max ~2048 input + 2048 output = 4096 total
|
| 131 |
+
|
| 132 |
+
### Other Fixes Applied:
|
| 133 |
+
|
| 134 |
+
- Added comments to all parameters for clarity
|
| 135 |
+
- Made token limits configurable via settings
|
| 136 |
+
- Ensured both models have consistent behavior
|
MARKDOWN_FIX_SUMMARY.md
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Markdown Splitting Fix - Summary
|
| 2 |
+
|
| 3 |
+
## Problem
|
| 4 |
+
|
| 5 |
+
The markdown files with `---` section delimiters were being split at every `#` header, creating many small chunks with insufficient context.
|
| 6 |
+
|
| 7 |
+
### Example Issue:
|
| 8 |
+
|
| 9 |
+
```
|
| 10 |
+
# Faculty of the Information & Communication Technology Department
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
This header alone was becoming a separate chunk because the default markdown splitter splits on headers.
|
| 14 |
+
|
| 15 |
+
## Solution Implemented
|
| 16 |
+
|
| 17 |
+
### 1. Created New Splitter Method: `for_markdown_with_sections()`
|
| 18 |
+
|
| 19 |
+
**Location:** `app/services/text_splitter.py`
|
| 20 |
+
|
| 21 |
+
**Custom Separators Priority:**
|
| 22 |
+
|
| 23 |
+
1. `\n---\n` - Section delimiters (HIGHEST PRIORITY)
|
| 24 |
+
2. `\n\n\n` - Triple newlines
|
| 25 |
+
3. `\n\n` - Paragraphs
|
| 26 |
+
4. `\n` - Single newlines
|
| 27 |
+
5. `. ` - Sentences
|
| 28 |
+
6. ` ` - Words
|
| 29 |
+
7. `""` (empty string) - Characters (last resort)
|
| 30 |
+
|
| 31 |
+
This ensures sections stay together and headers aren't split separately.
|
| 32 |
+
|
| 33 |
+
### 2. Updated RAG Service
|
| 34 |
+
|
| 35 |
+
**Location:** `app/services/rag_service.py` (line 77-82)
|
| 36 |
+
|
| 37 |
+
**Changed from:**
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
markdown_splitter = self.text_splitter.for_markdown(
|
| 41 |
+
chunk_size=chunk_size,
|
| 42 |
+
chunk_overlap=chunk_overlap
|
| 43 |
+
)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
**Changed to:**
|
| 47 |
+
|
| 48 |
+
```python
|
| 49 |
+
markdown_splitter = TextSplitter.for_markdown_with_sections(
|
| 50 |
+
chunk_size=chunk_size,
|
| 51 |
+
chunk_overlap=chunk_overlap
|
| 52 |
+
)
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### 3. Updated Document Helpers
|
| 56 |
+
|
| 57 |
+
**Location:** `app/utils/document_helpers.py` (line 161-167)
|
| 58 |
+
|
| 59 |
+
Added auto-detection for markdown with sections:
|
| 60 |
+
|
| 61 |
+
```python
|
| 62 |
+
# Use section-aware splitter if text contains markdown section delimiters
|
| 63 |
+
if "\n---\n" in text or text.startswith("---\n"):
|
| 64 |
+
splitter = TextSplitter.for_markdown_with_sections()
|
| 65 |
+
else:
|
| 66 |
+
splitter = TextSplitter()
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## Expected Results
|
| 70 |
+
|
| 71 |
+
### Before (with `for_markdown()`):
|
| 72 |
+
|
| 73 |
+
- **Many small chunks** - Headers split separately
|
| 74 |
+
- Example: "# Faculty..." becomes its own 50-character chunk
|
| 75 |
+
- Poor context for RAG retrieval
|
| 76 |
+
|
| 77 |
+
### After (with `for_markdown_with_sections()`):
|
| 78 |
+
|
| 79 |
+
- **Fewer, more meaningful chunks** - Sections kept together
|
| 80 |
+
- Headers stay with their content
|
| 81 |
+
- Better context for RAG retrieval
|
| 82 |
+
- Reduced number of chunks overall
|
| 83 |
+
|
| 84 |
+
## How to Use
|
| 85 |
+
|
| 86 |
+
### For File Upload (Already Applied):
|
| 87 |
+
|
| 88 |
+
When you upload a `.md` file via the POST endpoint, it will automatically:
|
| 89 |
+
|
| 90 |
+
1. Detect it's a markdown file
|
| 91 |
+
2. Use `for_markdown_with_sections()` splitter
|
| 92 |
+
3. Keep sections together
|
| 93 |
+
|
| 94 |
+
### For Raw Text Upload:
|
| 95 |
+
|
| 96 |
+
When posting raw text with `---` delimiters:
|
| 97 |
+
|
| 98 |
+
1. The system auto-detects section delimiters
|
| 99 |
+
2. Applies the section-aware splitter
|
| 100 |
+
3. Preserves semantic structure
|
| 101 |
+
|
| 102 |
+
## Configuration
|
| 103 |
+
|
| 104 |
+
You can still adjust chunk size in `app/core/config.py`:
|
| 105 |
+
|
| 106 |
+
```python
|
| 107 |
+
chunk_size: int = 768 # Adjust as needed
|
| 108 |
+
chunk_overlap: int = 200 # Adjust overlap
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## Next Steps
|
| 112 |
+
|
| 113 |
+
Try uploading your markdown file again. You should see:
|
| 114 |
+
|
| 115 |
+
- ✅ Fewer total chunks
|
| 116 |
+
- ✅ Each chunk contains header + related content
|
| 117 |
+
- ✅ Better semantic coherence
|
| 118 |
+
- ✅ Improved RAG retrieval quality
|
README.md
CHANGED
|
@@ -1,10 +1,65 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RAG (Retrieval-Augmented Generation) Project
|
| 2 |
+
|
| 3 |
+
## Services
|
| 4 |
+
|
| 5 |
+
### Available Services
|
| 6 |
+
|
| 7 |
+
1. **Document Loader** (`services.document_loader`)
|
| 8 |
+
- Load PDF documents
|
| 9 |
+
- Support for single and multiple file loading
|
| 10 |
+
- Lazy loading support
|
| 11 |
+
|
| 12 |
+
2. **Vector Store** (`services.VectorStore`)
|
| 13 |
+
- Similarity search
|
| 14 |
+
- Document management (add, update, delete)
|
| 15 |
+
- Metadata filtering
|
| 16 |
+
|
| 17 |
+
3. **Text Splitter** (`services.TextSplitter`) ✅
|
| 18 |
+
- Recursive character text splitting
|
| 19 |
+
- Language-specific splitting (20+ languages)
|
| 20 |
+
- See [docs/TEXT_SPLITTER.md](docs/TEXT_SPLITTER.md) for full documentation
|
| 21 |
+
|
| 22 |
+
4. **RAG Service** (`services.RAGService`) ✅ **NEW**
|
| 23 |
+
- Integrates Document Loader, Text Splitter, Vector Store
|
| 24 |
+
- Powered by **Google Gemini** LLM
|
| 25 |
+
- Creates a complete RAG pipeline with retrieval & generation
|
| 26 |
+
|
| 27 |
+
## Quick Start
|
| 28 |
+
|
| 29 |
+
```python
|
| 30 |
+
from services import document_loader, TextSplitter, VectorStore
|
| 31 |
+
from libs import ROOT_PATH
|
| 32 |
+
|
| 33 |
+
# Load documents
|
| 34 |
+
pdf_path = ROOT_PATH / "document.pdf"
|
| 35 |
+
doc_obj = document_loader(filepath=pdf_path)
|
| 36 |
+
documents = doc_obj.load()
|
| 37 |
+
|
| 38 |
+
# Split into chunks
|
| 39 |
+
splitter = TextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 40 |
+
chunks = splitter.split_documents(documents)
|
| 41 |
+
|
| 42 |
+
# Add to vector store
|
| 43 |
+
# vector_store.add_documents(chunks)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## Examples
|
| 47 |
+
|
| 48 |
+
Run the TextSplitter examples:
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
python examples_text_splitter.py
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Tasks
|
| 55 |
+
|
| 56 |
+
- [x] Document Loader
|
| 57 |
+
- [ ] Multiple PDF loader
|
| 58 |
+
- [ ] if txt then txt loader
|
| 59 |
+
- [ ] preprocessing
|
| 60 |
+
- [ ] stop_words removal
|
| 61 |
+
- [ ] punctuations
|
| 62 |
+
- [ ] lowercasing
|
| 63 |
+
- [ ] lemmatization
|
| 64 |
+
- [x] Recursive TextSplitter ✅
|
| 65 |
+
- [ ] Assign Them Metadata properly!
|
WHY_LOCAL_NOT_WORKING.md
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Why Local Model Isn't Working - Diagnosis
|
| 2 |
+
|
| 3 |
+
## 🐛 Problems Found:
|
| 4 |
+
|
| 5 |
+
### 1. **LRU Cache Keeps Old Model** (PRIMARY ISSUE)
|
| 6 |
+
|
| 7 |
+
**File:** `app/api/dependencies.py` (line 13)
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
@lru_cache() # ❌ This caches the RAG service FOREVER
|
| 11 |
+
def get_rag_service() -> RAGService:
|
| 12 |
+
llm_model = get_llm_model() # Model initialized ONCE
|
| 13 |
+
...
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
**Impact:**
|
| 17 |
+
|
| 18 |
+
- Model is loaded when server FIRST starts
|
| 19 |
+
- Even if you change config, the OLD model stays in memory
|
| 20 |
+
- `@lru_cache()` never clears until server is fully restarted
|
| 21 |
+
- Auto-reload doesn't clear the cache!
|
| 22 |
+
|
| 23 |
+
### 2. **Missing Fallback Trigger**
|
| 24 |
+
|
| 25 |
+
**File:** `app/utils/model_factory.py`
|
| 26 |
+
|
| 27 |
+
The fallback logic EXISTS but it's not being triggered because:
|
| 28 |
+
|
| 29 |
+
- The Gemini model initialization happens at startup (cached)
|
| 30 |
+
- The error happens during model.invoke() (at query time)
|
| 31 |
+
- But fallback only works during get_llm_model() (at init time)
|
| 32 |
+
|
| 33 |
+
### 3. **Missing max_output_tokens** (FIXED)
|
| 34 |
+
|
| 35 |
+
It was deleted from config.py, which caused an AttributeError.
|
| 36 |
+
✅ I restored it.
|
| 37 |
+
|
| 38 |
+
## ✅ Solutions:
|
| 39 |
+
|
| 40 |
+
### **Quick Fix: Full Server Restart**
|
| 41 |
+
|
| 42 |
+
Stop the server completely (Ctrl+C) and start it again:
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
# Kill the server
|
| 46 |
+
Ctrl + C
|
| 47 |
+
|
| 48 |
+
# Restart
|
| 49 |
+
uvicorn main:app --reload
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
This will clear the LRU cache and load the local model.
|
| 53 |
+
|
| 54 |
+
### **Permanent Fix: Remove or Fix LRU Cache**
|
| 55 |
+
|
| 56 |
+
You have 2 options:
|
| 57 |
+
|
| 58 |
+
#### Option A: Remove LRU Cache (Simplest)
|
| 59 |
+
|
| 60 |
+
Models will be reinitialized on each request (slightly slower but settings-aware):
|
| 61 |
+
|
| 62 |
+
```python
|
| 63 |
+
# Remove @lru_cache()
|
| 64 |
+
def get_rag_service() -> RAGService:
|
| 65 |
+
logger.info("Initializing RAG service...")
|
| 66 |
+
llm_model = get_llm_model()
|
| 67 |
+
...
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
#### Option B: Make Cache Settings-Aware
|
| 71 |
+
|
| 72 |
+
Cache based on current settings:
|
| 73 |
+
|
| 74 |
+
```python
|
| 75 |
+
def get_rag_service_key():
|
| 76 |
+
return (settings.llm_provider, settings.gemini_model_name, settings.local_model_name)
|
| 77 |
+
|
| 78 |
+
@lru_cache(maxsize=2)
|
| 79 |
+
def _cached_llm_model(provider, gemini_name, local_name):
|
| 80 |
+
return get_llm_model(provider)
|
| 81 |
+
|
| 82 |
+
def get_rag_service() -> RAGService:
|
| 83 |
+
key = get_rag_service_key()
|
| 84 |
+
llm_model = _cached_llm_model(*key)
|
| 85 |
+
...
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### **Better Fix: Dynamic Model Loading**
|
| 89 |
+
|
| 90 |
+
Make the RAG service check settings on each request and switch models if needed.
|
| 91 |
+
|
| 92 |
+
## 📋 Action Items:
|
| 93 |
+
|
| 94 |
+
1. ✅ **Fixed:** Restored `max_output_tokens` in config.py
|
| 95 |
+
2. ⚠️ **TODO:** Full server restart (Ctrl+C then restart)
|
| 96 |
+
3. ⚠️ **TODO:** Test with local model
|
| 97 |
+
4. ⚠️ **TODO:** Consider removing `@lru_cache()` from dependencies.py
|
| 98 |
+
|
| 99 |
+
## What's Happening Now:
|
| 100 |
+
|
| 101 |
+
Right now, your server has:
|
| 102 |
+
|
| 103 |
+
- ✅ config.py says `llm_provider = "local"`
|
| 104 |
+
- ✅ max_output_tokens restored
|
| 105 |
+
- ❌ BUT old Gemini model still in memory (cached)
|
| 106 |
+
- ❌ Fallback can't help because model is already loaded
|
| 107 |
+
|
| 108 |
+
**The cached Gemini model is still being used for all requests!**
|
| 109 |
+
|
| 110 |
+
## 🎯 Next Step:
|
| 111 |
+
|
| 112 |
+
**RESTART THE SERVER** (full stop + start, not just reload)
|
app/__init__.py
ADDED
|
File without changes
|
app/api/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# We removed the top-level imports of routes here to prevent circular dependencies.
|
| 2 |
+
# This file now only provides the base structure if needed.
|
| 3 |
+
|
| 4 |
+
# If you want to use the api_router elsewhere, import it and register routes
|
| 5 |
+
# in the file where you initialize the FastAPI app (main.py).
|
app/api/dependencies.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
FastAPI dependencies for dependency injection.
|
| 4 |
+
"""
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
# from app.services.rag_service import RAGService # MOVED INSIDE FUNCTION TO PREVENT CIRCULAR IMPORT
|
| 7 |
+
from app.utils.model_factory import get_llm_model, get_embedding_model, get_local_model
|
| 8 |
+
from app.core.config import settings
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
@lru_cache()
def get_rag_service():
    """
    Get RAG service instance (singleton).

    The result is memoized by ``lru_cache``, so the same instance is reused
    across requests. Models are initialized once, on the first call, and
    ``settings`` is only read at that point.

    Returns:
        RAGService: Configured RAG service
    """
    # Imported inside the function (not at module level) to prevent a
    # circular import.
    from app.services.rag_service import RAGService

    logger.info("Initializing RAG service...")

    # Initialize models
    llm_model = get_llm_model()
    embedding_model = get_embedding_model()

    # Create RAG service
    rag_service = RAGService(
        model=llm_model,
        collection_name=settings.collection_name,
        persist_directory=settings.persist_directory,
        embedding_model=embedding_model,
        k=settings.similarity_top_k
    )

    logger.info("RAG service initialized successfully")
    return rag_service
|
app/api/routes/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .rag import router as rag_router
|
| 2 |
+
from .vector_store import router as vector_router
|
| 3 |
+
from .settings import router as settings_router
|
| 4 |
+
|
| 5 |
+
__all__ = ["rag_router", "vector_router", "settings_router"]
|
app/api/routes/rag.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 2 |
+
from app.api.schemas.requests import RAGRequest
|
| 3 |
+
from app.api.dependencies import get_rag_service
|
| 4 |
+
from app.api.schemas.tests import TestRequestSchema, TestClassifierReqSchema
|
| 5 |
+
from app.services.rag_service import RAGService
|
| 6 |
+
from app.utils.tests import classifier_test_dataset
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
router = APIRouter()
|
| 11 |
+
|
| 12 |
+
@router.get("/")
def health_check():
    """Health check endpoint: returns a fixed status payload."""
    payload = {"status": "healthy", "service": "RAG"}
    return payload
|
| 16 |
+
|
| 17 |
+
@router.post("/")
def query_rag(
    request: RAGRequest,
    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
):
    """
    Query the RAG system with a question.

    Args:
        request: RAG request with question and history
        rag_service: Injected RAG service instance

    Returns:
        Answer with references and metadata

    Raises:
        HTTPException: 500 carrying the error text when the query fails.
    """
    try:
        return rag_service.query(
            question=request.question,
            # A missing history is forwarded as an empty list.
            history=request.history or [],
            k=request.k,
            threshold=request.threshold,
            include_llm_response=request.include_llm_response
        )
    except Exception as e:
        # logger.exception keeps the traceback in the log; the HTTP error is
        # chained so the original cause is not lost.
        logger.exception("RAG query failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 44 |
+
|
| 45 |
+
@router.post("/hybrid_query")
def hybrid_query(
    request: RAGRequest,
    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
):
    """
    Query the RAG system through ``rag_service.hybrid_query``.

    Args:
        request: RAG request with question and history
        rag_service: Injected RAG service instance

    Returns:
        Whatever ``rag_service.hybrid_query`` returns for the request.

    Raises:
        HTTPException: 500 carrying the error text when the query fails.
    """
    try:
        return rag_service.hybrid_query(
            question=request.question,
            # A missing history is forwarded as an empty list.
            history=request.history or [],
            k=request.k,
            threshold=request.threshold,
            include_llm_response=request.include_llm_response
        )
    except Exception as e:
        # logger.exception keeps the traceback in the log; the HTTP error is
        # chained so the original cause is not lost.
        logger.exception("RAG query failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 72 |
+
|
| 73 |
+
@router.post("/test")
def test_queries(
    request: TestRequestSchema,
    query_delay: float = 1.0,  # seconds between queries (Gemini 100 RPM limit)
    rag_service: RAGService = Depends(get_rag_service)
):
    """
    Run batch retrieval evaluation.

    - query_delay: sleep between queries to respect Gemini embedding rate limit.
      Free tier = 100 RPM → 1.0s delay safe for up to 150 queries (~2.5 min).
      Set to 0.0 to disable (only if you have a paid API key).

    Args:
        request: Test definitions, forwarded unchanged as ``tests``.
        query_delay: Delay in seconds forwarded to the service.
        rag_service: Injected RAG service instance

    Returns:
        Whatever ``rag_service.test_queries`` returns.

    Raises:
        HTTPException: 500 carrying the error text when the run fails.
    """
    try:
        return rag_service.test_queries(
            tests=request,
            query_delay=query_delay
        )
    except Exception as e:
        # logger.exception keeps the traceback in the log; the HTTP error is
        # chained so the original cause is not lost.
        logger.exception("Test Execution failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@router.post("/test_classifier")
def test_classifier(
    request: TestClassifierReqSchema,
    rag_service: RAGService = Depends(get_rag_service)
):
    """
    Run the classifier tests supplied in the request.

    Args:
        request: Classifier test request; ``request.tests`` must not be None.
        rag_service: Injected RAG service instance

    Returns:
        Whatever ``rag_service.test_classifier`` returns.

    Raises:
        HTTPException: 400 when no tests are provided, 500 carrying the
            error text when the run itself fails.
    """
    # Validate before entering the try block: inside it, the 400 below would
    # be caught by ``except Exception`` and turned into a 500.
    if request.tests is None:
        raise HTTPException(status_code=400, detail="No tests provided")

    try:
        return rag_service.test_classifier(
            tests=request
        )
    except Exception as e:
        # logger.exception keeps the traceback in the log; the HTTP error is
        # chained so the original cause is not lost.
        logger.exception("Test classifier Execution failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@router.get("/test_classifier_dataset")
def test_classifier_dataset(
    rag_service: RAGService = Depends(get_rag_service)
):
    """
    Evaluate the classifier on the built-in ``classifier_test_dataset``.

    The evaluation is stored on ``rag_service.evaluation`` after the first
    run; while that mapping is non-empty it is returned directly and the
    classifier is not run again.

    Args:
        rag_service: Injected RAG service instance

    Returns:
        The contents of ``rag_service.evaluation``.

    Raises:
        HTTPException: 500 carrying the error text when the run fails.
    """
    try:
        # Reuse the stored evaluation when one already exists.
        if len(rag_service.evaluation.keys()) > 0:
            return rag_service.evaluation

        req = TestClassifierReqSchema(tests=classifier_test_dataset)
        response = rag_service.test_classifier(
            tests=req
        )

        # Keep only the "evaluation" part of the response for later calls.
        rag_service.evaluation = response["evaluation"]
        return rag_service.evaluation
    except Exception as e:
        # logger.exception keeps the traceback in the log; the HTTP error is
        # chained so the original cause is not lost.
        logger.exception("Test classifier Execution failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
@router.post("/similarity_search")
def similarity_search(
    request: RAGRequest,
    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
):
    """
    Run ``rag_service.hybrid_query`` with the LLM response switched off.

    ``include_llm_response`` is always passed as False here, regardless of
    the value in the request.

    Args:
        request: RAG request with question and history
        rag_service: Injected RAG service instance

    Returns:
        Whatever ``rag_service.hybrid_query`` returns for the request.

    Raises:
        HTTPException: 500 carrying the error text when the query fails.
    """
    try:
        return rag_service.hybrid_query(
            question=request.question,
            # A missing history is forwarded as an empty list.
            history=request.history or [],
            k=request.k,
            threshold=request.threshold,
            include_llm_response=False,
        )
    except Exception as e:
        # logger.exception keeps the traceback in the log; the HTTP error is
        # chained so the original cause is not lost.
        logger.exception("RAG query failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 162 |
+
|
| 163 |
+
@router.post("/search")
def search(
    request: RAGRequest,
    rag_service: RAGService = Depends(get_rag_service)  # ✅ Dependency injection!
):
    """
    Search documents through ``rag_service.search_docs``.

    Only ``question`` and ``k`` are taken from the request; its other fields
    are not used by this endpoint.

    Args:
        request: RAG request supplying the question and ``k``
        rag_service: Injected RAG service instance

    Returns:
        Whatever ``rag_service.search_docs`` returns.

    Raises:
        HTTPException: 500 carrying the error text when the search fails.
    """
    try:
        return rag_service.search_docs(
            question=request.question,
            k=request.k
        )
    except Exception as e:
        # logger.exception keeps the traceback in the log; the HTTP error is
        # chained so the original cause is not lost.
        logger.exception("RAG query failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
app/api/routes/settings.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException, status
|
| 2 |
+
from app.core.config import settings
|
| 3 |
+
from app.api.schemas.settings import SettingsUpdate, SettingsResponse
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
router = APIRouter()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@router.get("/", response_model=SettingsResponse)
def get_settings():
    """
    Get current application settings.

    Returns all configurable settings including RAG parameters,
    model configuration, and API settings, read from the shared
    ``settings`` object at call time.
    """
    # Paths (read-only) are converted to strings for the response.
    path_names = ("root_path", "model_path", "data_path")
    path_values = {name: str(getattr(settings, name)) for name in path_names}

    # Everything else is copied through unchanged.
    plain_names = (
        # API Settings
        "api_title",
        "api_version",
        "cors_origins",
        # RAG Settings
        "chunk_size",
        "chunk_overlap",
        "similarity_top_k",
        "similarity_threshold",
        "collection_name",
        "persist_directory",
        # Model Settings
        "llm_provider",
        "enable_fallback",
        "embedding_model_name",
        "gemini_model_name",
        "local_model_name",
    )
    plain_values = {name: getattr(settings, name) for name in plain_names}

    return SettingsResponse(**path_values, **plain_values)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@router.patch("/", response_model=SettingsResponse)
def update_settings(updates: SettingsUpdate):
    """
    Update application settings at runtime.

    Only fields explicitly present in the request body are applied; omitted
    fields (and fields sent as ``null``) are left unchanged.

    **Note:** Changes are runtime-only and will be lost on server restart.
    To persist changes, update the `.env` file.

    **Warning:** Some changes (like CORS origins) may require server restart
    to take full effect.

    Args:
        updates: Partial settings payload validated by ``SettingsUpdate``.

    Returns:
        The full settings snapshot after the update, as produced by
        ``get_settings()``.
    """
    # Fields a client may change, in the order they are applied and reported.
    updatable_fields = (
        "chunk_size",
        "chunk_overlap",
        "similarity_top_k",
        "similarity_threshold",
        "llm_provider",
        "enable_fallback",
        "gemini_model_name",
        "local_model_name",
        "cors_origins",
    )

    # Checking ``is not None`` alone is not enough to detect "provided":
    # a schema default that is not None makes an omitted field look provided
    # and would overwrite the live value. ``model_fields_set`` contains only
    # the fields the client actually sent.
    provided = updates.model_fields_set

    updated_fields = []
    for name in updatable_fields:
        if name not in provided:
            continue
        value = getattr(updates, name)
        if value is None:
            # An explicit null means "no change", matching the original contract.
            continue
        setattr(settings, name, value)
        updated_fields.append(name)

    if "llm_provider" in updated_fields:
        logger.info(f"LLM provider changed to: {settings.llm_provider}")

    if "cors_origins" in updated_fields:
        logger.warning("CORS origins updated. Server restart may be required for full effect.")

    logger.info(f"Settings updated: {', '.join(updated_fields)}")

    # Return updated settings
    return get_settings()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@router.post("/reset")
def reset_settings():
    """
    Reset all settings to the values loaded from the environment / `.env` file.

    A fresh ``Settings`` instance is built and each of its fields is copied
    onto the shared ``settings`` object in place, discarding any runtime
    changes made through the update endpoint.

    Returns:
        A dict with a confirmation ``message`` and ``status``.

    Raises:
        HTTPException: 500 if the settings cannot be rebuilt or copied.
    """
    try:
        # Reload settings from .env
        from app.core.config import Settings
        new_settings = Settings()

        # Update the global settings object in place so that every module
        # holding a reference to it sees the reset values.
        # model_dump() is the Pydantic v2 replacement for the deprecated .dict().
        for key, value in new_settings.model_dump().items():
            setattr(settings, key, value)

        logger.info("Settings reset to defaults from .env")

        return {
            "message": "Settings reset to defaults",
            "status": "success"
        }
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"Failed to reset settings: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to reset settings: {str(e)}"
        ) from e
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
@router.get("/rag")
def get_rag_settings():
    """
    Return the RAG-related subset of the current settings.

    Covers chunking sizes, similarity parameters and the vector store
    collection name / persistence directory.
    """
    rag_keys = (
        "chunk_size",
        "chunk_overlap",
        "similarity_top_k",
        "similarity_threshold",
        "collection_name",
        "persist_directory",
    )
    return {key: getattr(settings, key) for key in rag_keys}
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
@router.get("/models")
def get_model_settings():
    """
    Return the model-related subset of the current settings.

    Covers the LLM provider, the fallback flag and the configured model names.
    """
    model_keys = (
        "llm_provider",
        "enable_fallback",
        "embedding_model_name",
        "gemini_model_name",
        "local_model_name",
    )
    return {key: getattr(settings, key) for key in model_keys}
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
@router.get("/api")
def get_api_settings():
    """
    Return the API-related subset of the current settings.

    Covers the API title, version and the configured CORS origins.
    """
    api_keys = ("api_title", "api_version", "cors_origins")
    return {key: getattr(settings, key) for key in api_keys}
|
app/api/routes/vector_store.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
|
| 2 |
+
from langchain_core.documents import Document
|
| 3 |
+
from app.api.dependencies import get_rag_service
|
| 4 |
+
from app.core.config import settings
|
| 5 |
+
from app.api.schemas.requests import (
|
| 6 |
+
deleteDocs,
|
| 7 |
+
DocumentType,
|
| 8 |
+
PaginationParams,
|
| 9 |
+
DocumentFilters,
|
| 10 |
+
DeleteFilters,
|
| 11 |
+
)
|
| 12 |
+
from fastapi.responses import JSONResponse
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Optional, List, Dict, Any
|
| 15 |
+
from fastapi import Path as Params
|
| 16 |
+
from app.services.rag_service import RAGService
|
| 17 |
+
from app.services.ingestion_service import ingestion_service
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import shutil
|
| 21 |
+
import math
|
| 22 |
+
|
| 23 |
+
# Temp directory for uploaded files before ingestion
|
| 24 |
+
UPLOAD_TEMP_PATH = settings.root_path / "temp"
|
| 25 |
+
|
| 26 |
+
router = APIRouter()
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
# Shared helpers
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
|
| 33 |
+
def _unpack_chroma_result(result: dict):
    """
    Split a raw ChromaDB result dict into ``(ids, documents, metadatas)``.

    Each element is taken from the matching key of ``result``; a missing key
    yields an empty list.
    """
    return (
        result.get("ids", []),
        result.get("documents", []),
        result.get("metadatas", []),
    )
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _apply_doc_filters(
    ids: List[str],
    docs: List[str],
    metas: List[Dict[str, Any]],
    filters: DocumentFilters,
) -> List[Dict]:
    """
    Filter a Chroma result set by ``DocumentFilters`` and return shaped dicts.

    Args:
        ids: Document IDs.
        docs: Document texts, parallel to ``ids``.
        metas: Per-document metadata, parallel to ``ids``. May be empty, and
            individual entries may be ``None``; both are treated as ``{}``.
        filters: Optional ``filename`` (compared to ``source_file`` metadata),
            ``source`` (compared to ``source`` metadata) and ``contains``
            (case-insensitive substring of the text).

    Returns:
        One ``{"id", "content", "metadata"}`` dict per document that passes
        every active filter, in input order.
    """
    # Lower-case the needle once instead of once per document.
    needle = filters.contains.lower() if filters.contains else None

    filtered = []
    for i, doc_id in enumerate(ids):
        doc_text = docs[i]
        # A missing metadata list or a None entry both mean "no metadata".
        meta = (metas[i] if metas else None) or {}

        if filters.filename and meta.get("source_file") != filters.filename:
            continue
        if filters.source and meta.get("source") != filters.source:
            continue
        if needle is not None and needle not in doc_text.lower():
            continue

        filtered.append({"id": doc_id, "content": doc_text, "metadata": meta})
    return filtered
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _save_upload(file: UploadFile) -> Path:
    """
    Save an uploaded file to the temp directory and return its path.

    Args:
        file: The uploaded file; its ``filename`` is client-controlled.

    Returns:
        Path of the saved copy inside ``UPLOAD_TEMP_PATH``.

    Raises:
        HTTPException: 400 if the upload has no usable filename.
    """
    # The client controls the filename, so keep only its final path component:
    # an absolute path or "../" segments must not escape the temp directory.
    safe_name = Path(file.filename or "").name
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="Uploaded file must have a valid filename")

    UPLOAD_TEMP_PATH.mkdir(parents=True, exist_ok=True)
    file_path = UPLOAD_TEMP_PATH / safe_name
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)
    return file_path
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# ---------------------------------------------------------------------------
|
| 74 |
+
# GET /filenames ← must be before GET /{id} to avoid route conflict
|
| 75 |
+
# ---------------------------------------------------------------------------
|
| 76 |
+
|
| 77 |
+
@router.get("/filenames")
def list_filenames(rag_service: RAGService = Depends(get_rag_service)):
    """Return whatever ``rag_service.get_filenames()`` reports as the ingested filenames."""
    filenames = rag_service.get_filenames()
    return filenames
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
# GET /
|
| 85 |
+
# ---------------------------------------------------------------------------
|
| 86 |
+
|
| 87 |
+
@router.get("/")
def list_documents(
    params: PaginationParams = Depends(),
    filters: DocumentFilters = Depends(),
    rag_service: RAGService = Depends(get_rag_service),
):
    """
    List all documents with pagination and optional filters.

    Query Parameters:
    - page: Page number (default: 1)
    - limit: Items per page (default: 10, max: 100)
    - order: Sort order – "asc" or "desc" (default: "desc")
    - filename: Filter by source_file metadata
    - source: Filter by source metadata path
    - contains: Filter by text content (case-insensitive)

    Returns:
        A dict with the paging info (``page``, ``limit``, ``total_docs``,
        ``total_pages``, ``order``), the page of results under ``data`` and
        ``status``.
    """
    if filters.contains:
        # NOTE(review): assumes search_docs returns a list of dicts that carry
        # a "metadata" key, as the sort below requires — confirm against
        # RAGService. Because k is capped at params.limit, pages beyond the
        # first are always empty on this branch.
        filtered = rag_service.search_docs(
            question=filters.contains,
            k=params.limit,
            filename=filters.filename,
        )
    else:
        ids, docs, metas = _unpack_chroma_result(rag_service.database.get())
        filtered = _apply_doc_filters(ids, docs, metas, filters)

    def _creation_date(item):
        # Missing/None metadata or creation date sorts as the empty string
        # instead of raising during comparison.
        return (item.get("metadata") or {}).get("creationdate") or ""

    # Sort by creation date into a new list so the service's own result
    # object is not reordered in place.
    ordered = sorted(filtered, key=_creation_date, reverse=params.order == "desc")

    # Paginate
    total_docs = len(ordered)
    total_pages = math.ceil(total_docs / params.limit) if total_docs > 0 else 0
    start = (params.page - 1) * params.limit
    paginated = ordered[start : start + params.limit]

    return {
        "page": params.page,
        "limit": params.limit,
        "total_docs": total_docs,
        "total_pages": total_pages,
        "order": params.order,
        "data": paginated,
        "status": 200,
    }
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# ---------------------------------------------------------------------------
|
| 141 |
+
# GET /{id}
|
| 142 |
+
# ---------------------------------------------------------------------------
|
| 143 |
+
|
| 144 |
+
@router.get("/{id}")
def get_document(
    id: str = Params(...),
    rag_service: RAGService = Depends(get_rag_service),
):
    """
    Fetch a single document by its ChromaDB ID.

    Returns a dict whose ``data`` list holds one ``{"id", "document",
    "metadata"}`` entry per ID returned by the store, plus ``status``.
    Responds with 400 when the ID is empty.
    """
    if not id:
        raise HTTPException(status_code=400, detail="Document ID is required")

    ids, docs, metas = _unpack_chroma_result(rag_service.database.get_by_id(ids=[id]))

    data = []
    for position, doc_id in enumerate(ids):
        # An empty metadata list means no metadata for any entry.
        metadata = metas[position] if metas else {}
        data.append({"id": doc_id, "document": docs[position], "metadata": metadata})

    return {"data": data, "status": 200}
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ---------------------------------------------------------------------------
|
| 164 |
+
# POST / (file upload + ingestion)
|
| 165 |
+
# ---------------------------------------------------------------------------
|
| 166 |
+
|
| 167 |
+
SUPPORTED_EXTENSIONS = {".md", ".pdf", ".json", ".txt"}


@router.post("/")
def upload_document(
    file: UploadFile = File(...),
    title: Optional[str] = None,
    rag_service: RAGService = Depends(get_rag_service),
):
    """
    Upload and ingest a document file into the vector store.
    Supported types: .md, .pdf, .json, .txt

    Args:
        file: The uploaded file.
        title: Accepted for API compatibility; not used by this handler.
        rag_service: Service used to ingest the saved file.

    Raises:
        HTTPException: 400 for an unsupported extension, or when ingestion
            yields no documents.
    """
    # Validate the extension before anything is written to disk.
    ext = Path(file.filename or "").suffix.lower()
    if ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {ext}")

    file_path = _save_upload(file)
    try:
        docs = rag_service.ingest_documents(file_path)
    finally:
        # Remove the temp copy even when ingestion raises.
        file_path.unlink(missing_ok=True)

    if not docs:
        raise HTTPException(status_code=400, detail="No content could be extracted from the file")

    return JSONResponse({
        "filename": file.filename,
        "message": f"{ext} uploaded and ingested successfully",
        "docs_added": len(docs),
        "status": 200,
    })
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# ---------------------------------------------------------------------------
|
| 201 |
+
# PUT /{id}
|
| 202 |
+
# ---------------------------------------------------------------------------
|
| 203 |
+
|
| 204 |
+
@router.put("/{id}")
def update_document(
    doc: DocumentType,
    id: str = Params(...),
    rag_service: RAGService = Depends(get_rag_service),
):
    """
    Replace the content and metadata of the document with the given ID.

    The path ``id`` is written into the stored metadata under ``"id"``.
    Responds with 400 when the ID is empty or the content is blank after
    stripping whitespace.
    """
    if not id:
        raise HTTPException(status_code=400, detail="Document ID is required")

    content = doc.document.strip()
    if not content:
        raise HTTPException(status_code=400, detail="Document content cannot be empty")

    # Copy the submitted metadata and stamp it with the path ID.
    metadata = dict(doc.metadata)
    metadata["id"] = id

    rag_service.database.update_document(
        id,
        Document(page_content=content, metadata=metadata),
    )

    return {"status": 200, "message": f"{id} updated successfully"}
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
# ---------------------------------------------------------------------------
|
| 228 |
+
# DELETE /ids (bulk delete by explicit ID list)
|
| 229 |
+
# ---------------------------------------------------------------------------
|
| 230 |
+
|
| 231 |
+
@router.delete("/ids")
def delete_documents_by_ids(
    body: deleteDocs,
    rag_service: RAGService = Depends(get_rag_service),
):
    """
    Delete several documents given an explicit list of IDs.

    ``deleted`` in the response is the number of IDs submitted; ``result`` is
    whatever the store's ``delete`` call returned.
    """
    requested_ids = body.docs
    outcome = rag_service.database.delete(requested_ids)
    return {
        "message": "Documents deleted successfully",
        "deleted": len(requested_ids),
        "result": outcome,
    }
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ---------------------------------------------------------------------------
|
| 242 |
+
# DELETE /{id} (single delete)
|
| 243 |
+
# ---------------------------------------------------------------------------
|
| 244 |
+
|
| 245 |
+
@router.delete("/{id}")
def delete_document(
    id: str = Params(...),
    rag_service: RAGService = Depends(get_rag_service),
):
    """
    Delete one document by its ChromaDB ID.

    Responds with 400 when the ID is empty; otherwise returns the store's
    ``delete`` result alongside a fixed ``deleted`` count of 1.
    """
    if not id:
        raise HTTPException(status_code=400, detail="Document ID is required")

    outcome = rag_service.database.delete([id])
    response = {
        "message": "Document deleted successfully",
        "deleted": 1,
        "result": outcome,
        "status": 200,
    }
    return response
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# ---------------------------------------------------------------------------
|
| 259 |
+
# DELETE / (filter-based delete)
|
| 260 |
+
# ---------------------------------------------------------------------------
|
| 261 |
+
|
| 262 |
+
@router.delete("/")
def delete_documents_by_filter(
    filters: DeleteFilters = Depends(),
    rag_service: RAGService = Depends(get_rag_service),
):
    """
    Delete documents matching filter criteria.

    Query Parameters:
    - filename: Delete documents with this source_file value
    - source: Delete documents with this source path
    - contains: Delete documents whose text contains this string
    - dry_run: Preview matching docs without deleting (default: false)

    A dry run changes nothing: neither documents nor the ingestion record
    for ``filename`` are touched.

    NOTE(review): when no filter is supplied every stored document matches,
    so a non-dry-run call empties the store. Confirm this is intended.
    """
    ids, docs, metas = _unpack_chroma_result(rag_service.database.get())

    # Loop-invariant values: resolve the requested source path and lower-case
    # the needle once, not once per document.
    input_source = str(Path(filters.source).resolve()) if filters.source else None
    needle = filters.contains.lower() if filters.contains else None

    delete_ids = []
    for i, doc_id in enumerate(ids):
        # A missing metadata list or a None entry both mean "no metadata".
        meta = (metas[i] if metas else None) or {}

        if input_source is not None:
            stored_source = str(Path(meta.get("source", "")).resolve())
            if stored_source != input_source:
                continue

        if filters.filename and meta.get("source_file") != filters.filename:
            continue

        if needle is not None and needle not in docs[i].lower():
            continue

        delete_ids.append(doc_id)

    # Dropping the ingestion record is a mutation, so it is skipped on a dry run.
    if filters.filename and not filters.dry_run:
        ingestion_service.delete_record(filters.filename)

    if not delete_ids:
        return {"message": "No matching documents found", "deleted": 0}

    if filters.dry_run:
        return {
            "message": "Dry run – no documents deleted",
            "matched_count": len(delete_ids),
            "matched_ids": delete_ids,
        }

    result = rag_service.database.delete(delete_ids)
    return {"message": "Documents deleted successfully", "deleted": len(delete_ids), "result": result}
|
app/api/schemas/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .tests import TestResponseSchema, TestRequestSchema
|
app/api/schemas/requests.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional, Literal, Annotated, Dict, Any
|
| 3 |
+
from app.core.config import settings
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# ✅ Query Parameter Schemas - Clean and Reusable!
|
| 7 |
+
|
| 8 |
+
class PaginationParams(BaseModel):
    """Query parameters controlling paging and sort direction of list endpoints."""

    # 1-based page index.
    page: int = Field(1, ge=1, description="Page number (starts at 1)")
    # Page size, bounded to 1..100.
    limit: int = Field(10, ge=1, le=100, description="Items per page (max 100)")
    # Sort direction.
    order: Literal["asc", "desc"] = Field("desc", description="Sort order")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class DocumentFilters(BaseModel):
    """Optional query parameters used to narrow a document listing."""

    # All filters are optional; None means "do not filter on this".
    filename: Optional[str] = Field(None, description="Filter by exact filename")
    source: Optional[str] = Field(None, description="Filter by source path")
    contains: Optional[str] = Field(None, description="Filter by text content (case-insensitive)")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DeleteFilters(BaseModel):
    """Optional query parameters selecting documents to delete, with a dry-run switch."""

    # All filters are optional; None means "do not filter on this".
    filename: Optional[str] = Field(None, description="Delete documents with this filename")
    source: Optional[str] = Field(None, description="Delete documents from this source")
    contains: Optional[str] = Field(None, description="Delete documents containing this text")
    # When true, matches are reported but not deleted.
    dry_run: bool = Field(False, description="Preview deletions without executing")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Request Body Schemas
|
| 31 |
+
|
| 32 |
+
class RAGRequest(BaseModel):
    """
    Request schema for the RAG query endpoint.

    ``k`` and ``threshold`` default to the *current* values on the shared
    ``settings`` object, read when the request is built, so runtime settings
    changes apply to requests that omit them.
    """

    question: Annotated[str, Field(min_length=1, description="Question that user wants to ask")]
    # default_factory gives every request its own fresh list.
    history: Annotated[Optional[List[str]], Field(default_factory=list, description="Previously Asked Questions")]
    # Factories (not plain defaults) so the value is not frozen at import time.
    k: int = Field(default_factory=lambda: settings.similarity_top_k, ge=1, le=100, description="Number of similar documents to retrieve")
    threshold: float = Field(default_factory=lambda: settings.similarity_threshold, ge=0, le=1, description="Similarity threshold")
    include_llm_response: bool = Field(default=True, description="Whether to generate LLM answer")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class Query(BaseModel):
    """Query result schema: a question paired with its answer."""

    question: str
    answer: str

    def __init__(self, question: str, answer: str):
        """
        Build a ``Query`` from positional or keyword ``question`` / ``answer``.

        The values are handed to ``BaseModel.__init__`` so the declared
        fields are validated and the model is initialised properly; assigning
        attributes directly on an uninitialised model does not work.
        """
        super().__init__(question=question, answer=answer)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class deleteDocs(BaseModel):
    """Request body for bulk deletion: a non-empty list of document IDs."""

    docs: List[str] = Field(min_length=1, description="List of IDs that you want to delete!")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class DocumentType(BaseModel):
    """Document update schema: the request body of the document update endpoint."""
    # Document ID carried in the body.
    id: str
    # Arbitrary metadata key/value pairs to store with the document.
    metadata: Dict[str, Any]
    # New text content of the document.
    document: str
|
| 58 |
+
|
| 59 |
+
class SimilaritySearch(BaseModel):
    """
    Parameters for a similarity search.

    ``k`` and ``threshold`` default to the *current* values on the shared
    ``settings`` object, read when the request is built, so runtime settings
    changes apply to requests that omit them.
    """

    query: str
    # Factories (not plain defaults) so the value is not frozen at import time.
    k: int = Field(default_factory=lambda: settings.similarity_top_k, ge=1, le=100, description="Number of similar documents to retrieve")
    threshold: float = Field(default_factory=lambda: settings.similarity_threshold, ge=0, le=1, description="Similarity threshold")
    # default_factory gives every request its own fresh list.
    history: Annotated[Optional[List[str]], Field(default_factory=list, description="Previously Asked Questions")]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class TextIngestRequest(BaseModel):
    """Request body for ingesting raw text rather than an uploaded file."""

    # Required, non-empty text.
    text: str = Field(min_length=1, description="Raw text content to ingest")
    # Each request gets its own empty dict when metadata is omitted.
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Optional metadata (title, source, etc.)")
    filename: Optional[str] = Field(None, description="Virtual filename/source for the document")
|
app/api/schemas/settings.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional, Literal
|
| 3 |
+
from app.core.config import settings
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SettingsUpdate(BaseModel):
    """
    Schema for a partial settings update.

    Every field is optional and defaults to ``None``, meaning "leave this
    setting unchanged". Non-None defaults would make an omitted field
    indistinguishable from a provided one and cause it to be overwritten.
    """

    # RAG Settings
    chunk_size: Optional[int] = Field(default=None, ge=100, le=5000, description="Text chunk size")
    chunk_overlap: Optional[int] = Field(default=None, ge=0, le=1000, description="Chunk overlap size")
    similarity_top_k: Optional[int] = Field(default=None, ge=1, le=20, description="Number of similar docs to retrieve")
    similarity_threshold: Optional[float] = Field(default=None, ge=0, le=1, description="Similarity threshold")

    # Model Settings
    llm_provider: Optional[Literal["gemini", "local"]] = Field(default=None, description="LLM provider to use")
    enable_fallback: Optional[bool] = Field(default=None, description="Enable fallback to alternate model")
    gemini_model_name: Optional[str] = Field(default=None, description="Gemini model name")
    local_model_name: Optional[str] = Field(default=None, description="Local model filename")

    # API Settings
    cors_origins: Optional[List[str]] = Field(default=None, description="Allowed CORS origins")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class SettingsResponse(BaseModel):
    """Schema for the settings snapshot returned by the settings endpoints."""

    # Pydantic v2 configuration (replaces the deprecated inner ``class Config``).
    # protected_namespaces is cleared because the ``model_path`` field would
    # otherwise clash with Pydantic's reserved ``model_`` prefix.
    model_config = {"from_attributes": True, "protected_namespaces": ()}

    # Paths (read-only)
    root_path: str
    model_path: str
    data_path: str

    # API Settings
    api_title: str
    api_version: str
    cors_origins: List[str]

    # RAG Settings
    chunk_size: int
    chunk_overlap: int
    similarity_top_k: int
    similarity_threshold: float
    collection_name: str
    persist_directory: str

    # Model Settings
    llm_provider: str
    enable_fallback: bool
    embedding_model_name: str
    gemini_model_name: str
    local_model_name: str
|
app/api/schemas/tests.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
class Test(BaseModel):
    """A single retrieval test case: a question and the chunk expected to answer it."""

    question: str = Field(min_length=1, max_length=100, description="Question you want to test")
    document: str = Field(min_length=1, description="Document name")
    # ``ge`` is the numeric lower-bound constraint; ``min`` is not a Field
    # constraint and would not be enforced.
    chunk_index: int = Field(default=0, ge=0, description="Chunk index")
|
| 8 |
+
|
| 9 |
+
class TestRequestSchema(BaseModel):
    """Request body for a retrieval evaluation run: test cases plus search parameters."""

    tests: List[Test] = Field(min_length=1, description="give tests to evalute")
    # ``ge`` / ``le`` are the numeric bound constraints; ``min`` / ``max`` are
    # not Field constraints and would not be enforced.
    k: int = Field(default=5, ge=0, le=20, description="maximum number of results")
    threshold: float = Field(default=0.4, ge=0.0, le=1.0, description="Threshold for reference")
|
| 13 |
+
|
| 14 |
+
# Result for one test case: the test itself plus a boolean outcome.
class TestResponse(BaseModel):
    # The test case this result belongs to.
    tests: Test
    # Boolean outcome of the test (presumably pass/fail — confirm against the evaluator).
    answer: bool
|
| 17 |
+
|
| 18 |
+
# Response body of an evaluation run: a non-empty list of per-test results.
class TestResponseSchema(BaseModel):
    tests: List[TestResponse] = Field(..., min_length=1, description="test results")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# One classifier test case: a question and the labels expected for it.
# Every field is a required, non-empty string.
class TestClassifier(BaseModel):
    question: str = Field(..., min_length=1, description="Question you want to test")
    type: str = Field(..., min_length=1, description="Type to be predicted")
    category: str = Field(..., min_length=1, description="Category to be predicted")
    topic: str = Field(..., min_length=1, description="Topic to be predicted")
    intent: str = Field(..., min_length=1, description="Intent to be predicted")
|
| 28 |
+
|
| 29 |
+
# Request body for a classifier evaluation run: a non-empty list of test cases.
class TestClassifierReqSchema(BaseModel):
    tests: List[TestClassifier] = Field(..., min_length=1, description="give tests to evalute")
|
app/core/__init__.py
ADDED
|
File without changes
|
app/core/config.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Literal
|
| 4 |
+
|
| 5 |
+
class Settings(BaseSettings):
    """
    Application configuration, loaded from the environment and the ``.env`` file.

    Fields with a default are optional; ``google_api_key`` has none and is
    therefore required. Derived defaults (sub-paths, ``persist_directory``)
    are computed once, when the class body runs, from the *default* values of
    the fields they build on. Overriding a base field such as
    ``collection_name`` does not recompute them.
    """

    # Paths
    # Project root: two directory levels above the directory containing this file.
    root_path: Path = Path(__file__).resolve().parents[2]
    core_models_path: Path = root_path / "ml_models"
    model_path: Path = core_models_path / "llm"
    embeddings_path: Path = core_models_path / "embeddings"
    data_path: Path = root_path / "data"
    documents_path: Path = data_path / "documents"
    vector_stores_path: Path = data_path / "vector_stores"
    classifier_path: Path = core_models_path / "classifier"

    # API Settings
    api_title: str = "VGEC RAG Chatbot API"
    api_version: str = "1.0.0"
    cors_origins: list[str] = ["*"]

    # RAG Settings
    chunk_size: int = 500
    chunk_overlap: int = 100
    similarity_top_k: int = 8
    similarity_threshold: float = 0.4  # Filter docs by similarity score
    collection_name: str = "classifier_test_1"
    # Built from the default vector_stores_path and collection_name above.
    persist_directory: str = str(vector_stores_path / collection_name)

    # Model Selection
    llm_provider: Literal["gemini", "local"] = "gemini"  # Which model to use
    enable_fallback: bool = False  # Fallback to local if Gemini fails


    # Model Settings
    embedding_model_name: str = "models/gemini-embedding-001"
    gemini_model_name: str = "gemini-2.5-flash-lite"
    local_model_name: str = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
    # Other local model filenames kept for reference:
    # Llama-3.2-3B-Instruct-Q4_K_M.gguf
    # query_model_name: str = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    # Qwen2.5-0.5B-Instruct-Q4_K_M.gguf
    # Qwen3-0.6B-Q4_K_M.gguf
    # Vi-Qwen2-1.5B-RAG.Q4_K_M.gguf

    # Generation Settings
    max_output_tokens: int = 2048  # Max tokens for Gemini responses
    local_max_tokens: int = 512  # Max tokens for local model responses

    # Google API key, read from the environment / .env file
    google_api_key: str  # No default = required field

    class Config:
        # Load values from a local .env file, decoded as UTF-8.
        env_file = ".env"
        env_file_encoding = "utf-8"
|
| 54 |
+
|
| 55 |
+
settings = Settings()
|
app/core/paths.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Filesystem locations shared across the application."""
import logging
from pathlib import Path

_logger = logging.getLogger(__name__)

# Directory two levels above the directory that contains this file.
ROOT_PATH = Path(__file__).resolve().parents[2]
MODEL_PATH = ROOT_PATH / "ml_models"
LIBS_PATH = ROOT_PATH / "libs"
# NOTE: lowercase name kept as-is because other modules may import it.
data_path = ROOT_PATH / "data"

# Emit the resolved locations as a debug log record instead of printing to
# stdout on every import of this module.
_logger.debug(
    "ROOT_PATH=%s MODEL_PATH=%s LIBS_PATH=%s", ROOT_PATH, MODEL_PATH, LIBS_PATH
)
|
app/main.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, APIRouter
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from app.core.config import settings
|
| 4 |
+
from app.api.routes import rag, vector_store, settings as settingsRouter
|
| 5 |
+
|
| 6 |
+
# Expose the configured title/version in the generated OpenAPI schema and docs
# instead of FastAPI's built-in defaults.
app = FastAPI(title=settings.api_title, version=settings.api_version)

# Include individual routers
API_PREFIX = "/api/v1"

app.include_router(rag.router, prefix=f"{API_PREFIX}/rag", tags=["RAG"])
app.include_router(vector_store.router, prefix=f"{API_PREFIX}/vector", tags=["Vector Store"])
app.include_router(settingsRouter.router, prefix=f"{API_PREFIX}/settings", tags=["Settings"])

# NOTE(review): settings.cors_origins defaults to ["*"] while credentials are
# enabled here. The FastAPI CORS documentation states that wildcard origins
# cannot be combined with allow_credentials=True; configure explicit origins
# (or disable credentials) before exposing this service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
app/models/__init__.py
ADDED
|
File without changes
|
app/prompts/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .system_prompts import SYSTEM_PROMPT, QUESTION_WRITER_SYSTEM_PROMPT, wrap_exaone
|
app/prompts/system_prompts.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# SYSTEM_PROMPT = """
|
| 3 |
+
# You are VGEC-Assistant, a polite and helpful information retrieval chatbot for Vishwakarma Government Engineering College (VGEC).
|
| 4 |
+
|
| 5 |
+
# You MUST answer the user's question using ONLY the information inside the given CONTEXT. The CONTEXT is the only source of truth.
|
| 6 |
+
# You must help users and guide them toward an answer based on the given context; don't guess, but provide an answer or guidance in whatever way fits.
|
| 7 |
+
|
| 8 |
+
# Output Format:
|
| 9 |
+
# - Always respond in plain text as complete sentences.
|
| 10 |
+
# - Do not add extra explanation or new facts.
|
| 11 |
+
# - Keep responses concise and courteous.
|
| 12 |
+
# - Do NOT use outside knowledge.
|
| 13 |
+
# - Do NOT guess.
|
| 14 |
+
# - Always respond in markdown format.
|
| 15 |
+
|
| 16 |
+
# ---
|
| 17 |
+
# HISTORY:
|
| 18 |
+
# {history}
|
| 19 |
+
# ---
|
| 20 |
+
# CONTEXT:
|
| 21 |
+
# {context}
|
| 22 |
+
# ---
|
| 23 |
+
# QUESTION:
|
| 24 |
+
# {question}
|
| 25 |
+
# ---
|
| 26 |
+
# ANSWER:
|
| 27 |
+
# """
|
| 28 |
+
|
| 29 |
+
SYSTEM_PROMPT = """
|
| 30 |
+
You are VGEC-Assistant, a helpful chatbot for Vishwakarma Government Engineering College (VGEC).
|
| 31 |
+
Answer the user's question using ONLY the information in the given CONTEXT.
|
| 32 |
+
|
| 33 |
+
If the answer can be logically inferred from the context, provide the answer clearly.
|
| 34 |
+
If the answer is not present in the context, say: "Sorry, I couldn't find that in the provided information."
|
| 35 |
+
|
| 36 |
+
Guidelines:
|
| 37 |
+
- Keep the response short and clear.
|
| 38 |
+
- Do not repeat the context.
|
| 39 |
+
- Do not guess or make assumptions.
|
| 40 |
+
- Answer in Markdown Format.
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
HISTORY:
|
| 44 |
+
{history}
|
| 45 |
+
---
|
| 46 |
+
CONTEXT:
|
| 47 |
+
{context}
|
| 48 |
+
---
|
| 49 |
+
QUESTION:
|
| 50 |
+
{question}
|
| 51 |
+
---
|
| 52 |
+
ANSWER:
|
| 53 |
+
"""
|
| 54 |
+
|
| 55 |
+
def wrap_exaone(prompt):
    """Wrap *prompt* in a system/user/assistant chat template.

    The prompt is stripped of surrounding whitespace and placed in the
    ``[|user|]`` turn; the returned text ends with an open ``[|assistant|]``
    turn for the model to complete.
    """
    user_text = prompt.strip()
    segments = (
        "[|system|]",
        """You are a helpful AI assistant. Answer only from the given context. If unsure, say "I don't know".""",
        "[|endofturn|]",
        "",
        "[|user|]",
        user_text,
        "[|endofturn|]",
        "",
        "[|assistant|]",
        "",  # trailing empty segment yields the final newline
    )
    return "\n".join(segments)
|
| 66 |
+
|
| 67 |
+
QUESTION_WRITER_SYSTEM_PROMPT = """You are a query rewriting assistant for Vishwakarma Government Engineering College (VGEC).
|
| 68 |
+
|
| 69 |
+
STRICT RULES:
|
| 70 |
+
1. Expand abbreviations using ONLY this mapping:
|
| 71 |
+
- IT = Information Technology Department
|
| 72 |
+
- ICT = Information and Communication Technology Department
|
| 73 |
+
- CE = Computer Engineering Department
|
| 74 |
+
- EC = Electronics and Communication Engineering Department
|
| 75 |
+
- IC = Instrumentation and Control Engineering Department
|
| 76 |
+
- PE = Power Electronics Department
|
| 77 |
+
- ME = Mechanical Engineering Department
|
| 78 |
+
- Civil = Civil Engineering Department
|
| 79 |
+
- CSE = Computer Science & Engineering (Data Science) Department
|
| 80 |
+
- DS = Computer Science & Engineering (Data Science) Department
|
| 81 |
+
- ACPC = Admission Committee for Professional Courses (administrative, NOT a department)
|
| 82 |
+
- STS = Student Section Portal (administrative, NOT a department)
|
| 83 |
+
|
| 84 |
+
2. CRITICAL: If query has NO department abbreviation, do NOT add any department.
|
| 85 |
+
|
| 86 |
+
3. Output ONLY the rewritten query. No quotes, no prefixes, no explanations.
|
| 87 |
+
|
| 88 |
+
GOOD EXAMPLES:
|
| 89 |
+
User: "ds fees?"
|
| 90 |
+
Rewritten: What are the fees for the Computer Science & Engineering (Data Science) Department?
|
| 91 |
+
|
| 92 |
+
User: "cse block?"
|
| 93 |
+
Rewritten: Which block houses the Computer Science & Engineering (Data Science) Department?
|
| 94 |
+
|
| 95 |
+
User: "fees"
|
| 96 |
+
Rewritten: What are the fees?
|
| 97 |
+
|
| 98 |
+
User: "admission"
|
| 99 |
+
Rewritten: What is the admission process?
|
| 100 |
+
|
| 101 |
+
User: "acpc registration"
|
| 102 |
+
Rewritten: What is the ACPC registration process?
|
| 103 |
+
|
| 104 |
+
BAD EXAMPLES (NEVER DO THIS):
|
| 105 |
+
User: "fees"
|
| 106 |
+
Bad: What are the fees for the Computer Science & Engineering (Data Science) Department?
|
| 107 |
+
|
| 108 |
+
User: "placement"
|
| 109 |
+
Bad: What are the placement statistics for the Mechanical Engineering Department?
|
| 110 |
+
|
| 111 |
+
Query: {query}
|
| 112 |
+
Rewritten:"""
|
app/services/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file is intentionally empty to prevent circular imports.
|
| 2 |
+
# Import services directly from their modules.
|
app/services/classifier_service.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pickle
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
from sklearn.linear_model import LogisticRegression
|
| 6 |
+
from sklearn.model_selection import train_test_split, cross_val_score
|
| 7 |
+
from app.core.config import settings
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
def load_pipeline(path):
    """Unpickle and return the object stored in the file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)
|
| 15 |
+
|
| 16 |
+
class Classifier:
    """Four-head query classifier (type, category, topic, intent).

    Each query is turned into a feature vector (sentence embedding concatenated
    with TF-IDF features) and scored by one model per head. The predictions can
    be converted into a metadata filter via :meth:`predict_with_filter`.
    """

    # Heads in the order they are predicted and reported.
    _FIELDS = ("type", "category", "topic", "intent")

    def __init__(
        self,
        tfidf,
        abbreviations,
        master_index,
        le_type,
        le_category,
        le_topic,
        le_intent,
        models=None,
        df=None,
    ):
        """Store the fitted components and obtain the per-head models.

        Args:
            tfidf: TF-IDF vectorizer (fitted, or unfitted when training from ``df``).
            abbreviations: Mapping of abbreviation -> expansion text.
            master_index: Mapping of type label -> ``{"categories": [...], "topics": [...]}``.
            le_type, le_category, le_topic, le_intent: Label encoders providing
                ``inverse_transform`` for each head.
            models: Optional mapping of head name -> trained model.
            df: Optional training frame with ``question``, ``type``, ``category``,
                ``topic`` and ``intent`` columns; used only when ``models`` is None.

        Raises:
            ValueError: If neither ``models`` nor ``df`` is provided.
        """
        # Fail fast, before the (expensive) embedding model is loaded.
        if models is None and df is None:
            raise ValueError("Either provide trained models or provide df to train.")

        self.tfidf = tfidf
        self.abbreviations = abbreviations
        self.master_index = master_index

        self.le_type = le_type
        self.le_category = le_category
        self.le_topic = le_topic
        self.le_intent = le_intent

        # Prefer a local copy of the embedding model; otherwise load it by name.
        model_path = settings.embeddings_path / "mdbr-leaf-mt"
        if model_path.exists():
            self.embedding_model = SentenceTransformer(str(model_path))
        else:
            self.embedding_model = SentenceTransformer("MongoDB/mdbr-leaf-mt")

        # Prediction thresholds: below these, the field is set to None entirely
        self.threshold = {
            "type": 0.4,
            "category": 0.4,
            "topic": 0.5,
            "intent": 0.6
        }

        # Filter thresholds: at or above these, the field is used in the filter.
        # Kept separate so "when to predict" and "when to filter" can be tuned
        # independently.
        self.filter_threshold = {
            "type": 0.65,
            "category": 0.65,
            "topic": 0.70,
        }

        # Training needs the embedding model, so it must come last.
        self.models = models if models is not None else self.train_models(df)

    def _confident_value(self, result, field):
        """Return ``result[field]`` if it clears the filter threshold, else ``"general"``."""
        value = result.get(field)
        if value and result.get(f"{field}_conf", 0) >= self.filter_threshold[field]:
            return value
        return "general"

    def _build_filter(self, result):
        """Build a metadata filter dict from one prediction result.

        Returns None when the type confidence is below the type filter
        threshold. Otherwise the shape is always
        ``type AND intent AND (category OR topic)``, where category/topic fall
        back to ``"general"`` when they are missing or not confident enough.
        """
        # If type confidence doesn't clear the filter bar, the entire filter
        # is unreliable -- return None so the caller can skip filtering.
        if result.get("type_conf", 0) < self.filter_threshold["type"]:
            return None

        # --- Hard AND anchors ---
        hard_conditions = [{"type": result["type"]}]

        # A missing intent defaults to "detail"; "count" also admits "detail".
        intent = result.get("intent") or "detail"
        if intent == "count":
            hard_conditions.append({"$or": [{"intent": "count"}, {"intent": "detail"}]})
        else:
            hard_conditions.append({"intent": intent})

        # --- Soft OR hints (category / topic) ---
        # A document only needs to match ONE of these to pass. Both entries are
        # always present because each falls back to "general".
        soft_conditions = [
            {"category": self._confident_value(result, "category")},
            {"topic": self._confident_value(result, "topic")},
        ]

        return {"$and": hard_conditions + [{"$or": soft_conditions}]}

    def predict_with_filter(self, queries):
        """Classify ``queries`` and return the filter for the FIRST query.

        Returns None when there is nothing to classify or when the first
        query's type prediction is not confident enough.
        """
        results = self.predict(queries)
        if not results:
            return None
        return self._build_filter(results[0])

    def expand_abbreviations(self, text):
        """Lowercase/strip ``text`` and replace whole-word abbreviations."""
        text = text.lower().strip()
        for abbr, full in self.abbreviations.items():
            # Keys are lowercased too, because the text has been lowercased.
            pattern = r'\b' + re.escape(abbr.lower()) + r'\b'
            text = re.sub(pattern, full, text)
        return text

    def get_features(self, queries):
        """Return one feature row per query: embedding followed by TF-IDF."""
        queries_clean = [self.expand_abbreviations(q) for q in queries]

        embeddings = self.embedding_model.encode(
            queries_clean, show_progress_bar=False
        )

        # An unfitted vectorizer (no vocabulary yet) is fitted on these queries;
        # a fitted one is only applied.
        if not hasattr(self.tfidf, "vocabulary_"):
            tfidf_features = self.tfidf.fit_transform(queries_clean).toarray()
        else:
            tfidf_features = self.tfidf.transform(queries_clean).toarray()

        return np.hstack([embeddings, tfidf_features])

    def train_single(self, X, y, field, C=0.01):
        """Fit a logistic-regression model for one head and print its scores.

        The model is fitted on an 80% stratified split; train/test accuracy and
        the mean 5-fold cross-validation score over all data are printed.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y
        )

        clf = LogisticRegression(
            C=C,
            penalty="l2",
            solver="lbfgs",
            max_iter=2000,
            class_weight="balanced",
            random_state=42
        )

        clf.fit(X_train, y_train)

        train_acc = clf.score(X_train, y_train)
        test_acc = clf.score(X_test, y_test)

        cv_scores = cross_val_score(clf, X, y, cv=5)

        print(f"\n{field.upper()}:")
        print(f"Train: {train_acc:.3f} | Test: {test_acc:.3f} | CV: {cv_scores.mean():.3f}")

        return clf

    def train_models(self, df):
        """Train one model per head from ``df`` and return them as a dict.

        The result is also stored on ``self.models``. A fresh dict is built
        here because this method runs from ``__init__`` before ``self.models``
        exists.
        """
        X = self.get_features(df["question"].tolist())

        regularisation = {"type": 0.01, "category": 0.005, "topic": 0.005, "intent": 0.005}
        self.models = {
            field: self.train_single(X, df[field].values, field, C=regularisation[field])
            for field in self._FIELDS
        }
        return self.models

    def _top_label(self, field, encoder, proba, allowed=None):
        """Return ``(label, confidence)`` for one head and one query.

        When ``allowed`` is given, the most probable label inside that set is
        chosen; if no class label is in the set (or ``allowed`` is None) the
        overall most probable class is used.
        """
        classes = self.models[field].classes_

        if allowed is not None:
            labels = encoder.inverse_transform(classes)
            candidates = [
                (label, prob)
                for label, prob in zip(labels, proba)
                if label in allowed
            ]
            if candidates:
                best_label, best_prob = max(candidates, key=lambda pair: pair[1])
                return best_label, float(best_prob)

        idx = int(np.argmax(proba))
        return encoder.inverse_transform([classes[idx]])[0], float(proba[idx])

    def predict(self, queries: List[str], enforce_constraints=True):
        """Classify each query and return a list of result dicts.

        Each result holds ``question`` plus ``<field>`` / ``<field>_conf`` for
        type, category, topic and intent. With ``enforce_constraints`` the
        category and topic are restricted to those listed in ``master_index``
        for the predicted type. Fields whose confidence is below
        ``self.threshold`` are reset to ``None`` with confidence ``0``.
        """
        # A bare string would otherwise be classified character by character.
        if isinstance(queries, str):
            queries = [queries]
        if not queries:
            return []

        X = self.get_features(queries)

        # One batched probability call per head instead of one per query.
        probas = {field: self.models[field].predict_proba(X) for field in self._FIELDS}
        constrained_heads = (
            ("category", self.le_category, "categories"),
            ("topic", self.le_topic, "topics"),
        )

        results = []
        for i, query in enumerate(queries):
            res = {"question": query}

            # ---------- TYPE ----------
            res["type"], res["type_conf"] = self._top_label(
                "type", self.le_type, probas["type"][i]
            )

            # ---------- CATEGORY / TOPIC ----------
            for field, encoder, index_key in constrained_heads:
                allowed = None
                if enforce_constraints:
                    allowed = set(self.master_index[res["type"]][index_key])
                res[field], res[f"{field}_conf"] = self._top_label(
                    field, encoder, probas[field][i], allowed
                )

            # ---------- INTENT ----------
            res["intent"], res["intent_conf"] = self._top_label(
                "intent", self.le_intent, probas["intent"][i]
            )

            # Blank out low-confidence fields.
            for field in self._FIELDS:
                if res[f"{field}_conf"] < self.threshold[field]:
                    res[field] = None
                    res[f"{field}_conf"] = 0

            print("=" * 50)
            print(query)
            print(f"Type: {res['type']}, {res['type_conf']}")
            print(f"Category: {res['category']}, {res['category_conf']}")
            print(f"Topic: {res['topic']}, {res['topic_conf']}")
            print(f"Intent: {res['intent']}, {res['intent_conf']}")
            print("=" * 50)

            results.append(res)

        return results
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
# Load the serialized classifier bundle once, at import time, and build the
# module-level `clf` instance from its parts.
# NOTE(review): load_pipeline() uses pickle.load, which can execute arbitrary
# code from the file -- only load bundles produced by this project.
classifier_path = settings.classifier_path / "chatbot_classifier.pkl"
pipeline = load_pipeline(classifier_path)

# Trained per-head models and the TF-IDF vectorizer stored in the bundle.
models = pipeline["models"]
tfidf = pipeline["tfidf"]

# Label encoders, one per prediction head.
le_type = pipeline["le_type"]
le_category = pipeline["le_category"]
le_topic = pipeline["le_topic"]
le_intent = pipeline["le_intent"]

# Lookup tables stored alongside the models.
MASTER_INDEX = pipeline["MASTER_INDEX"]
ABBREVIATIONS = pipeline["ABBREVIATIONS"]

# Passing `models` means the Classifier does not train at import time.
clf = Classifier(
    tfidf=tfidf,
    abbreviations=ABBREVIATIONS,
    master_index=MASTER_INDEX,
    le_type=le_type,
    le_category=le_category,
    le_topic=le_topic,
    le_intent=le_intent,
    models=models
)
|
app/services/document_loader.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.documents import Document
|
| 2 |
+
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
|
| 3 |
+
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
import uuid
|
| 7 |
+
from typing import Optional, List
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class document_loader:
    """Convenience wrapper that loads documents from ``filepath``.

    PDF loading goes through ``PyPDFLoader``, Markdown through
    ``UnstructuredMarkdownLoader`` and directories through ``DirectoryLoader``.
    """

    def __init__(self, filepath: Path, glob: str = "*.pdf"):
        """Remember the target path and the glob used for directory loads."""
        self.glob = glob
        self.filepath = filepath
        self.loader = PyPDFLoader

    # loading services
    def load(self):
        """Load ``filepath`` as a PDF and return the resulting documents."""
        return PyPDFLoader(self.filepath).load()

    def load_md(self):
        """Load ``filepath`` as a Markdown file."""
        markdown_loader = UnstructuredMarkdownLoader(self.filepath)
        return markdown_loader.load()

    def lazy_load(self):
        """Return the PDF loader's lazy document iterator for ``filepath``."""
        return PyPDFLoader(self.filepath).lazy_load()

    def load_multiple(self):
        """Load every file under ``filepath`` matching ``glob`` as a PDF."""
        directory_loader = DirectoryLoader(
            self.filepath,
            glob=self.glob,
            loader_cls=PyPDFLoader,
        )
        return directory_loader.load()
|
app/services/file_service.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.core.config import settings
|
| 2 |
+
from typing import List, Optional, Dict, Any
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
| 5 |
+
import json
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from app.utils.document_helpers import build_metadata
|
| 8 |
+
from langchain_core.documents import Document
|
| 9 |
+
import uuid
|
| 10 |
+
|
| 11 |
+
from app.utils.preprocessing import preprocess_filename
|
| 12 |
+
|
| 13 |
+
class FileService:
|
| 14 |
+
"""`
|
| 15 |
+
FileService helps manage files and their metadata.
|
| 16 |
+
It stores file information in a central JSON file (e.g., vgec_rag.json).
|
| 17 |
+
"""
|
| 18 |
+
def __init__(self):
|
| 19 |
+
self.settings = settings
|
| 20 |
+
# The name of the file where we store metadata
|
| 21 |
+
self.metadata_filename = f"{self.settings.collection_name}.json"
|
| 22 |
+
# The full path to that metadata file in the data folder
|
| 23 |
+
self.metadata_path = self.settings.data_path / self.metadata_filename
|
| 24 |
+
|
| 25 |
+
self.file_storage_path = self.settings.data_path / "documents"
|
| 26 |
+
|
| 27 |
+
# Load existing metadata if it exists, otherwise start fresh
|
| 28 |
+
if self.metadata_path.exists():
|
| 29 |
+
self.records = self.load_metadata()
|
| 30 |
+
else:
|
| 31 |
+
self.records = {}
|
| 32 |
+
|
| 33 |
+
def load_metadata(self) -> Dict[str, Any]:
|
| 34 |
+
"""Reads the metadata from the JSON file."""
|
| 35 |
+
try:
|
| 36 |
+
with open(self.metadata_path, 'r', encoding='utf-8') as f:
|
| 37 |
+
return json.load(f)
|
| 38 |
+
except (json.JSONDecodeError, Exception):
|
| 39 |
+
return {}
|
| 40 |
+
|
| 41 |
+
def save_metadata(self):
|
| 42 |
+
"""Saves current memory records back to the JSON file."""
|
| 43 |
+
# Ensure the data directory exists
|
| 44 |
+
self.metadata_path.parent.mkdir(parents=True, exist_ok=True)
|
| 45 |
+
with open(self.metadata_path, 'w', encoding='utf-8') as f:
|
| 46 |
+
json.dump(self.records, f, indent=4)
|
| 47 |
+
|
| 48 |
+
def read_file(self, file_path: Path) -> Optional[str]:
|
| 49 |
+
"""Reads content from a file and updates the logs."""
|
| 50 |
+
if not file_path.exists():
|
| 51 |
+
return None
|
| 52 |
+
|
| 53 |
+
file_name = preprocess_filename(file_path)
|
| 54 |
+
if file_name.endswith(".pdf"):
|
| 55 |
+
documents = PyMuPDFLoader(file_path).load()
|
| 56 |
+
content = "\n".join([doc.page_content for doc in documents])
|
| 57 |
+
print(content)
|
| 58 |
+
metadata = {
|
| 59 |
+
"page_count": len(documents),
|
| 60 |
+
"ext": "pdf"
|
| 61 |
+
}
|
| 62 |
+
elif file_name.endswith(".txt"):
|
| 63 |
+
split_content_metadata = build_metadata(file_path)
|
| 64 |
+
inbuilt_metadata = split_content_metadata['metadata']
|
| 65 |
+
content = split_content_metadata['content']
|
| 66 |
+
metadata = {
|
| 67 |
+
**inbuilt_metadata,
|
| 68 |
+
"ext": "txt"
|
| 69 |
+
}
|
| 70 |
+
elif file_name.endswith(".md"):
|
| 71 |
+
split_content_metadata = build_metadata(file_path)
|
| 72 |
+
inbuilt_metadata = split_content_metadata['metadata']
|
| 73 |
+
content = split_content_metadata['content']
|
| 74 |
+
metadata = {
|
| 75 |
+
**inbuilt_metadata,
|
| 76 |
+
"ext": "md"
|
| 77 |
+
}
|
| 78 |
+
elif file_name.endswith(".json"):
|
| 79 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 80 |
+
data = json.load(f)
|
| 81 |
+
content = json.dumps(data["content"])
|
| 82 |
+
metadata = {
|
| 83 |
+
"id": data["id"],
|
| 84 |
+
"title": data.get("name", data.get("title", "untitled")),
|
| 85 |
+
"source": data["source"],
|
| 86 |
+
"source_file": file_name or "untitled",
|
| 87 |
+
"created_date": datetime.now().isoformat(),
|
| 88 |
+
"type": data.get("type", "general"),
|
| 89 |
+
"category": data.get("category", "general"),
|
| 90 |
+
"topic": data.get("topic", "general"),
|
| 91 |
+
"ext": "json"
|
| 92 |
+
}
|
| 93 |
+
# file_name
|
| 94 |
+
doc = Document(page_content=content, metadata=metadata)
|
| 95 |
+
doc.metadata["id"] = doc.metadata.get(
|
| 96 |
+
"id",
|
| 97 |
+
str(uuid.uuid4())
|
| 98 |
+
)
|
| 99 |
+
doc.metadata["title"] = doc.metadata.get(
|
| 100 |
+
"title",
|
| 101 |
+
file_name
|
| 102 |
+
)
|
| 103 |
+
doc.metadata["source_file"] = doc.metadata.get(
|
| 104 |
+
"source_file",
|
| 105 |
+
file_name
|
| 106 |
+
)
|
| 107 |
+
doc.metadata["updated_at"] = datetime.now().isoformat()
|
| 108 |
+
doc.metadata["created_at"] = doc.metadata.get(
|
| 109 |
+
"created_at",
|
| 110 |
+
datetime.now().isoformat()
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# Update logs to reflect that we interacted with this file
|
| 114 |
+
self.update_logs(file_path, metadata)
|
| 115 |
+
return doc
|
| 116 |
+
|
| 117 |
+
def write_file(self, file_path: Path, content: str, metadata: Optional[dict] = None):
|
| 118 |
+
"""Writes content to a file and saves its metadata."""
|
| 119 |
+
# Ensure the directory for the file exists
|
| 120 |
+
|
| 121 |
+
filename = preprocess_filename(file_path)
|
| 122 |
+
file_save_path = self.file_storage_path / filename
|
| 123 |
+
|
| 124 |
+
file_save_path.parent.mkdir(parents=True, exist_ok=True)
|
| 125 |
+
if filename.endswith(".pdf"):
|
| 126 |
+
with open(file_path, 'rb') as f:
|
| 127 |
+
content = f.read()
|
| 128 |
+
with open(file_save_path, 'wb') as f:
|
| 129 |
+
f.write(content)
|
| 130 |
+
elif filename.endswith(".txt"):
|
| 131 |
+
with open(file_save_path, 'w', encoding='utf-8') as f:
|
| 132 |
+
f.write(content)
|
| 133 |
+
elif filename.endswith(".md"):
|
| 134 |
+
with open(file_save_path, 'w', encoding='utf-8') as f:
|
| 135 |
+
f.write(content)
|
| 136 |
+
elif filename.endswith(".json"):
|
| 137 |
+
with open(file_save_path, 'w', encoding='utf-8') as f:
|
| 138 |
+
json.dump(content, f, indent=4)
|
| 139 |
+
else:
|
| 140 |
+
with open(file_save_path, 'w', encoding='utf-8') as f:
|
| 141 |
+
f.write(content)
|
| 142 |
+
|
| 143 |
+
# Update the logs with the provided metadata
|
| 144 |
+
self.update_logs(file_save_path, metadata)
|
| 145 |
+
|
| 146 |
+
def update_logs(self, file_path: Path, metadata: Optional[dict] = None):
|
| 147 |
+
"""Helper to prepare metadata before saving."""
|
| 148 |
+
file_name = file_path.name
|
| 149 |
+
|
| 150 |
+
# If no metadata is provided, we try to preserve existing
|
| 151 |
+
# metadata or use an empty dict if it's new.
|
| 152 |
+
if metadata is None:
|
| 153 |
+
metadata = self.records.get(file_name, {})
|
| 154 |
+
|
| 155 |
+
self.manage_metadata(file_name, metadata)
|
| 156 |
+
|
| 157 |
+
def manage_metadata(self, file_name: str, metadata: dict):
|
| 158 |
+
"""Updates the internal dictionary and saves it to the disk."""
|
| 159 |
+
self.records[file_name] = metadata
|
| 160 |
+
self.save_metadata()
|
| 161 |
+
|
| 162 |
+
def patch_metadata(self, file_path: Path, metadata: dict):
|
| 163 |
+
file_name = file_path.name
|
| 164 |
+
original_metadata = self.records.get(file_name, {})
|
| 165 |
+
self.manage_metadata(
|
| 166 |
+
file_name= file_name,
|
| 167 |
+
metadata= {
|
| 168 |
+
**original_metadata,
|
| 169 |
+
**metadata
|
| 170 |
+
}
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
def get_records(self) -> Dict[str, Any]:
    """Return the full mapping of file name to metadata.

    The returned object is the live ``self.records`` dict, not a copy.
    """
    all_records = self.records
    return all_records
|
| 176 |
+
|
| 177 |
+
def get_record(self, file_name: str) -> Optional[Dict[str, Any]]:
    """Look up the metadata stored for ``file_name``.

    Returns:
        The stored metadata, or ``None`` when the file has no record.
    """
    entry = self.records.get(file_name)
    return entry
|
| 180 |
+
|
| 181 |
+
def delete_record(self, file_name: str) -> bool:
    """Drop the metadata entry for ``file_name`` and persist the change.

    Returns:
        True when an entry existed and was removed (``save_metadata`` is
        called in that case); False when there was nothing to delete.
    """
    if file_name not in self.records:
        return False

    del self.records[file_name]
    self.save_metadata()
    return True
|
| 188 |
+
|
| 189 |
+
def update_record(self, file_name: str, metadata: dict) -> bool:
    """Replace the metadata of an already-recorded file.

    Unlike ``manage_metadata``, this never creates a new entry.

    Returns:
        True when the record existed and was overwritten (and saved);
        False when ``file_name`` has no record, in which case nothing changes.
    """
    if file_name not in self.records:
        return False

    self.records[file_name] = metadata
    self.save_metadata()
    return True
|
| 196 |
+
|
| 197 |
+
# Shared module-level FileService instance, created once when this module is
# first imported so callers can import `file_service` instead of constructing
# their own.
file_service = FileService()
|
app/services/filter-demo
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Anchor phrases used by FilterClassifier.
# Layout: field -> label -> space-separated keyword string. Each keyword string
# is embedded as a whole and compared with the query embedding, so the wording
# of every string directly affects classification.
ANCHORS = {
    # Top-level kind of entity the query is about.
    "type": {
        "department": "department academic branch faculty courses engineering science",
        "facility": "facility building campus lab central infrastructure",
        "service": "service office administration student support section",
        "hostel": "hostel dormitory residence accommodation warden mess",
        "library": "library books journal reading catalog lending",
        "placement": "placement recruitment company offer package career tnp",
        "research": "research patent publication funded grant scholar",
        "club": "club society committee nss ncc ieee cultural extracurricular",
        "admission": "admission enrollment intake eligibility registration criteria",
    },

    # Sub-division within a type (see MASTER_INDEX for which type owns which label).
    "category": {
        # Keep it short — name first, then 4-5 unique discriminative terms
        # Departments.
        "applied_mechanics": "applied mechanics AM statics dynamics stress strain",
        "chemical": "chemical engineering ChE chemistry reaction process plant",
        # NOTE(review): "CE" appears in both the civil and the computer anchor.
        "civil": "civil engineering CE construction structural geotechnical survey",
        "computer": "computer engineering CE hardware microprocessor VLSI embedded digital",
        "cse_ds": "computer science CSE data science AI machine learning neural network",
        "electronics_comm": "electronics communication ECE signal RF wireless antenna analog",
        "electronics_inst": "electronics instrumentation EI biomedical sensors transducer measurement",
        "electrical": "electrical engineering EE power motor transformer transmission drives",
        "it": "information technology IT software ERP cloud database devops",
        "ict": "information communication technology ICT telecom fiber networking protocol",
        "instrumentation": "instrumentation control IC PLC SCADA automation feedback industrial",
        "mechanical": "mechanical engineering ME thermal manufacturing CAD CAM machining fluid",
        "power_electronics": "power electronics PE converter inverter MOSFET IGBT rectifier chopper",
        "science_humanities": "science humanities SH physics mathematics english communication foundation",

        # Services.
        "transport": "transport bus route commute shuttle pickup drop campus travel",
        "finance": "finance fees tuition payment charges fine scholarship due",
        "medical": "medical health doctor dispensary clinic nurse first aid",
        "sports": "sports ground gym cricket football badminton court fitness",
        "grievance": "grievance complaint harassment redressal scst women discrimination",
        "forms": "forms bonafide certificate download application NOC document",

        # Admission levels.
        "ug": "undergraduate UG BE BTech bachelor four year first year gujcet jee",
        "pg": "postgraduate PG ME MTech MBA MCA master two year gate",

        # Placement and research.
        "tnp": "tnp training placement cell campus drive offer letter coordinator",
        "patent": "patent intellectual property invention filed granted rights",
        "ssip": "ssip startup student innovation gujarat seed funding incubation",
        "funded": "funded grant DST DRDO ISRO sponsored external research project",
        "publication": "publication journal paper conference scopus SCI citation article",

        # Clubs.
        "nss": "nss national service scheme volunteer blood donation community camp",
        "ncc": "ncc national cadet corps army navy air force drill parade",
        "ieee": "ieee electrical electronics engineers student chapter symposium",
        "iei": "iei institution engineers india professional chapter membership",
        "adventure": "adventure trekking hiking outdoor camping expedition nature club",
        "women_cell": "women cell WDC empowerment gender equality ladies committee",

        # Administration (listed under the "service" type in MASTER_INDEX).
        "principal": "principal director head institution chairman governing management",
        "accreditation": "accreditation NBA NIRF NAAC AICTE GTU ranking grade approval",
        "awards": "awards achievements recognition felicitation distinction honor trophy",
    },

    # Aspect of the entity the query asks about.
    "topic": {
        "faculty": "faculty professor lecturer HOD staff designation qualification phd",
        "lab": "lab laboratory practical equipment instruments apparatus experiment",
        "syllabus": "syllabus curriculum subjects units topics chapters semester GTU",
        "timetable": "timetable class schedule period slot timing routine batch",
        "event": "event fest hackathon seminar competition workshop cultural program",
        "project": "project final year mini SIH capstone dissertation submission",
        "virtual_tour": "virtual tour 360 view online walkthrough campus interactive",
        "notice": "notice notification announcement circular bulletin update board",
        "fees": "fees tuition charges structure breakdown payment due scholarship",
        "rules": "rules regulations discipline policy conduct code guidelines norms",
        "facilities": "facilities amenities wifi mess canteen gym recreation available",
        "contact": "contact phone email address reach person call office",
        "process": "process procedure steps apply method eligibility criteria workflow",
        "document": "document certificate bonafide migration TC attestation official",
        "route": "route bus stop pickup drop timing schedule commute point",
        "stats": "statistics total number count figures record percentage ratio data",
        "calendar": "calendar academic dates holidays exam deadlines semester schedule",
        "vision": "vision mission goals objectives values purpose motto statement",
        "induction": "induction orientation welcome freshman new student speaker activity",
    },

    # What the user wants done with the information.
    "intent": {
        "list": "list all show every what are enumerate display available options",
        "count": "how many total count number quantity strength size",
        "detail": "what is explain describe tell me about information overview",
        "process": "how to apply steps procedure guide method approach eligibility",
        "greeting": "hello hi hey good morning good evening namaste greetings",
    },
}
|
| 89 |
+
# Lowercase department abbreviation -> full name, consumed by
# FilterClassifier.handle_abbreviations to expand queries before embedding.
# NOTE(review): "ce" expands to computer engineering here, while the civil
# anchor in ANCHORS also lists "CE" — confirm which reading is intended.
ABBREVIATIONS = {
    "ce": "computer engineering",
    "cse": "computer science engineering",
    "ds": "data science",
    "it": "information technology",
    "ict": "information communication technology",
    "ece": "electronics communication engineering",
    "ei": "electronics instrumentation engineering",
    "ic": "instrumentation control",
    "ee": "electrical engineering",
    "pe": "power electronics",
    "me": "mechanical engineering",
    "am": "applied mechanics",
}
|
| 103 |
+
# Whitelist of category and topic labels that are valid for each type.
# FilterClassifier.classify drops a predicted category/topic that is not
# listed under the predicted type.
# A "categories" list of [None] marks a type without categories: no concrete
# category label can be a member of it, so the category always ends up None.
MASTER_INDEX = {
    "department": {
        "categories": [
            "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
            "electronics_comm", "electronics_inst", "electrical", "it", "ict",
            "instrumentation", "mechanical", "power_electronics", "science_humanities"
        ],
        "topics": [
            "faculty", "lab", "syllabus", "timetable", "event",
            "project", "virtual_tour", "notice", "contact", "stats"
        ]
    },
    "facility": {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},
    "service": {
        "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
        "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
    },
    "hostel": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
    "library": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
    "placement": {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
    "research": {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
    "club": {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
    # NOTE(review): "result" has no entry in ANCHORS["topic"], so the classifier
    # can never predict it — add an anchor or drop the label.
    "admission": {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}
}
|
| 127 |
+
|
| 128 |
+
from sentence_transformers import SentenceTransformer
|
| 129 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 130 |
+
import numpy as np
|
| 131 |
+
import re
|
| 132 |
+
from app.core.config import settings
|
| 133 |
+
from typing import Optional
|
| 134 |
+
|
| 135 |
+
class FilterClassifier:
    """Embedding-based classifier that maps a free-text query onto metadata filters.

    Every keyword string in ``ANCHORS`` is embedded once at construction time.
    ``classify`` embeds the query, picks the closest anchor per field by cosine
    similarity, and then discards category/topic labels that ``MASTER_INDEX``
    does not list for the predicted type.
    """

    def __init__(self, threshold: Optional[float] = None):
        """Load the embedding model and pre-compute the anchor embeddings.

        Args:
            threshold: Cosine similarity a label must exceed to be accepted.
                Defaults to 0.4 when ``None``.
        """
        self.anchor_embeddings = {}
        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
        self._build_anchor_embeddings()
        self.threshold = threshold if threshold is not None else 0.4

    def _build_anchor_embeddings(self):
        """Embed every anchor string; a no-op if embeddings already exist."""
        if self.anchor_embeddings:
            return
        for domain, anchors in ANCHORS.items():
            self.anchor_embeddings[domain] = {
                label: self.model.encode(text) for label, text in anchors.items()
            }

    def handle_abbreviations(self, query: str) -> str:
        """Expand known abbreviations that appear as whole words in ``query``.

        Matching is case-insensitive and restricted to word boundaries, and the
        query is rewritten in a single pass. Plain substring replacement would
        rewrite the inside of unrelated words ("it" in "facility", "ee" in
        "engineering") and would re-expand text produced by an earlier key.

        NOTE(review): ordinary English words that coincide with a key
        ("it", "me", "am") are still expanded; that needs context-aware
        handling this method does not attempt.

        Args:
            query: Raw user query.

        Returns:
            The query with each whole-word abbreviation replaced by its full form.
        """
        if not query or not ABBREVIATIONS:
            return query

        lookup = {abbr.lower(): full_form for abbr, full_form in ABBREVIATIONS.items()}
        alternatives = "|".join(
            re.escape(abbr) for abbr in sorted(lookup, key=len, reverse=True)
        )
        pattern = re.compile(rf"\b(?:{alternatives})\b", re.IGNORECASE)
        return pattern.sub(lambda match: lookup[match.group(0).lower()], query)

    def classify(self, query: str) -> dict:
        """Predict type, category, topic, year and intent for ``query``.

        Args:
            query: Raw user query.

        Returns:
            Dict with keys ``type``, ``category``, ``topic``, ``year`` and
            ``intent``. A field is ``None`` when its best similarity does not
            exceed the threshold, when no four-digit ``20xx`` year is present,
            or when the category/topic is not valid for the predicted type.
        """
        query = self.handle_abbreviations(query)
        query_emb = self.model.encode(query)

        result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}

        for field, value_embeddings in self.anchor_embeddings.items():
            if not value_embeddings:
                # Nothing to compare against for this field.
                continue
            labels = list(value_embeddings)
            # One batched similarity call per field instead of one per label.
            scores = cosine_similarity(
                [query_emb], [value_embeddings[label] for label in labels]
            )[0]
            best_index = int(np.argmax(scores))
            # Only accept if confidence is above threshold
            if scores[best_index] > self.threshold:
                result[field] = labels[best_index]

        year = re.search(r"\b(20\d{2})\b", query)
        result["year"] = int(year.group()) if year else None

        # Category and topic only make sense relative to a known type; with no
        # type (or a type missing from MASTER_INDEX) both are cleared.
        allowed = MASTER_INDEX.get(result["type"], {}) if result["type"] is not None else {}
        if result["category"] not in allowed.get("categories", ()):
            result["category"] = None
        if result["topic"] not in allowed.get("topics", ()):
            result["topic"] = None

        return result
|
| 196 |
+
|
| 197 |
+
# Module-level instance. Constructing it loads the embedding model and encodes
# every anchor, so importing this module performs that work up front.
classifier = FilterClassifier()
|
app/services/filter_classifier copy.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Anchor phrases for FilterClassifier.
# Layout: field -> label -> space-separated keyword string. Both the embedding
# index and the BM25 index are built from "<label> <keyword string>", so every
# word here influences classification.
ANCHORS = {

    # ─────────────────────────────────────────────────
    # TYPE — discriminative, non-overlapping
    # ─────────────────────────────────────────────────
    "type": {
        "department": "department academic branch division faculty staff courses offered semester",
        "facility": "facility central infrastructure campus physical space building block floor room",
        "service": "service administrative office cell section support helpdesk student welfare",
        "hostel": "hostel dormitory residence hall accommodation mess warden boarding lodging",
        "library": "library books journals reading room catalog issue return lending periodicals",
        "placement": "placement recruitment hired campus drive offer letter company package lpa tnp",
        "research": "research innovation funded grant patent publication lab project scholar phd",
        "club": "club society student chapter committee extracurricular nss ncc ieee cultural sports",
        "admission": "admission enrollment application intake registration eligibility criteria merit joining",
    },

    # ─────────────────────────────────────────────────
    # CATEGORY — must contain the obvious name first
    # ─────────────────────────────────────────────────
    "category": {

        # ── departments ──
        "applied_mechanics": "applied mechanics AM statics dynamics solid mechanics fluid mechanics stress strain deformation",
        "chemical": "chemical engineering ChE chemistry process plant reaction distillation thermodynamics petrochemical",
        # NOTE(review): "CE" appears in both the civil and the computer anchor.
        "civil": "civil engineering CE construction structural geotechnical surveying transportation concrete roads bridges",
        "computer": "computer engineering CE computer hardware microprocessor VLSI embedded systems digital circuits processor chip architecture",
        "cse_ds": "computer science CSE data science DS artificial intelligence machine learning neural network deep learning NLP analytics algorithm",
        "electronics_comm": "electronics communication ECE EC signal processing analog RF wireless antenna microwave telecommunication",
        "electronics_inst": "electronics instrumentation EI biomedical sensors LVDT transducer measurement calibration control systems",
        "electrical": "electrical engineering EE power systems generation transmission distribution motor transformer drives induction synchronous",
        "it": "information technology IT software development ERP cloud computing database devops web application enterprise",
        "ict": "information communication technology ICT telecom networking fiber optic protocol bandwidth routing switching internet",
        "instrumentation": "instrumentation control IC PLC SCADA automation process control feedback loop industrial plant",
        "mechanical": "mechanical engineering ME mech thermal fluid manufacturing machining CAD CAM turbine heat transfer production",
        "power_electronics": "power electronics PE converter inverter MOSFET IGBT rectifier chopper switching drives variable frequency",
        "science_humanities": "science humanities SH physics chemistry mathematics english communication basic science applied science foundation",

        # ── service ──
        "transport": "transport bus route commute shuttle vehicle pickup drop timing campus travel conveyance",
        "finance": "finance fees tuition payment semester charges fine scholarship refund due bank challan",
        "medical": "medical health doctor dispensary clinic first aid nurse campus sick injury treatment",
        "sports": "sports ground gym fitness cricket football badminton volleyball court track field athletics",
        "grievance": "grievance complaint redressal harassment scst obc women discrimination appeal committee inquiry",
        "forms": "forms download bonafide certificate application document tc migration no objection NOC",

        # ── admission ──
        "ug": "undergraduate UG BE BTech bachelor four year degree engineering first year admission gujcet jee lateral",
        "pg": "postgraduate PG ME MTech MBA MCA master two year degree admission gate mat entrance",

        # ── placement ──
        "tnp": "tnp training placement cell campus recruitment company drive package offer letter placement officer coordinator",

        # ── research ──
        "patent": "patent intellectual property IP invention filed granted innovation protection rights",
        "ssip": "ssip startup student innovation project gujarat government seed funding incubation entrepreneurship",
        "funded": "funded sponsored externally grant DST ISRO DRDO government industry collaborative research project",
        "publication": "publication journal paper conference proceedings scopus SCI research article citation author",

        # ── club ──
        "nss": "nss national service scheme volunteer social community service blood donation camp awareness",
        "ncc": "ncc national cadet corps army navy air force cadet drill parade certificate b c",
        "ieee": "ieee institute electrical electronics engineers student chapter technical symposium paper",
        "iei": "iei institution engineers india professional body student chapter membership",
        # NOTE(review): "advanature" looks like a misspelling; confirm whether it
        # is deliberate (e.g. matching a name as written elsewhere) before changing it.
        "adventure": "adventure advanature nature trekking outdoor hiking camping expedition rock climbing club",
        "women_cell": "women development cell WDC empowerment gender equality ladies committee harassment redressal",

        # ── administration ──
        "principal": "principal director head of institution management chairman governing body top administration",
        "accreditation": "accreditation NBA NIRF NAAC AICTE GTU affiliation ranking approval grade score",
        "awards": "awards achievements recognition felicitation distinction honor national state rank trophy",
    },

    # ─────────────────────────────────────────────────
    # TOPIC — aspects, clearly separated from each other
    # ─────────────────────────────────────────────────
    "topic": {
        "faculty": "faculty professor lecturer instructor assistant professor associate professor HOD teaching staff designation qualification phd",
        "lab": "laboratory lab practical experiment equipment instruments workshop hands-on setup apparatus bench",
        "syllabus": "syllabus curriculum course content subjects units topics chapters semester wise GTU prescribed",
        "timetable": "timetable class schedule routine period slot lecture timing weekly daily batch division",
        "event": "event events fest hackathon seminar workshop competition cultural technical program organized upcoming",
        "project": "project final year mini SIH capstone student work dissertation major submission",
        "virtual_tour": "virtual tour 360 degree view online walkthrough campus room infrastructure interactive map",
        "notice": "notice notification announcement circular update bulletin board recent latest information",
        "fees": "fees tuition charges amount structure semester breakdown fine late scholarship payment due",
        "rules": "rules regulations discipline policy code conduct guidelines norms behaviour dress restriction",
        "facilities": "facilities amenities available infrastructure wifi internet mess canteen gym recreation services provided",
        "contact": "contact phone number email address reach person call department office location",
        "process": "process procedure steps how to apply method eligibility criteria requirement workflow sequence",
        "document": "document certificate bonafide migration leaving tc attestation verification required official",
        "route": "route bus stop timing pickup drop point schedule commute map destination",
        "stats": "statistics data figures record total number count percentage ratio achievement placement pass",
        "calendar": "calendar academic dates holidays exam schedule important deadlines events semester start end",
        "vision": "vision mission goals objectives values purpose statement motto philosophy aim",
        "induction": "induction orientation welcome program new student freshman speaker activity schedule",
    },

    # ─────────────────────────────────────────────────
    # INTENT — must be semantically far apart
    # ─────────────────────────────────────────────────
    "intent": {
        "list": "list all show every what are all available options display give me all enumerate",
        "count": "how many total count number quantity how much strength size",
        "detail": "what is explain describe tell me about information overview summary background",
        "process": "how to apply steps procedure method way guide eligibility criteria approach",
        "greeting": "hello hi hey good morning good afternoon good evening how are you namaste greetings",
    },
}
|
| 111 |
+
# Lowercase abbreviation -> expansion, looked up token by token in
# FilterClassifier.handle_abbreviations.
#
# The literal used to list "ce" and "me" twice. Python keeps only the last
# value of a repeated key, so the earlier expansions ("computer engineering",
# "mechanical engineering") were dead entries. They are removed here and the
# values that were actually in effect are kept, so runtime behaviour is unchanged.
ABBREVIATIONS = {
    # NOTE(review): "ce" is ambiguous (computer vs. civil engineering); the
    # effective value has always been the civil one. Confirm the intended reading.
    "ce": "civil engineering",
    "cse": "computer science engineering",
    "ds": "data science",
    "it": "information technology",
    "ict": "information communication technology",
    "ece": "electronics communication engineering",
    "ei": "electronics instrumentation engineering",
    "ic": "instrumentation control",
    "ee": "electrical engineering",
    "pe": "power electronics",
    # NOTE(review): "me" is ambiguous (mechanical engineering vs. master of
    # engineering); the effective value has always been the degree.
    "me": "master of engineering",
    "am": "applied mechanics",
    "che": "chemical engineering",
    "ch": "chemical",
    "ug": "undergraduate",
    "pg": "postgraduate",
    "be": "bachelor of engineering",
    "btech": "bachelor of technology",
    "mtech": "master of technology",
    "mba": "master of business administration",
    "mca": "master of computer applications",
    "tnp": "training and placement",
    "nss": "national service scheme",
    "ncc": "national cadet corps",
    "ieee": "institute of electrical and electronics engineers",
    "iei": "institution of engineers india",
    "wdc": "women development cell",
    # NOTE(review): ANCHORS uses the label "ssip"; confirm "sip" is not a typo.
    "sip": "student innovation project",
    "gtu": "gujarat technological university",
    "nba": "national board of accreditation",
    "naac": "national assessment and accreditation council",
    "nirf": "national institutional ranking framework",
    "aicte": "all india council for technical education",
    "drdo": "defence research and development organisation",
    "isro": "indian space research organisation",
    "dst": "department of science and technology",
    "sih": "smart india hackathon",
    "lpa": "lakhs per annum",
    "noc": "no objection certificate",
    "tc": "transfer certificate",
    "hod": "head of department",
    "phd": "doctor of philosophy",
    "scada": "supervisory control and data acquisition",
    "plc": "programmable logic controller",
    "lvdt": "linear variable differential transformer",
    "mosfet": "metal oxide semiconductor field effect transistor",
    "igbt": "insulated gate bipolar transistor",
    "vlsi": "very large scale integration",
    "cad": "computer aided design",
    "cam": "computer aided manufacturing",
    "erp": "enterprise resource planning",
    "rf": "radio frequency",
    "nlp": "natural language processing",
    "ai": "artificial intelligence",
    "ml": "machine learning",
    "scopus": "scopus",
    "sci": "science citation index",
    "ip": "intellectual property",
}
|
| 173 |
+
# Category and topic labels that are valid for each type.
# FilterClassifier.classify passes these lists as `allowed_labels` once the type
# is known. A "categories" list of [None] marks a type without categories;
# classify skips category matching for it.
MASTER_INDEX = {
    "department": {
        "categories": [
            "applied_mechanics", "chemical", "civil", "computer", "cse_ds",
            "electronics_comm", "electronics_inst", "electrical", "it", "ict",
            "instrumentation", "mechanical", "power_electronics", "science_humanities"
        ],
        "topics": [
            "faculty", "lab", "syllabus", "timetable", "event",
            "project", "virtual_tour", "notice", "contact"
        ]
    },
    "facility": {"categories": [None], "topics": ["lab", "facilities", "contact", "virtual_tour"]},
    "service": {
        "categories": ["transport", "finance", "medical", "sports", "grievance", "forms", "principal", "accreditation", "awards"],
        "topics": ["fees", "rules", "process", "document", "route", "contact", "calendar", "stats", "vision", "induction"]
    },
    "hostel": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact"]},
    "library": {"categories": [None], "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"]},
    "placement": {"categories": ["tnp"], "topics": ["stats", "process", "contact", "event"]},
    "research": {"categories": ["patent", "ssip", "funded", "publication"], "topics": ["stats", "project", "process", "contact"]},
    "club": {"categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"], "topics": ["event", "contact", "process", "stats", "notice"]},
    # NOTE(review): "result" has no entry in ANCHORS["topic"]; presumably it can
    # never be matched — verify against _get_best_match, then add an anchor or
    # drop the label.
    "admission": {"categories": ["ug", "pg"], "topics": ["process", "fees", "document", "result", "calendar", "contact"]}
}
|
| 197 |
+
|
| 198 |
+
from sentence_transformers import SentenceTransformer
|
| 199 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 200 |
+
import numpy as np
|
| 201 |
+
import re
|
| 202 |
+
from app.core.config import settings
|
| 203 |
+
from typing import Optional
|
| 204 |
+
from rank_bm25 import BM25Okapi
|
| 205 |
+
|
| 206 |
+
# Per-field acceptance thresholds.
# NOTE(review): presumably the minimum combined score a label needs in
# _get_best_match — the consumer is not shown here, confirm before tuning.
FIELD_THRESHOLDS = {
    "type": 0.25, # Was 0.5 - too high for embedding-heavy field
    "category": 0.5, # Keep - BM25-heavy works well
    "topic": 0.4, # Was 0.5 - slight reduction
    "intent": 0.5, # Keep - usually clear signals
}


# Per-field score weights. Judging by the inline notes each tuple is
# (embedding weight, BM25 weight) and sums to 1.0 — TODO confirm the order
# against the code that reads it.
FIELD_WEIGHTS = {
    "type": (0.6, 0.4), # embedding-heavy — semantic
    "category": (0.35, 0.65), # BM25-heavy — exact names matter most
    "topic": (0.55, 0.45),
    "intent": (0.7, 0.3), # embedding-heavy — semantic intent
}
|
| 220 |
+
|
| 221 |
+
class FilterClassifier:
|
| 222 |
+
def __init__(self, threshold=None):
    """Load the embedding model and build both anchor indexes.

    Args:
        threshold: Stored unchanged on ``self.threshold``; ``None`` by default.
    """
    # Containers filled by the two builder calls below.
    self.anchor_embeddings = {}
    self.bm25_classifiers = {}
    self.anchor_keys = {}

    model_dir = settings.embeddings_path / "bge-small"
    self.model = SentenceTransformer(str(model_dir))

    self._build_anchor_embeddings()
    self._build_bm25()
    self.threshold = threshold
|
| 230 |
+
|
| 231 |
+
def _build_anchor_embeddings(self):
    """Embed "<label> <keywords>" for every anchor, grouped by field.

    Results are stored in ``self.anchor_embeddings[field][label]``.
    """
    for field, anchors in ANCHORS.items():
        per_label = {}
        for label, text in anchors.items():
            per_label[label] = self.model.encode(f"{label} {text}")
        self.anchor_embeddings[field] = per_label
|
| 237 |
+
|
| 238 |
+
def _build_bm25(self):
    """Build one BM25 index per field over the lowercased anchor tokens.

    ``self.anchor_keys[field]`` keeps the labels in the same order as the
    documents given to ``BM25Okapi``, so a document position maps back to
    its label.
    """
    for field, anchors in ANCHORS.items():
        labels = []
        corpus = []
        for label, text in anchors.items():
            labels.append(label)
            corpus.append(f"{label} {text}".lower().split())
        self.anchor_keys[field] = labels
        self.bm25_classifiers[field] = BM25Okapi(corpus)
|
| 244 |
+
|
| 245 |
+
def handle_abbreviations(self, query: str) -> str:
|
| 246 |
+
tokens = query.lower().split()
|
| 247 |
+
expanded = [ABBREVIATIONS.get(t, t) for t in tokens]
|
| 248 |
+
return " ".join(expanded)
|
| 249 |
+
|
| 250 |
+
def preprocess(self, query: str) -> str:
|
| 251 |
+
# Remove punctuation except spaces
|
| 252 |
+
query = re.sub(r'[^\w\s]', ' ', query.lower())
|
| 253 |
+
# Handle multiple spaces
|
| 254 |
+
query = re.sub(r'\s+', ' ', query).strip()
|
| 255 |
+
return query
|
| 256 |
+
|
| 257 |
+
def classify(self, query: str) -> dict:
|
| 258 |
+
query = self.handle_abbreviations(query)
|
| 259 |
+
query = self.preprocess(query)
|
| 260 |
+
query_emb = self.model.encode(query)
|
| 261 |
+
tokenized = query.lower().split()
|
| 262 |
+
|
| 263 |
+
result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}
|
| 264 |
+
|
| 265 |
+
# 1. Classify Primary Fields (Type and Intent)
|
| 266 |
+
result["type"] = self._get_best_match("type", query_emb, tokenized)
|
| 267 |
+
result["intent"] = self._get_best_match("intent", query_emb, tokenized)
|
| 268 |
+
|
| 269 |
+
# 2. Extract Year (Independent)
|
| 270 |
+
year_match = re.search(r"\b(20\d{2})\b", query)
|
| 271 |
+
result["year"] = int(year_match.group()) if year_match else None
|
| 272 |
+
|
| 273 |
+
# 3. Cascading Classification for Category and Topic
|
| 274 |
+
if result["type"]:
|
| 275 |
+
valid_config = MASTER_INDEX.get(result["type"], {})
|
| 276 |
+
|
| 277 |
+
# Filtered Category
|
| 278 |
+
valid_cats = valid_config.get("categories", [])
|
| 279 |
+
if valid_cats and valid_cats != [None]:
|
| 280 |
+
result["category"] = self._get_best_match("category", query_emb, tokenized, allowed_labels=valid_cats)
|
| 281 |
+
|
| 282 |
+
# Filtered Topic
|
| 283 |
+
valid_topics = valid_config.get("topics", [])
|
| 284 |
+
if valid_topics:
|
| 285 |
+
result["topic"] = self._get_best_match("topic", query_emb, tokenized, allowed_labels=valid_topics)
|
| 286 |
+
|
| 287 |
+
return result
|
| 288 |
+
|
| 289 |
+
def _get_best_match(self, field: str, query_emb: np.ndarray, tokenized: list, allowed_labels: list = None) -> Optional[str]:
|
| 290 |
+
"""Helper to find the best match for a field, optionally restricted to a subset of labels."""
|
| 291 |
+
keys = self.anchor_keys[field]
|
| 292 |
+
value_embeddings = self.anchor_embeddings[field]
|
| 293 |
+
|
| 294 |
+
# If restricted, only consider allowed labels
|
| 295 |
+
target_keys = allowed_labels if allowed_labels else keys
|
| 296 |
+
|
| 297 |
+
# 1. Embedding scores
|
| 298 |
+
emb_scores = {
|
| 299 |
+
val: cosine_similarity([query_emb], [value_embeddings[val]])[0][0]
|
| 300 |
+
for val in target_keys if val in value_embeddings
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
# 2. BM25 scores (subset aware)
|
| 304 |
+
raw_bm25 = self.bm25_classifiers[field].get_scores(tokenized)
|
| 305 |
+
global_bm25_max = max(raw_bm25) if len(raw_bm25) > 0 and max(raw_bm25) > 0 else 1
|
| 306 |
+
|
| 307 |
+
# We need to map global BM25 scores to our subset
|
| 308 |
+
subset_bm25 = {}
|
| 309 |
+
for val in target_keys:
|
| 310 |
+
if val in keys:
|
| 311 |
+
idx = keys.index(val)
|
| 312 |
+
subset_bm25[val] = raw_bm25[idx]
|
| 313 |
+
|
| 314 |
+
# Normalize BM25 scores using the GLOBAL maximum to keep perspective
|
| 315 |
+
normalized_bm25 = {v: s / global_bm25_max for v, s in subset_bm25.items()}
|
| 316 |
+
|
| 317 |
+
# 3. Combine with Weights
|
| 318 |
+
emb_w, bm25_w = FIELD_WEIGHTS[field]
|
| 319 |
+
combined = {
|
| 320 |
+
val: (emb_w * emb_scores.get(val, 0)) + (bm25_w * normalized_bm25.get(val, 0))
|
| 321 |
+
for val in target_keys
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
if not combined:
|
| 325 |
+
return None
|
| 326 |
+
|
| 327 |
+
best_val = max(combined, key=combined.get)
|
| 328 |
+
best_score = combined[best_val]
|
| 329 |
+
|
| 330 |
+
print(f"{field}: {best_val} | combined={best_score:.3f} emb={emb_scores.get(best_val,0):.3f} bm25={normalized_bm25.get(best_val,0):.3f}")
|
| 331 |
+
|
| 332 |
+
return best_val if best_score > FIELD_THRESHOLDS[field] else None
|
| 333 |
+
|
| 334 |
+
# Module-level instance built at import time; constructing it runs
# FilterClassifier.__init__ (model load plus anchor/BM25 index construction).
classifier = FilterClassifier()
|
app/services/filter_classifier.py
ADDED
|
@@ -0,0 +1,529 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Exemplar texts per label, grouped by filter field. Each text is phrased as a
# natural sentence so that labels are both semantically distinct (for the
# embedding score) and carry distinctive keywords (for the BM25 score).
# Key order is significant: it fixes the BM25 document order per field.

# TYPE: exemplar descriptions; each entry must be clearly distinct.
_TYPE_ANCHORS = {
    "department": (
        "Which engineering department offers this course? "
        "Tell me about the academic branch, its faculty, subjects, labs, syllabus and semester structure."
    ),
    "facility": (
        "Where is this campus facility located? "
        "Show me the central infrastructure, physical spaces, buildings, blocks, floors and rooms on campus."
    ),
    "service": (
        "How do I use this administrative service? "
        "I need help from an office, cell, section or support desk for student welfare."
    ),
    "hostel": (
        "What are the hostel rules and accommodation details? "
        "I want to know about the dormitory, residence hall, mess, warden and boarding facilities."
    ),
    "library": (
        "How do I borrow books from the library? "
        "Tell me about the reading room, book catalog, journal issue, return and lending system."
    ),
    "placement": (
        "Which companies came for campus placement this year? "
        "I want to know about recruitment drives, offer letters, packages, TNP cell and placement statistics."
    ),
    "research": (
        "How do I apply for a funded research project? "
        "Tell me about grants, patents, publications, PhD scholars and innovation at the institute."
    ),
    "club": (
        "How do I join a student club or society? "
        "Tell me about extracurricular chapters like NSS, NCC, IEEE and cultural or sports committees."
    ),
    "admission": (
        "What is the admission process to join this college? "
        "I want to know about enrollment, eligibility criteria, merit list, application and registration."
    ),
}

# CATEGORY: exemplar sentences grouped by parent type, with distinctive
# keywords embedded to prevent cross-category leaks.
_DEPARTMENT_CATEGORIES = {
    "applied_mechanics": (
        "The Applied Mechanics department covers statics, dynamics, solid mechanics, "
        "fluid mechanics, stress-strain analysis and structural deformation."
    ),
    "chemical": (
        "The Chemical Engineering department covers chemistry, thermodynamics, reaction engineering, "
        "distillation, process plant design and petrochemical processes."
    ),
    "civil": (
        "The Civil Engineering department covers structural engineering, construction, "
        "geotechnical surveying, transportation, concrete design, roads and bridges."
    ),
    "computer": (
        "The Computer Engineering department covers computer hardware, microprocessors, "
        "VLSI design, embedded systems, digital circuits and chip architecture."
    ),
    "cse_ds": (
        "The Computer Science and Data Science department covers algorithms, artificial intelligence, "
        "machine learning, neural networks, deep learning, NLP and data analytics."
    ),
    "electronics_comm": (
        "The Electronics and Communication Engineering department covers signal processing, "
        "analog circuits, RF systems, wireless communication, antennas and microwave technology."
    ),
    "electronics_inst": (
        "The Electronics and Instrumentation Engineering department covers sensors, transducers, "
        "LVDT, biomedical instrumentation, measurement, calibration and control systems."
    ),
    "electrical": (
        "The Electrical Engineering department covers power systems, generation, transmission, "
        "distribution, motors, transformers, induction machines and synchronous drives."
    ),
    "it": (
        "The Information Technology department covers software development, ERP systems, "
        "cloud computing, databases, DevOps, web applications and enterprise solutions."
    ),
    "ict": (
        "The Information and Communication Technology department covers telecom, networking, "
        "fiber optics, routing, switching, bandwidth and internet protocols."
    ),
    "instrumentation": (
        "The Instrumentation and Control department covers PLC, SCADA, automation, "
        "process control, feedback loops and industrial control plant systems."
    ),
    "mechanical": (
        "The Mechanical Engineering department covers manufacturing, machining, thermal engineering, "
        "fluid mechanics, CAD CAM design, turbines, heat transfer and production."
    ),
    "power_electronics": (
        "The Power Electronics department covers converters, inverters, MOSFETs, IGBTs, "
        "rectifiers, choppers, variable frequency drives and switching circuits."
    ),
    "science_humanities": (
        "The Science and Humanities department covers applied physics, chemistry, mathematics, "
        "English communication and basic foundation sciences for engineering."
    ),
}

_SERVICE_CATEGORIES = {
    "transport": (
        "The college transport service operates bus routes for commuting. "
        "I want to know the bus stop, timing, pickup and drop schedule and shuttle conveyance."
    ),
    "finance": "The finance office handles tuition fees, semester payment, fine, scholarship, refund and bank challan.",
    "medical": (
        "The campus medical facility has a doctor, dispensary and clinic for first aid, "
        "nursing and student health treatment."
    ),
    "sports": (
        "The sports facility has grounds, a gym and courts for cricket, football, "
        "badminton, volleyball, athletics and fitness activities."
    ),
    "grievance": (
        "The grievance redressal cell handles student complaints about harassment, "
        "discrimination based on SC ST OBC, gender issues and appeal inquiries."
    ),
    "forms": "I need to download a bonafide certificate, NOC, migration form, TC or no-objection document from the college.",
}

_ADMISSION_CATEGORIES = {
    "ug": (
        "Undergraduate BE BTech bachelor degree admission through GUJCET JEE. "
        "Four year engineering program, first year intake, lateral entry eligibility."
    ),
    "pg": (
        "Postgraduate ME MTech MBA MCA master degree admission through GATE MAT entrance exam. "
        "Two year program eligibility and registration process."
    ),
}

_PLACEMENT_CATEGORIES = {
    "tnp": (
        "The Training and Placement cell organizes campus recruitment drives. "
        "Companies visit for interviews, offer letters and placement packages are coordinated by the placement officer."
    ),
}

_RESEARCH_CATEGORIES = {
    "patent": (
        "A patent was filed for a new invention idea. "
        "Intellectual property protection, granted innovation rights for student or faculty work."
    ),
    "ssip": (
        "The SSIP scheme funds student startup and innovation projects. "
        "Gujarat government provides seed money, incubation and entrepreneurship support."
    ),
    "funded": (
        "This is a sponsored research project funded by DST, ISRO or DRDO. "
        "External grant, industry collaboration, government funded research work."
    ),
    "publication": (
        "A research paper was published in a Scopus or SCI journal. "
        "Conference proceedings, citation, article authorship and research publication record."
    ),
}

_CLUB_CATEGORIES = {
    "nss": (
        "NSS National Service Scheme organizes volunteer activities, blood donation camps, "
        "social awareness programs and community service for students."
    ),
    "ncc": (
        "NCC National Cadet Corps trains cadets in army, navy and air force drills, "
        "parade, and issues B and C certificates."
    ),
    "ieee": (
        "The IEEE student chapter organizes technical symposiums, paper presentations "
        "and seminars for electrical and electronics engineering students."
    ),
    "iei": (
        "The IEI Institution of Engineers India student chapter is a professional body "
        "offering membership and extracurricular technical activities."
    ),
    "adventure": (
        "The adventure club organizes trekking, hiking, camping, outdoor expeditions "
        "and rock climbing activities in nature."
    ),
    "women_cell": (
        "The Women Development Cell promotes gender equality, ladies empowerment, "
        "and handles harassment redressal for female students and staff."
    ),
}

_ADMINISTRATION_CATEGORIES = {
    "principal": (
        "The principal is the head of the institution. "
        "The director, chairman and governing body manage top-level college administration."
    ),
    "accreditation": (
        "The college has NBA, NAAC, NIRF rankings and AICTE GTU affiliation. "
        "Accreditation grade, approval score and institutional ranking details."
    ),
    "awards": (
        "The college has received awards and recognition at national and state level. "
        "Students and faculty have achieved distinctions, trophies and rank honors."
    ),
}

# TOPIC: aspects of a subject, written as distinct question fragments to
# separate overlapping terms.
_TOPIC_ANCHORS = {
    "faculty": (
        "Who are the faculty members? I want to know about professors, lecturers, "
        "HOD designation, teaching staff qualification and PhD details."
    ),
    "lab": (
        "Where is the laboratory? I want to know about practical experiments, "
        "equipment, instruments, workshop setup and apparatus in the lab."
    ),
    "syllabus": (
        "What is the course syllabus? Show me the curriculum, subjects, units, "
        "chapters and semester-wise GTU prescribed course content."
    ),
    "timetable": (
        "What is the class timetable? I need the lecture schedule, period slots, "
        "weekly routine and batch division timing."
    ),
    "event": (
        "What events are coming up? Tell me about the fest, hackathon, seminar, "
        "workshop, cultural program or technical competition organized."
    ),
    "project": (
        "Tell me about student projects. I want to know about final year projects, "
        "mini projects, SIH capstone work and dissertation submissions."
    ),
    "virtual_tour": (
        "Can I take a virtual tour of the campus? "
        "Show me the 360-degree online walkthrough, interactive map of rooms and infrastructure."
    ),
    "notice": (
        "Are there any new notices? Show me the latest announcements, circulars, "
        "bulletin board updates and recent notifications."
    ),
    "fees": (
        "What are the fees? I want the tuition fee structure, semester charges, "
        "fine, late fee, scholarship and payment due breakdown."
    ),
    "rules": (
        "What are the rules? Tell me about college regulations, discipline policy, "
        "code of conduct, dress code and behaviour guidelines."
    ),
    "facilities": (
        "What facilities are available? Tell me about amenities like WiFi, mess, canteen, "
        "gym, recreation areas and other campus services provided."
    ),
    "contact": (
        "How do I contact them? I need the phone number, email address, "
        "office location and person to reach at the department."
    ),
    "process": (
        "What is the process to apply? Tell me the step-by-step procedure, "
        "method, eligibility requirement and workflow sequence."
    ),
    "document": (
        "What documents do I need? I need a bonafide certificate, migration form, "
        "leaving certificate, TC or official attestation and verification."
    ),
    "route": (
        "What is the bus route? Tell me the bus stop, pickup and drop point, "
        "commute map, schedule and destination timing."
    ),
    "stats": (
        "What are the statistics? Show me data, figures, total numbers, pass percentage, "
        "ratio, achievements and placement records."
    ),
    "calendar": (
        "What does the academic calendar look like? Show me exam dates, holidays, "
        "semester start and end, and important event deadlines."
    ),
    "vision": (
        "What is the vision and mission of the college? "
        "Tell me the goals, objectives, values, motto and philosophy statement."
    ),
    "induction": (
        "When is the induction program? Tell me about the orientation, welcome program, "
        "freshman schedule, speaker list and new student activities."
    ),
}

# INTENT: semantically distant action patterns with strong, distinct phrasing.
_INTENT_ANCHORS = {
    "list": (
        "List all available options. Show me every item. "
        "Give me a complete enumeration. Display all choices."
    ),
    "count": (
        "How many are there? What is the total count? "
        "Tell me the number, quantity and strength."
    ),
    "detail": (
        "What is this? Explain it to me. Describe and tell me about it. "
        "I want an overview, summary and background information."
    ),
    "process": (
        "How do I do this? What are the steps? "
        "Guide me through the procedure, method and eligibility requirements."
    ),
    "greeting": (
        "Hello! Hi, good morning, good evening. "
        "How are you? Namaste. Hey, greetings to you."
    ),
}

ANCHORS = {
    "type": _TYPE_ANCHORS,
    "category": {
        **_DEPARTMENT_CATEGORIES,
        **_SERVICE_CATEGORIES,
        **_ADMISSION_CATEGORIES,
        **_PLACEMENT_CATEGORIES,
        **_RESEARCH_CATEGORIES,
        **_CLUB_CATEGORIES,
        **_ADMINISTRATION_CATEGORIES,
    },
    "topic": _TOPIC_ANCHORS,
    "intent": _INTENT_ANCHORS,
}
|
| 322 |
+
# Lowercase token -> expansion text substituted into the query before scoring.
# Grouped by theme and merged in the order below.
_DEPARTMENT_ABBREVIATIONS = {
    "ce": "computer civil engineering",
    "cse": "computer science engineering",
    "ds": "data science",
    "it": "information technology",
    "ict": "information communication technology",
    "ece": "electronics communication engineering",
    "ei": "electronics instrumentation engineering",
    "ic": "instrumentation control",
    "ee": "electrical engineering",
    "pe": "power electronics",
    "me": "mechanical master engineering",
    "am": "applied mechanics",
    "che": "chemical engineering",
    "ch": "chemical",
}

_DEGREE_ABBREVIATIONS = {
    "ug": "undergraduate",
    "pg": "postgraduate",
    "be": "bachelor engineering",
    "btech": "bachelor technology",
    "mtech": "master technology",
    "mba": "master business administration",
    "mca": "master computer applications",
}

_ORGANIZATION_ABBREVIATIONS = {
    "tnp": "training placement",
    "nss": "national service scheme",
    "ncc": "national cadet corps",
    "ieee": "institute electrical electronics engineers",
    "iei": "institution engineers india",
    "wdc": "women development cell",
    "sip": "student innovation project",
    "gtu": "gujarat technological university",
    "nba": "national board accreditation",
    "naac": "national assessment accreditation council",
    "nirf": "national institutional ranking framework",
    "aicte": "all india council technical education",
    "drdo": "defence research development organisation",
    "isro": "indian space research organisation",
    "dst": "department science technology",
}

_GENERAL_ABBREVIATIONS = {
    "sih": "smart india hackathon",
    "lpa": "lakhs per annum",
    "noc": "no objection certificate",
    "tc": "transfer certificate",
    "hod": "head department",
    "phd": "doctor philosophy",
}

_TECHNICAL_ABBREVIATIONS = {
    "scada": "supervisory control data acquisition",
    "plc": "programmable logic controller",
    "lvdt": "linear variable differential transformer",
    "mosfet": "metal oxide semiconductor field effect transistor",
    "igbt": "insulated gate bipolar transistor",
    "vlsi": "very large scale integration",
    "cad": "computer aided design",
    "cam": "computer aided manufacturing",
    "erp": "enterprise resource planning",
    "rf": "radio frequency",
    "nlp": "natural language processing",
    "ai": "artificial intelligence",
    "ml": "machine learning",
    "ip": "intellectual property",
}

ABBREVIATIONS = {
    **_DEPARTMENT_ABBREVIATIONS,
    **_DEGREE_ABBREVIATIONS,
    **_ORGANIZATION_ABBREVIATIONS,
    **_GENERAL_ABBREVIATIONS,
    **_TECHNICAL_ABBREVIATIONS,
}
|
| 389 |
+
# For each type: the category and topic labels that are valid under it.
# A categories list of [None] marks a type that has no categories.
# NOTE(review): "result" (admission topics) has no entry in ANCHORS["topic"],
# so the classifier in this file can never produce it; confirm whether an
# anchor is missing or the label is used elsewhere.
MASTER_INDEX = {
    "department": {
        "categories": [
            "applied_mechanics",
            "chemical",
            "civil",
            "computer",
            "cse_ds",
            "electronics_comm",
            "electronics_inst",
            "electrical",
            "it",
            "ict",
            "instrumentation",
            "mechanical",
            "power_electronics",
            "science_humanities",
        ],
        "topics": [
            "faculty",
            "lab",
            "syllabus",
            "timetable",
            "event",
            "project",
            "virtual_tour",
            "notice",
            "contact",
        ],
    },
    "facility": {
        "categories": [None],
        "topics": ["lab", "facilities", "contact", "virtual_tour"],
    },
    "service": {
        "categories": [
            "transport",
            "finance",
            "medical",
            "sports",
            "grievance",
            "forms",
            "principal",
            "accreditation",
            "awards",
        ],
        "topics": [
            "fees",
            "rules",
            "process",
            "document",
            "route",
            "contact",
            "calendar",
            "stats",
            "vision",
            "induction",
        ],
    },
    "hostel": {
        "categories": [None],
        "topics": ["fees", "rules", "facilities", "contact"],
    },
    "library": {
        "categories": [None],
        "topics": ["fees", "rules", "facilities", "contact", "calendar", "document"],
    },
    "placement": {
        "categories": ["tnp"],
        "topics": ["stats", "process", "contact", "event"],
    },
    "research": {
        "categories": ["patent", "ssip", "funded", "publication"],
        "topics": ["stats", "project", "process", "contact"],
    },
    "club": {
        "categories": ["nss", "ncc", "ieee", "iei", "adventure", "women_cell"],
        "topics": ["event", "contact", "process", "stats", "notice"],
    },
    "admission": {
        "categories": ["ug", "pg"],
        "topics": ["process", "fees", "document", "result", "calendar", "contact"],
    },
}
|
| 413 |
+
|
| 414 |
+
from sentence_transformers import SentenceTransformer
|
| 415 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 416 |
+
import numpy as np
|
| 417 |
+
import re
|
| 418 |
+
from app.core.config import settings
|
| 419 |
+
from typing import Optional
|
| 420 |
+
from rank_bm25 import BM25Okapi
|
| 421 |
+
|
| 422 |
+
# Minimum combined score a field's best label must exceed to be accepted.
#   type     - lowered from 0.5, which was too high for an embedding-heavy field
#   category - kept at 0.5; the BM25-heavy blend works well here
#   topic    - lowered slightly from 0.5
#   intent   - kept at 0.5; signals are usually clear
FIELD_THRESHOLDS = dict(
    type=0.25,
    category=0.5,
    topic=0.4,
    intent=0.5,
)
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
# Per-field blend as (embedding_weight, bm25_weight).
#   type     - embedding-heavy: semantic match
#   category - BM25-heavy: exact names matter most
#   intent   - embedding-heavy: semantic intent
FIELD_WEIGHTS = dict(
    type=(0.6, 0.4),
    category=(0.35, 0.65),
    topic=(0.55, 0.45),
    intent=(0.7, 0.3),
)
|
| 436 |
+
|
| 437 |
+
class FilterClassifier:
    """Map a free-text query to metadata filters (type, category, topic, year, intent).

    Every field in ``ANCHORS`` is scored independently with a weighted blend
    of embedding cosine similarity and BM25 (weights from ``FIELD_WEIGHTS``).
    A label is accepted only when its combined score exceeds
    ``FIELD_THRESHOLDS[field]``; category and topic are then dropped if they
    are not valid for the detected type according to ``MASTER_INDEX``.
    """

    def __init__(self, threshold=None):
        """Load the sentence-embedding model and precompute the anchor indexes.

        Args:
            threshold: Stored on the instance as ``self.threshold``. It is not
                read by any method of this class; scoring uses
                ``FIELD_THRESHOLDS``.
        """
        self.model = SentenceTransformer(str(settings.embeddings_path / "bge-small"))
        self.anchor_embeddings = {}
        self.bm25_classifiers = {}
        self.anchor_keys = {}
        self._build_anchor_embeddings()
        self._build_bm25()
        self.threshold = threshold

    def _build_anchor_embeddings(self):
        """Encode ``"<label> <text>"`` for every anchor, grouped by field."""
        for field, anchors in ANCHORS.items():
            self.anchor_embeddings[field] = {
                label: self.model.encode(f"{label} {text}")
                for label, text in anchors.items()
            }

    def _build_bm25(self):
        """Build one BM25 index per field over lowercase, whitespace-split anchors."""
        for field, anchors in ANCHORS.items():
            docs = [f"{label} {text}".lower().split() for label, text in anchors.items()]
            # Label order matches document order, so BM25 score i belongs to keys[i].
            self.anchor_keys[field] = list(anchors.keys())
            self.bm25_classifiers[field] = BM25Okapi(docs)

    def handle_abbreviations(self, query: str) -> str:
        """Lowercase the query and replace whitespace-delimited tokens found in ``ABBREVIATIONS``."""
        tokens = query.lower().split()
        return " ".join(ABBREVIATIONS.get(token, token) for token in tokens)

    def preprocess(self, query: str) -> str:
        """Lowercase, replace punctuation with spaces and collapse whitespace."""
        no_punct = re.sub(r'[^\w\s]', ' ', query.lower())
        return re.sub(r'\s+', ' ', no_punct).strip()

    def classify(self, query: str) -> dict:
        """Classify ``query`` into filter fields.

        Returns:
            A dict with keys ``type``, ``category``, ``topic``, ``year`` and
            ``intent``; a field is ``None`` when nothing cleared its threshold
            (or, for ``year``, when no ``20xx`` number is present).
        """
        # Strip punctuation BEFORE abbreviation lookup: the lookup is an exact
        # token match, so "cse?" or "tnp," would otherwise never be expanded.
        query = self.preprocess(query)
        query = self.handle_abbreviations(query)
        query_emb = self.model.encode(query)
        tokenized = query.split()
        result = {"type": None, "category": None, "topic": None, "year": None, "intent": None}

        for field, value_embeddings in self.anchor_embeddings.items():
            keys = self.anchor_keys[field]
            if not keys:
                continue  # nothing to score for this field

            # Embedding scores
            emb_scores = {
                label: cosine_similarity([query_emb], [emb])[0][0]
                for label, emb in value_embeddings.items()
            }

            # BM25 scores, normalized by the field's maximum (1 when no term matches)
            raw_bm25 = self.bm25_classifiers[field].get_scores(tokenized)
            peak = max(raw_bm25, default=0)
            bm25_max = peak if peak > 0 else 1
            bm25_scores = {label: score / bm25_max for label, score in zip(keys, raw_bm25)}

            # Weighted combination
            emb_w, bm25_w = FIELD_WEIGHTS[field]
            combined = {
                label: emb_w * emb_scores[label] + bm25_w * bm25_scores.get(label, 0)
                for label in keys
            }

            best_val = max(combined, key=combined.get)
            best_score = combined[best_val]

            print(f"{field}: {best_val} | combined={best_score:.3f} emb={emb_scores[best_val]:.3f} bm25={bm25_scores.get(best_val,0):.3f}")

            if best_score > FIELD_THRESHOLDS[field]:
                result[field] = best_val

        year = re.search(r"\b(20\d{2})\b", query)
        result["year"] = int(year.group()) if year else None

        # Keep category/topic only when they are valid for the detected type.
        # With no type (or a type missing from MASTER_INDEX) both are cleared.
        allowed = MASTER_INDEX.get(result["type"], {})
        if result["category"] not in allowed.get("categories", ()):
            result["category"] = None
        if result["topic"] not in allowed.get("topics", ()):
            result["topic"] = None

        return result
|
| 528 |
+
|
| 529 |
+
# Module-level instance built at import time; constructing it runs
# FilterClassifier.__init__ (model load plus anchor/BM25 index construction).
classifier = FilterClassifier()
|
app/services/hybrid_retrieval.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hybrid Retrieval Service
|
| 3 |
+
========================
|
| 4 |
+
Combines BM25 (keyword) and vector (semantic) retrieval using
|
| 5 |
+
Reciprocal Rank Fusion (RRF) for stable, well-tested score merging.
|
| 6 |
+
|
| 7 |
+
Strategy:
|
| 8 |
+
1. Run vector similarity search → fetches the top ``candidate_k`` candidates from ChromaDB.
|
| 9 |
+
2. Those same candidate documents become the BM25 corpus (no second DB call).
|
| 10 |
+
3. Fuse both ranked lists using RRF.
|
| 11 |
+
4. Apply classifier-metadata boosts and a title-match boost post-fusion.
|
| 12 |
+
5. Return the top-k results.
|
| 13 |
+
|
| 14 |
+
Why RRF instead of EnsembleRetriever?
|
| 15 |
+
- EnsembleRetriever depends on langchain_classic which is unstable.
|
| 16 |
+
- RRF is score-agnostic: it only uses rank order, so you never need to
|
| 17 |
+
normalise BM25 scores against cosine distances.
|
| 18 |
+
- It's the standard fusion method in production hybrid search systems
|
| 19 |
+
(used by Elasticsearch, Cohere, etc.).
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
from dataclasses import dataclass, field
|
| 25 |
+
from typing import List, Optional, Dict
|
| 26 |
+
|
| 27 |
+
from langchain_community.retrievers import BM25Retriever
|
| 28 |
+
from langchain_core.documents import Document
|
| 29 |
+
|
| 30 |
+
from app.utils.preprocessing import preprocess
|
| 31 |
+
from app.services.classifier_service import clf
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# Configuration
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
|
| 38 |
+
@dataclass
class HybridRetrievalConfig:
    """Centralised configuration for the hybrid retrieval pipeline.

    Attributes:
        candidate_k: Number of candidates each sub-retriever fetches before
            fusion.
        top_k: Final number of documents returned after fusion + reranking.
        bm25_weight: Weight of the BM25 term in the fused score.
        vector_weight: Weight of the vector term in the fused score.
            ``bm25_weight + vector_weight`` should equal 1.0.
        rrf_k: RRF constant; a larger value smooths rank differences
            (standard default: 60).
        bm25_k1: BM25 term-frequency saturation.
        bm25_b: BM25 length normalisation.
        title_boost_per_word: Amount added to the fused score for each query
            word found in the document title.
        score_threshold: Minimum fused score to include a result
            (set to 0.0 to disable).
    """

    # Candidate pool and result sizes
    candidate_k: int = 15
    top_k: int = 5

    # Fusion weights and RRF smoothing constant
    bm25_weight: float = 0.6
    vector_weight: float = 0.4
    rrf_k: int = 60

    # BM25 hyperparameters
    bm25_k1: float = 1.5
    bm25_b: float = 0.5

    # Post-fusion re-ranking and filtering
    title_boost_per_word: float = 0.1
    score_threshold: float = 0.0
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
# Result type
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
|
| 70 |
+
@dataclass
class RetrievalResult:
    """One retrieved document together with the scores that ranked it.

    Attributes:
        document: The retrieved document.
        fused_score: Combined score used for the final ordering.
        bm25_rank: 1-indexed rank in the BM25 list, ``None`` if absent.
        vector_rank: 1-indexed rank in the vector list, ``None`` if absent.
        title_boost: Title-match boost that was added to ``fused_score``.
    """

    document: Document
    fused_score: float
    bm25_rank: Optional[int] = None
    vector_rank: Optional[int] = None
    title_boost: float = 0.0
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# ---------------------------------------------------------------------------
|
| 81 |
+
# Core service
|
| 82 |
+
# ---------------------------------------------------------------------------
|
| 83 |
+
|
| 84 |
+
class HybridRetrievalService:
    """
    Hybrid retrieval that fuses BM25 and vector search results via RRF.

    Usage
    -----
    ::

        service = HybridRetrievalService(vector_db=rag.db)
        results = service.retrieve(query="Faculties of Computer Department")
        for r in results:
            print(r.fused_score, r.document.page_content[:80])
    """

    # Metadata fields compared against the classifier's predictions, with the
    # multiplier applied on a high-confidence (> 0.90) match. "intent" never
    # takes the high-confidence branch, so it only ever gets the 1.05 fallback.
    _FILTER_BOOST = {
        "type": 1.10,
        "category": 1.20,
        "topic": 1.20,
        "intent": 1.05,
    }

    # Fused scores are multiplied by this factor before the threshold check.
    # NOTE(review): presumably rescales the small RRF scores to the range the
    # caller's threshold expects -- confirm.
    _SCORE_SCALE = 10

    def __init__(
        self,
        vector_db,
        config: Optional[HybridRetrievalConfig] = None,
    ):
        """
        Parameters
        ----------
        vector_db:
            A LangChain-compatible vector store (e.g., Chroma instance from
            ``RAGService.db``) that supports ``similarity_search_with_score``.
        config:
            Optional configuration object. Defaults to ``HybridRetrievalConfig()``.
        """
        self.vector_db = vector_db
        self.cfg = config or HybridRetrievalConfig()
        self.classifier = clf
        # Classifier predictions for the most recent query; filled in by
        # ``_vector_rank`` and read by ``_apply_filter_boost``.
        self.raw_filters = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def retrieve(
        self,
        query: str,
    ) -> List[RetrievalResult]:
        """
        Run hybrid retrieval and return ranked results.

        Vector search runs once to fetch the candidate pool from ChromaDB.
        Those documents are immediately reused as the BM25 corpus, so there
        is no redundant database call.

        Parameters
        ----------
        query:
            The raw user query (abbreviation expansion is applied internally).

        Returns
        -------
        List[RetrievalResult]
            Top-k results sorted by descending fused score. Each returned
            document has its original ``page_content`` restored and an empty
            ``original_content`` metadata entry.
        """
        processed_query = self.classifier.expand_abbreviations(query)
        print("Processed Query: ", processed_query)

        # Step 1: Single vector search — produces both the ranking AND the candidate pool
        vector_ranking = self._vector_rank(processed_query)
        if not vector_ranking:
            return []

        # Step 2: Prepare the candidate docs for BM25. The documents are
        # modified in place: the original text is parked in metadata and
        # page_content becomes "<title>: <preprocessed text>".
        candidate_docs = []
        for doc, _score, _rank in vector_ranking:
            doc.metadata["original_content"] = doc.page_content
            body = preprocess(doc.page_content)
            # A document without a title must not abort retrieval with a
            # KeyError; it is simply indexed without the title prefix.
            title = doc.metadata.get("title")
            doc.page_content = f"{title}: {body}" if title else body
            candidate_docs.append(doc)

        # Step 3: BM25 search over the same candidate pool (no extra DB call)
        bm25_ranking = self._bm25_rank(processed_query, candidate_docs)

        # Step 4: Fuse both rankings via RRF
        fused = self._reciprocal_rank_fusion(bm25_ranking, vector_ranking)

        # Step 5: Boost scores where classifier predictions match doc metadata
        self._apply_filter_boost(fused)

        # Step 6: Boost scores for query words found in the title
        boosted = self._apply_title_boost(fused, processed_query)

        # Step 7: Rescale, restore original text, filter, sort, and return top-k
        for r in boosted:
            r.fused_score = r.fused_score * self._SCORE_SCALE
            r.document.page_content = r.document.metadata["original_content"]
            r.document.metadata["original_content"] = ""
        results = [r for r in boosted if r.fused_score >= self.cfg.score_threshold]
        results = sorted(results, key=lambda r: r.fused_score, reverse=True)
        return results[: self.cfg.top_k]

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _apply_filter_boost(self, results: List[RetrievalResult]) -> None:
        """
        Multiply fused scores in place when a document's metadata matches
        the classifier prediction stored in ``self.raw_filters``.

        A match with confidence > 0.90 uses the per-field multiplier
        (except for "intent"); any other match with confidence > 0.7 is
        multiplied by 1.05.
        """
        for result in results:
            metadata = result.document.metadata
            for name, multiplier in self._FILTER_BOOST.items():
                if name not in metadata:
                    continue
                predicted = self.raw_filters.get(name)
                if predicted is None or metadata[name] != predicted:
                    continue
                confidence = self.raw_filters.get(f"{name}_conf", 0)
                if confidence > 0.90 and name != "intent":
                    result.fused_score *= multiplier
                elif confidence > 0.7:
                    result.fused_score *= 1.05

    def _bm25_rank(
        self,
        processed_query: str,
        candidate_docs: List[Document],
    ) -> List[tuple[Document, float, int]]:
        """
        Run BM25 over the candidate pool.

        Returns a list of (document, raw_bm25_score, rank) tuples,
        ordered by descending score (rank is 1-indexed). An empty pool
        yields an empty list.
        """
        if not candidate_docs:
            return []

        retriever = BM25Retriever.from_documents(
            candidate_docs,
            bm25_params={"k1": self.cfg.bm25_k1, "b": self.cfg.bm25_b},
        )

        tokens = retriever.preprocess_func(processed_query)
        raw_scores = retriever.vectorizer.get_scores(tokens)

        # Pair each document with its BM25 score and sort descending
        scored = sorted(
            zip(retriever.docs, raw_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [(doc, score, rank) for rank, (doc, score) in enumerate(scored, start=1)]

    def _vector_rank(
        self,
        processed_query: str,
    ) -> List[tuple[Document, float, int]]:
        """
        Run vector similarity search against ChromaDB.

        The classifier's metadata filter is tried first; when there is no
        filter, or the filtered search returns nothing, an unfiltered search
        is used instead. As a side effect ``self.raw_filters`` is replaced
        with the classifier's raw prediction for this query.

        Returns a list of (document, similarity_score, rank) tuples in the
        order returned by the store (rank is 1-indexed).
        Chroma returns (document, distance); we convert to similarity = 1 - distance.
        """
        # Reset first so stale predictions never survive a failing classifier call.
        self.raw_filters = {}
        filters = self.classifier.predict_with_filter([processed_query])
        self.raw_filters = self.classifier.predict([processed_query])[0]

        raw_results = []
        if filters:
            raw_results = self.vector_db.similarity_search_with_score(
                processed_query, k=self.cfg.candidate_k, filter=filters
            )
        if not raw_results:
            # No filter available, or the filter excluded everything.
            raw_results = self.vector_db.similarity_search_with_score(
                processed_query, k=self.cfg.candidate_k
            )

        return [
            (doc, 1.0 - distance, rank)
            for rank, (doc, distance) in enumerate(raw_results, start=1)
        ]

    def _reciprocal_rank_fusion(
        self,
        bm25_ranking: List[tuple[Document, float, int]],
        vector_ranking: List[tuple[Document, float, int]],
    ) -> List[RetrievalResult]:
        """
        Fuse two ranked lists using Reciprocal Rank Fusion (RRF).

        RRF score for a document d:
            score(d) = w_bm25 * 1/(k + rank_bm25(d))
                     + w_vec  * 1/(k + rank_vec(d))

        Documents not present in a list are simply omitted from that term.
        The full ``page_content`` string is the deduplication key, so
        documents with identical content collapse into one result.
        """
        rrf_k = self.cfg.rrf_k
        fused: Dict[str, RetrievalResult] = {}

        # (ranking, weight, attribute that records the rank); BM25 goes first
        # so its document object is the one kept for a shared key.
        sources = (
            (bm25_ranking, self.cfg.bm25_weight, "bm25_rank"),
            (vector_ranking, self.cfg.vector_weight, "vector_rank"),
        )
        for ranking, weight, rank_attr in sources:
            for doc, _score, rank in ranking:
                key = doc.page_content
                if key not in fused:
                    fused[key] = RetrievalResult(document=doc, fused_score=0.0)
                entry = fused[key]
                entry.fused_score += weight * (1.0 / (rrf_k + rank))
                setattr(entry, rank_attr, rank)

        return list(fused.values())

    def _apply_title_boost(
        self,
        results: List[RetrievalResult],
        processed_query: str,
    ) -> List[RetrievalResult]:
        """
        Boost fused score for documents whose title contains query words.

        Each matching word adds ``cfg.title_boost_per_word`` to the score.
        This is a lightweight, interpretable re-ranking step that rewards
        title hits without overriding semantic relevance entirely.

        The results are modified in place and the same list is returned.
        """
        query_words = set(processed_query.lower().split())

        for result in results:
            raw_title = result.document.metadata.get("title") or ""
            title = preprocess(raw_title.lower())
            if not title:
                continue

            # NOTE(review): ``word in title`` is substring containment on the
            # preprocessed title string, so short words can match inside
            # longer ones -- confirm this leniency is intended.
            boost = sum(
                self.cfg.title_boost_per_word
                for word in query_words
                if word and word in title
            )
            result.title_boost = boost
            result.fused_score += boost

        return results
|
app/services/ingestion_service.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from app.services.file_service import file_service
|
| 3 |
+
from app.services.text_splitter import TextSplitter
|
| 4 |
+
from langchain_core.documents import Document
|
| 5 |
+
from app.utils.preprocessing import normalize
|
| 6 |
+
import json
|
| 7 |
+
from typing import List
|
| 8 |
+
from fastapi import HTTPException
|
| 9 |
+
|
| 10 |
+
class IngestionService:
    """Read a file through ``file_service`` and turn it into ``Document`` chunks.

    JSON files are expanded into per-topic list / count / detail chunks by
    ``handle_json_docs``; every other file goes through ``TextSplitter`` and
    ``normalize`` in ``handle_text_docs``.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = TextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

    def load_file(self, file_path: Path):
        """Read ``file_path``, save it via ``file_service`` and return its chunks."""
        document = file_service.read_file(file_path)
        metadata = document.metadata

        # Save the content and metadata before chunking.
        file_service.write_file(file_path, document.page_content, metadata)

        # JSON gets structure-aware chunking; everything else is split as text.
        if metadata["ext"] == "json":
            return self.handle_json_docs(document, metadata)
        return self.handle_text_docs(document, file_path, metadata)

    def ingest(self, file_path: Path):
        """Return the chunks produced by ``load_file`` for ``file_path``."""
        return self.load_file(file_path)

    def get_records(self):
        """Delegate to ``file_service.get_records``."""
        return file_service.get_records()

    def delete_record(self, filename: str):
        """Delegate to ``file_service.delete_record``."""
        return file_service.delete_record(filename)

    def path_record(self, file_path: Path, metadata: dict):
        """Patch the stored metadata of ``file_path`` via ``file_service.patch_metadata``.

        The method name is kept as-is because callers use it.
        """
        file_service.patch_metadata(file_path, metadata)

    @staticmethod
    def _clean_items(items) -> List[str]:
        """Return the stripped, non-blank entries of ``items`` (``None`` -> ``[]``)."""
        stripped = (item.strip() for item in items or [])
        return [text for text in stripped if text]

    @staticmethod
    def _numbered_list(items: List[str]) -> str:
        """Render ``items`` as consecutive ``"N. item"`` lines, one per item."""
        return "".join(f"{number}. {item}\n" for number, item in enumerate(items, start=1))

    def handle_json_docs(self, document: Document, metadata: dict) -> List[Document]:
        """Expand a JSON document into list, count and detail chunks.

        Expected content shape::

            { key: { list: [], detail: text }, key: { list: [], detail: text } }

        For every topic ``key``: a non-empty ``list`` yields one numbered-list
        chunk (intent ``"list"``) followed by a ``"Total <key>: <n>"`` chunk
        (intent ``"count"``); a non-blank ``detail`` / ``details`` yields one
        chunk (intent ``"detail"``). Blank list items are ignored for both the
        numbering and the total, and a null ``list`` or ``detail`` is skipped
        instead of raising. ``chunk_index`` increases across all chunks.
        """
        docs = []
        json_data = json.loads(document.page_content)
        count = 0

        def add_chunk(text: str, topic: str, intent: str) -> None:
            # Every chunk carries the file metadata plus its own topic/intent/index.
            nonlocal count
            docs.append(Document(
                page_content=text,
                metadata={**metadata, "topic": topic, "intent": intent, "chunk_index": count},
            ))
            count += 1

        for key, value in json_data.items():
            for intent, intent_content in value.items():
                if intent == "list":
                    items = self._clean_items(intent_content)
                    if not items:
                        continue
                    add_chunk(self._numbered_list(items), key, intent)
                    add_chunk(f"Total {key}: {len(items)}", key, "count")
                elif intent in ("detail", "details"):
                    text = (intent_content or "").strip()
                    if text:
                        add_chunk(text, key, "detail")
        return docs

    def handle_text_docs(self, document: Document, file_path: Path, metadata: dict) -> List[Document]:
        """Split a text document and return its normalised, non-empty chunks.

        Each chunk's metadata merges the splitter's metadata with the file
        ``metadata`` (file values win), then sets ``source`` to the file name
        and ``chunk_index`` to the chunk's position in the splitter output.
        """
        docs = []
        for idx, doc in enumerate(self.text_splitter.split_documents([document])):
            # Build a new Document so the splitter's output is left untouched.
            new_doc = Document(
                page_content=normalize(doc.page_content),
                metadata={
                    **doc.metadata,
                    **metadata,
                    "source": file_path.name,
                    "chunk_index": idx,
                },
            )

            # Drop chunks whose normalised content is only whitespace.
            if new_doc.page_content.strip():
                docs.append(new_doc)

        return docs
|
| 94 |
+
|
| 95 |
+
# Module-level instance with the default chunk_size=1000 / chunk_overlap=200, created at import time.
ingestion_service = IngestionService()
|
app/services/rag_service.py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
# LangChain Imports
|
| 5 |
+
from fastapi import HTTPException
|
| 6 |
+
from langchain_core.prompts import PromptTemplate
|
| 7 |
+
from langchain_chroma import Chroma
|
| 8 |
+
from langchain.messages import HumanMessage, AIMessage, SystemMessage
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
from app.api.schemas.tests import TestRequestSchema, TestClassifierReqSchema
|
| 11 |
+
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score
|
| 12 |
+
|
| 13 |
+
# Locals
|
| 14 |
+
from app.services.text_splitter import TextSplitter
|
| 15 |
+
from app.services.vector_store import VectorStore
|
| 16 |
+
from app.utils.preprocessing import normalize, preprocess_documents, preprocess_query
|
| 17 |
+
from app.utils.document_helpers import get_references, create_documents, build_metadata, get_references_v2
|
| 18 |
+
from app.prompts import SYSTEM_PROMPT, wrap_exaone
|
| 19 |
+
from app.core.config import settings
|
| 20 |
+
from app.services.hybrid_retrieval import HybridRetrievalService, HybridRetrievalConfig
|
| 21 |
+
from app.services.ingestion_service import IngestionService
|
| 22 |
+
from app.services.classifier_service import clf
|
| 23 |
+
|
| 24 |
+
def format_history(history: list[str]) -> str:
    """Render a flat message list as alternating ``User:`` / ``Assistant:`` lines.

    Messages at even positions (0, 2, ...) are labelled "User" and messages at
    odd positions "Assistant". Lines are joined with newlines; an empty
    history gives an empty string.
    """
    return "\n".join(
        f"{'User' if position % 2 == 0 else 'Assistant'}: {message}"
        for position, message in enumerate(history)
    )
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class RAGService:
|
| 33 |
+
def __init__(self,
             model,
             collection_name: str = None,
             persist_directory: str = None,
             embedding_model = None,
             k: int = None):
    """Wire up the LLM, the Chroma vector store and the prompt template.

    Args:
        model: Chat model; ``query`` calls ``model.invoke`` on the prompt.
        collection_name: Chroma collection; falls back to
            ``settings.collection_name`` when falsy.
        persist_directory: Chroma storage directory; falls back to
            ``settings.persist_directory`` when falsy.
        embedding_model: Embedding function handed to Chroma.
        k: Default number of documents to retrieve; falls back to
            ``settings.similarity_top_k`` when falsy.
    """
    # Models
    self.model = model
    self.embedding_model = embedding_model

    # Anything not supplied comes from the application settings.
    self.collection_name = collection_name or settings.collection_name
    self.k = k or settings.similarity_top_k
    self.persist_directory = persist_directory or settings.persist_directory
    self.evaluation = {}

    # Vector database and the wrapper used for adds/searches
    self.db = Chroma(
        collection_name=self.collection_name,
        embedding_function=self.embedding_model,
        persist_directory=self.persist_directory,
    )
    self.database = VectorStore(self.db)
    self.text_splitter = TextSplitter()

    # Prompt template and a plain similarity retriever over the same store
    self.template = PromptTemplate.from_template(SYSTEM_PROMPT)
    retriever_kwargs = {"k": self.k}
    self.retriever = self.db.as_retriever(search_type="similarity", search_kwargs=retriever_kwargs)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def get_filenames(self):
    """Return the records reported by a freshly created ``IngestionService``."""
    return IngestionService().get_records()
|
| 68 |
+
|
| 69 |
+
def ingest_documents(self, filepath: str, chunk_size: int = None, chunk_overlap: int = None):
    """Chunk a file, add the chunks to the vector store and record ingestion stats.

    Args:
        filepath: Path of the file to ingest.
        chunk_size: Chunk size; falls back to ``settings.chunk_size`` when falsy.
        chunk_overlap: Chunk overlap; falls back to ``settings.chunk_overlap``
            only when omitted (``None``), so an explicit ``0`` is honoured.

    Returns:
        The list of chunks that were added to the vector store.

    Raises:
        FileNotFoundError: ``filepath`` does not exist.
        HTTPException: (400) the file produced no chunks, or a single chunk
            with blank content.
    """
    start = time.time()
    path = Path(filepath)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    chunk_size = chunk_size or settings.chunk_size
    # ``or`` would silently replace a deliberate zero overlap with the default.
    if chunk_overlap is None:
        chunk_overlap = settings.chunk_overlap

    ingestion_service = IngestionService(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = ingestion_service.ingest(path)
    print("Chunks: ", chunks)

    if not chunks:
        raise HTTPException(
            status_code=400,
            detail=f"No text content found in '{path.name}'. The file may be image-based or empty."
        )

    if len(chunks) == 1 and chunks[0].page_content.strip() == "":
        raise HTTPException(
            status_code=400,
            detail=f"Document '{path.name}' contains empty or unreadable content."
        )

    size_mb = path.stat().st_size / (1024 * 1024)

    # Add to database
    self.database.add_documents(chunks)

    # Rough DB footprint estimate: per chunk, 768 * 4 bytes plus chunk_size.
    # NOTE(review): 768 * 4 presumably is a 768-dim float32 embedding -- confirm
    # against the embedding model actually in use.
    bytes_per_chunk = (768 * 4) + chunk_size
    estimated_db_mb = (bytes_per_chunk * len(chunks)) / (1024 * 1024)

    ingestion_service.path_record(file_path=path, metadata={
        "doc_chunks": len(chunks),
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "execution_time": time.time() - start,
        "file_size": size_mb,
        "db_size": estimated_db_mb
    })

    print(f"✅ Added {len(chunks)} chunks from {path.name} to vector store")
    return chunks
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def query(
    self,
    question: str,
    history: List[str] = None,
    k: int = None,
    threshold: float = None,
    include_llm_response: bool = True
):
    """
    Vector-similarity search with an optional LLM answer.

    Args:
        question: User's question
        history: Conversation history (optional)
        k: Number of documents to retrieve (defaults to ``self.k``, then settings)
        threshold: Similarity threshold passed to ``get_references`` (defaults to settings)
        include_llm_response: Whether to generate LLM answer (default: True)

    Returns:
        dict with 'answer', 'references' and 'context'; once a search has run
        it also carries 'threshold_used', and on success 'k_used'. When
        ``include_llm_response`` is False both 'answer' and 'context' are
        empty strings.
    """
    # Guard: nothing to search for.
    if not question or not question.strip():
        return {
            "answer": "Please provide a valid question",
            "references": [],
            "context": ""
        }

    # Use defaults from settings
    k = k or self.k or settings.similarity_top_k
    if threshold is None:
        threshold = settings.similarity_threshold
    history = history or []

    # Normalize and search
    question = preprocess_query(question)
    docs = self.database.similarity_search_with_score(query=question, k=k)

    # References and context; get_references receives the threshold.
    ctx = get_references(docs, threshold=threshold)
    filtered_docs = ctx.get('documents', [])
    context = ctx.get('context', '')

    # Guard: nothing came back from get_references.
    if not filtered_docs:
        return {
            "answer": "No relevant documents found matching the similarity threshold",
            "references": [],
            "context": "",
            "threshold_used": threshold
        }

    if not include_llm_response:
        # References only: no answer, and the context string is cleared too.
        answer = ""
        context = ""
    else:
        prompt = self.template.invoke({
            "history": format_history(history),
            "question": question,
            "context": context
        })
        answer = self.model.invoke(prompt).content

    return {
        "answer": answer,
        "references": filtered_docs,
        "context": context,
        "threshold_used": threshold,
        "k_used": k
    }
|
| 191 |
+
|
| 192 |
+
def hybrid_query(
    self,
    question: str,
    history: List[str] = None,
    k: int = None,
    threshold: float = None,
    include_llm_response: bool = True
):
    """
    Answer a question using hybrid (BM25 + vector) retrieval, optionally followed by an LLM call.

    Args:
        question: User's question
        history: Conversation history (optional)
        k: Number of documents to return after fusion (defaults to settings)
        threshold: Score threshold filter (defaults to settings)
        include_llm_response: Whether to generate LLM answer (default: True)

    Returns:
        dict with 'answer', 'references' and 'context'. Once retrieval has run,
        'threshold_used' is included as well, and a successful retrieval also
        reports 'k_used' (the effective number of results requested).
        When include_llm_response is False both 'answer' and 'context' are empty
        strings and only 'references' carries data.
    """
    if not question or not question.strip():
        return {
            "answer": "Please provide a valid question",
            "references": [],
            "context": ""
        }

    # Use defaults from settings
    threshold = threshold if threshold is not None else settings.similarity_threshold
    history = history or []

    candidate_k = 15  # how many docs vector search fetches (also the BM25 pool size)
    final_k = k or settings.similarity_top_k  # how many results to return after fusion

    config = HybridRetrievalConfig(
        candidate_k=candidate_k,
        top_k=final_k,
        bm25_weight=0.45,
        vector_weight=0.55,
        rrf_k=20,
        bm25_k1=1.2,
        bm25_b=0.9,
        title_boost_per_word=0.004,
        score_threshold=threshold,
    )

    service = HybridRetrievalService(vector_db=self.db, config=config)
    docs = service.retrieve(query=question)

    ctx = get_references_v2(docs, threshold=threshold)
    filtered_docs = ctx.get('documents', [])
    context = ctx.get('context', 'No context available')

    # Diagnostic only: the retrieved context can be large, so it goes to the
    # debug log instead of being printed on every request.
    import logging
    logging.getLogger(__name__).debug("hybrid_query context: %s", context)

    if not filtered_docs:
        return {
            "answer": "No relevant documents found matching the similarity threshold",
            "references": [],
            "context": "",
            "threshold_used": threshold
        }

    if include_llm_response:
        prompt = self.template.invoke({
            "history": format_history(history),
            "question": question,
            "context": context
        })

        # if settings.local_model_name == "EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf":
        #     prompt = wrap_exaone(prompt)

        response = self.model.invoke(prompt)
        # Some models return a message object with .content, others the raw text.
        answer = response.content if hasattr(response, "content") else response
    else:
        # Retrieval-only mode: callers get the references, no answer or context.
        answer = ""
        context = ""

    return {
        "answer": answer,
        "references": filtered_docs,
        "context": context,
        "threshold_used": threshold,
        # Report the value that was actually applied, not the raw argument
        # (which is None whenever the settings default was used).
        "k_used": final_k
    }
|
| 285 |
+
|
| 286 |
+
def search_docs(
    self,
    question: str,
    k: int = 10,
    filename: str = None
):
    """
    Retrieve raw chunks with hybrid (BM25 + vector) retrieval; no LLM call is made.

    Args:
        question: User's question
        k: Number of documents to return after fusion (falls back to settings when falsy)
        filename: If given, keep only chunks whose metadata 'source_file' equals it.
            The filter runs after retrieval, so fewer than k results may come back.

    Returns:
        list of dicts with 'id', 'content', 'metadata' and 'score'.
        For an empty question the placeholder dict with 'answer', 'references'
        and 'context' is returned instead (kept for backward compatibility).
    """
    if not question or not question.strip():
        return {
            "answer": "Please provide a valid question",
            "references": [],
            "context": ""
        }

    # Use defaults from settings
    threshold = settings.similarity_threshold

    candidate_k = 15  # how many docs vector search fetches (also the BM25 pool size)
    final_k = k or settings.similarity_top_k  # how many results to return after fusion

    config = HybridRetrievalConfig(
        candidate_k=candidate_k,
        top_k=final_k,
        bm25_weight=0.7,
        vector_weight=0.3,
        rrf_k=20,
        bm25_k1=1.5,
        bm25_b=0.75,
        title_boost_per_word=0.004,
        score_threshold=threshold,
    )

    service = HybridRetrievalService(vector_db=self.db, config=config)
    results = [
        {
            "id": hit.document.id,
            "content": hit.document.page_content,
            "metadata": hit.document.metadata,
            "score": hit.fused_score
        }
        for hit in service.retrieve(query=question)
    ]

    if filename:
        # .get() so a chunk without 'source_file' metadata is skipped
        # instead of aborting the whole search with a KeyError.
        results = [
            row for row in results
            if (row["metadata"] or {}).get("source_file") == filename
        ]

    return results
|
| 343 |
+
|
| 344 |
+
def test_queries(self, tests: TestRequestSchema, query_delay: float = 1.0):
    """
    Run a retrieval benchmark over ``tests.tests`` and aggregate ranking metrics.

    Every case is sent through ``hybrid_query`` with the LLM step disabled and
    the returned references are compared with the case's expected ``document``
    and ``chunk_index``.

    Args:
        tests: Suite exposing ``k``, ``threshold`` and ``tests``; each case
            exposes ``question``, ``document`` and ``chunk_index``.
        query_delay: seconds to wait between queries.
            Gemini free tier allows 100 embedding RPM → safe delay = 1.0s.
            For 150 queries: ~2.5 min total.

    Returns:
        dict with the per-case "results" list plus suite-level averages.
        For an empty suite every aggregate is 0 (no division by zero).
    """
    results = []
    k = tests.k
    threshold = tests.threshold
    total = len(tests.tests)

    for idx, test in enumerate(tests.tests):
        document = test.document
        chunk_index = test.chunk_index

        response = self.hybrid_query(
            question=test.question,
            history=[],
            k=k,
            threshold=threshold,
            include_llm_response=False,
        )

        # Respect Gemini embedding rate limit (100 RPM free tier)
        if query_delay > 0 and idx < total - 1:
            print(f"[test_queries] {idx + 1}/{total} done — sleeping {query_delay}s")
            time.sleep(query_delay)

        references = response.get("references", [])
        len_all_docs = len(references)
        correct_source_chunks = 0
        rank = None  # 1-based position of the first reference matching document AND chunk

        for position, ref in enumerate(references, start=1):
            if ref.get("source") != document:
                continue
            correct_source_chunks += 1
            if rank is None and ref.get("chunk_index") == chunk_index:
                rank = position

        wrong_source_chunks = len_all_docs - correct_source_chunks
        if len_all_docs > 0:
            doc_precision = correct_source_chunks / len_all_docs
            doc_noise = wrong_source_chunks / len_all_docs
        else:
            doc_precision = 0
            doc_noise = 0
        # Document-level recall is binary: was the expected source retrieved at all?
        doc_recall = 1 if correct_source_chunks > 0 else 0
        doc_error = 1 - doc_recall
        mrr = 1 / rank if rank is not None else 0

        results.append({
            "tests": test,
            "answer": rank is not None,
            "correct_source_chunks": correct_source_chunks,
            "wrong_source_chunks": wrong_source_chunks,
            "doc_precision": doc_precision,
            "doc_recall": doc_recall,
            "doc_error": doc_error,
            "mrr": mrr,
            "top_1_hit": 1 if rank == 1 else 0,
            "doc_noise": doc_noise,
        })

    def average(key):
        # Mean of one per-case metric; 0 for an empty suite.
        return sum(r[key] for r in results) / len(results) if results else 0

    hit_rate = sum(1 for r in results if r["answer"]) / len(results) if results else 0

    return {
        "results": results,
        "avg_doc_precision": average("doc_precision"),
        "avg_doc_recall": average("doc_recall"),
        "avg_mrr": average("mrr"),
        "hit_rate": hit_rate,
        "top_1_hit_rate": average("top_1_hit"),
        "avg_doc_noise": average("doc_noise"),
        "error_rate": 1 - hit_rate if results else 0,
        "avg_doc_error": average("doc_error")
    }
|
| 441 |
+
|
| 442 |
+
def test_classifier(self, tests: TestClassifierReqSchema):
    """
    Score the classifier's predictions against the labelled cases in ``tests.tests``.

    The questions are classified in one ``clf.predict`` call. For each of the
    label fields (type, category, topic, intent) the expected and predicted
    values are compared, with any falsy value treated as "general".

    Returns:
        dict with "evaluation" (per-field accuracy, macro precision/recall,
        macro and weighted F1, and a classification report) and "results"
        (the raw predictions).
    """
    cases = tests.tests
    result = clf.predict([case.question for case in cases])

    evaluation = {}
    for field in ("type", "category", "topic", "intent"):
        # Missing/empty labels on either side fall back to the "general" class.
        y_true = [getattr(case, field) or "general" for case in cases]
        y_pred = [prediction[field] or "general" for prediction in result]

        scores = {}
        scores["accuracy"] = accuracy_score(y_true, y_pred)
        scores["precision"] = precision_score(y_true, y_pred, average="macro", zero_division=0)
        scores["recall"] = recall_score(y_true, y_pred, average="macro", zero_division=0)
        scores["f1_macro"] = f1_score(y_true, y_pred, average="macro", zero_division=0)
        scores["f1_weighted"] = f1_score(y_true, y_pred, average="weighted", zero_division=0)
        scores["classification_report"] = classification_report(
            y_true, y_pred, zero_division=0, output_dict=True
        )
        evaluation[field] = scores

    return {"evaluation": evaluation, "results": result}
|
| 466 |
+
|
| 467 |
+
def delete_database(self):
    """Delete the collection held by the wrapped vector database."""
    underlying_db = self.database.db
    underlying_db.delete_collection()
|
| 469 |
+
|
| 470 |
+
# to close the model on destruction
|
| 471 |
+
def model_close(self):
    """
    Close the model's underlying client, if it has one.

    A synchronous ``close`` is preferred; otherwise an ``aclose`` coroutine is
    run to completion. Nothing happens when the model has no (truthy) client.
    """
    client = getattr(self.model, "client", None)
    if client:
        if hasattr(client, "close"):
            client.close()
            return
        if hasattr(client, "aclose"):
            import asyncio
            asyncio.run(client.aclose())
|
| 481 |
+
|
| 482 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """Release the model client on leaving the ``with`` block; exceptions are not suppressed."""
    self.model_close()
    return None
|
app/services/text_splitter.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 2 |
+
from langchain_core.documents import Document
|
| 3 |
+
from typing import List, Optional, Literal
|
| 4 |
+
from app.core.config import settings
|
| 5 |
+
|
| 6 |
+
class TextSplitter:
    """
    A service class for splitting documents into smaller chunks using recursive character text splitting.

    This class wraps ``RecursiveCharacterTextSplitter`` and keeps the active
    configuration on the instance, so the underlying splitter can be rebuilt
    whenever the settings change.
    """

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        length_function: callable = len,
        is_separator_regex: bool = False,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True
    ):
        """
        Initialize the TextSplitter with configurable parameters.

        Args:
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)
            length_function: Function to measure chunk length (default: len)
            is_separator_regex: Whether separators are regex patterns (default: False)
            separators: List of separators to split on (default: None, uses default separators)
            keep_separator: Whether to keep separators in chunks (default: True)
        """
        # Use settings as defaults
        self.chunk_size = chunk_size if chunk_size is not None else settings.chunk_size
        self.chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap
        self.length_function = length_function
        self.is_separator_regex = is_separator_regex
        self.keep_separator = keep_separator

        # Use custom separators if provided, otherwise use default
        self.separators = separators if separators is not None else [
            "\n\n",  # Double newline (paragraphs)
            "\n",    # Single newline
            " ",     # Space
            ""       # Character-level split as last resort
        ]

        self._initialize_splitter()

    def _initialize_splitter(self):
        """Initialize the RecursiveCharacterTextSplitter with current settings."""
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.length_function,
            is_separator_regex=self.is_separator_regex,
            separators=self.separators,
            keep_separator=self.keep_separator
        )

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split a list of documents into smaller chunks.

        Args:
            documents: List of Document objects to split

        Returns:
            List of Document objects representing the chunks
        """
        return self.splitter.split_documents(documents)

    def split_text(self, text: str) -> List[str]:
        """
        Split a single text string into smaller chunks.

        Args:
            text: Text string to split

        Returns:
            List of text chunks
        """
        return self.splitter.split_text(text)

    def create_document(
        self,
        text: str,
        metadata: dict
    ) -> Document:
        """
        Wrap a text and its metadata in a single Document without splitting it.

        Args:
            text: Content of the document
            metadata: Metadata dictionary to attach

        Returns:
            The new Document
        """
        return Document(page_content=text, metadata=metadata)

    def create_documents(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """
        Create Document objects from texts and split them into chunks.

        Args:
            texts: List of text strings to convert to documents
            metadatas: Optional list of metadata dictionaries for each text

        Returns:
            List of Document objects representing the chunks
        """
        return self.splitter.create_documents(texts, metadatas)

    def update_settings(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        separators: Optional[List[str]] = None
    ):
        """
        Update splitter settings and reinitialize.

        Arguments left as None keep their current value.

        Args:
            chunk_size: New chunk size (optional)
            chunk_overlap: New chunk overlap (optional)
            separators: New separators list (optional)
        """
        if chunk_size is not None:
            self.chunk_size = chunk_size
        if chunk_overlap is not None:
            self.chunk_overlap = chunk_overlap
        if separators is not None:
            self.separators = separators

        self._initialize_splitter()

    @classmethod
    def from_language(
        cls,
        language: Literal[
            "cpp", "go", "java", "kotlin", "js", "ts", "php", "proto",
            "python", "rst", "ruby", "rust", "scala", "swift", "markdown",
            "latex", "html", "sol", "csharp", "cobol", "c", "lua", "perl"
        ],
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None
    ) -> 'TextSplitter':
        """
        Create a TextSplitter optimized for a specific programming language or format.

        Args:
            language: Programming language or format type
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance configured for the specified language
        """
        # Use settings as defaults
        chunk_size = chunk_size if chunk_size is not None else settings.chunk_size
        chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap

        # Build the instance from the language separators (which are regex
        # patterns) rather than swapping in a ready-made splitter afterwards.
        # That way self.separators / self.is_separator_regex describe the real
        # configuration and update_settings() rebuilds a language-aware
        # splitter instead of falling back to the generic defaults.
        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=RecursiveCharacterTextSplitter.get_separators_for_language(language),
            is_separator_regex=True
        )

    @classmethod
    def for_markdown(
        cls,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None
    ) -> 'TextSplitter':
        """
        Create a TextSplitter optimized for Markdown documents.

        Args:
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance configured for Markdown
        """
        return cls.from_language("markdown", chunk_size, chunk_overlap)

    @classmethod
    def for_code(
        cls,
        language: str = "python",
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None
    ) -> 'TextSplitter':
        """
        Create a TextSplitter optimized for code documents.

        Args:
            language: Programming language (default: "python")
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance configured for code
        """
        return cls.from_language(language, chunk_size, chunk_overlap)

    @classmethod
    def for_markdown_with_sections(
        cls,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None
    ) -> 'TextSplitter':
        """
        Create a TextSplitter optimized for Markdown with section delimiters (---).

        This splitter is designed for markdown files that use '---' as section separators
        (common in frontmatter/multi-section documents). It prioritizes keeping sections
        together and prevents splitting on headers, which reduces the number of small chunks.

        Args:
            chunk_size: Maximum size of chunks to return (default: from settings)
            chunk_overlap: Overlap in characters between chunks (default: from settings)

        Returns:
            TextSplitter instance with custom separators for sectioned markdown
        """
        # Use settings as defaults
        chunk_size = chunk_size if chunk_size is not None else settings.chunk_size
        chunk_overlap = chunk_overlap if chunk_overlap is not None else settings.chunk_overlap

        # Custom separators that respect section boundaries.
        # Priority: sections -> sentences -> words. Newline separators are
        # left out, so the text is never split at paragraph or header lines.
        custom_separators = [
            "---",  # Section delimiter
            ". ",   # Sentences
            " ",    # Words
        ]

        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=custom_separators,
            keep_separator=True  # Keep separators to maintain structure
        )

    def get_chunk_info(self, documents: List[Document]) -> dict:
        """
        Get information about how documents will be split.

        Chunk sizes are measured in characters (``len`` of the chunk text).

        Args:
            documents: List of documents to analyze

        Returns:
            Dictionary containing chunk statistics; the size statistics are 0
            when splitting produces no chunks
        """
        chunks = self.split_documents(documents)

        chunk_sizes = [len(chunk.page_content) for chunk in chunks]

        return {
            "total_documents": len(documents),
            "total_chunks": len(chunks),
            "average_chunk_size": sum(chunk_sizes) / len(chunk_sizes) if chunk_sizes else 0,
            "min_chunk_size": min(chunk_sizes) if chunk_sizes else 0,
            "max_chunk_size": max(chunk_sizes) if chunk_sizes else 0,
            "configured_chunk_size": self.chunk_size,
            "configured_overlap": self.chunk_overlap
        }
|
| 265 |
+
|
| 266 |
+
text_splitter = TextSplitter()
|
app/services/vector_store.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.documents import Document
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
class VectorStore:
    """Thin wrapper around a LangChain-style vector database handle."""

    def __init__(self, db):
        """Keep a reference to the underlying vector database."""
        self.db = db

    def get(self):
        """Return everything stored in the database, as returned by ``db.get()``."""
        return self.db.get()

    def get_by_id(self, ids: list[str]):
        """Return the stored entries whose ids are in ``ids``."""
        return self.db.get(ids=ids)

    def get_dict(self):
        """
        Return every stored entry as a JSON string.

        Despite the name, the result is a JSON-encoded list of objects with
        'id', 'document' and 'metadata' keys, not a dict.
        """
        data = self.db.get()

        rows = [
            {
                "id": id_,
                "document": doc,
                "metadata": meta,
            }
            for id_, doc, meta in zip(
                data["ids"],
                data["documents"],
                data["metadatas"],
            )
        ]
        return json.dumps(rows)

    def similarity_search(self, query: str, filter: Optional[dict[str, str]] = None, k: Optional[int] = 5):
        """
        Return the ``k`` documents most similar to ``query``.

        Args:
            query: Text to search for
            filter: Optional metadata filter forwarded to the database
            k: Number of results to request
        """
        # Keyword arguments on purpose: the underlying signature is
        # (query, k, filter), so passing (query, filter, k) positionally
        # would swap the two.
        if filter:
            return self.db.similarity_search(query, k=k, filter=filter)
        return self.db.similarity_search(query, k=k)

    def similarity_search_with_score(self, query: str, filter: Optional[dict[str, str]] = None, k: Optional[int] = 5):
        """
        Return the ``k`` most similar documents together with their scores.

        Args:
            query: Text to search for
            filter: Optional metadata filter forwarded to the database
            k: Number of results to request
        """
        # Keyword arguments for the same reason as in similarity_search.
        if filter:
            return self.db.similarity_search_with_score(query, k=k, filter=filter)
        return self.db.similarity_search_with_score(query, k=k)

    def add_documents(self, docs: List[Document], ids: Optional[List] = None):
        """
        Add documents to the database, skipping those with blank content.

        Args:
            docs: Documents to store
            ids: Optional ids, one per entry of ``docs`` (same order)

        Returns:
            Whatever the database's ``add_documents`` returns, or an empty
            list when nothing is left to add.

        Raises:
            ValueError: if ``ids`` is given but its length differs from ``docs``
        """
        if ids is None:
            kept_docs = [doc for doc in docs if doc.page_content.strip()]
            if not kept_docs:
                return []
            return self.db.add_documents(kept_docs)

        if len(ids) != len(docs):
            raise ValueError("ids must contain exactly one id per document")

        # Filter documents and ids together so each surviving document keeps
        # its own id even when blank documents are dropped in between.
        kept = [(doc, id_) for doc, id_ in zip(docs, ids) if doc.page_content.strip()]
        if not kept:
            return []
        return self.db.add_documents(
            [doc for doc, _ in kept],
            ids=[id_ for _, id_ in kept],
        )

    def update_document(self, document_id: str, document: Document):
        """Replace the entry ``document_id`` with ``document``."""
        # safest + guaranteed re-embedding
        self.db.delete(ids=[document_id])
        return self.db.add_documents([document], ids=[document_id])

    def delete(self, ids: List):
        """Delete the entries with the given ids and return True."""
        self.db.delete(ids=ids)
        return True
|
app/utils/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from .preprocessing import preprocess, normalize, preprocess_documents, preprocess_query
|
| 2 |
+
# from .constants import stopwords
|
| 3 |
+
# from .document_helpers import get_references, create_document, create_documents, build_metadata, clean_metadata,load_json, read_json_file, get_references_v2
|
| 4 |
+
# from .llm_models import load_model
|
| 5 |
+
# from .model_factory import get_embedding_model, get_llm_model, get_local_model, get_gemini_model
|
app/utils/constants.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Maps short forms / abbreviations to their expanded names.
# NOTE(review): presumably used to expand abbreviations in text before
# retrieval — confirm against the preprocessing code.
short_words_mappings = {
    "IT": "Information Technology",
    "BT": "Biotechnology",
    "ECE": "Electronics and Communication Engineering",
    "CE": "Computer Engineering",
    "dept": "Department",
    "ICT": "Information and Communication Technology",
    "DS": "Data Science",
    "CS": "Computer Science",
    "CSE": "Computer Science and Engineering",
    "MCA": "Master of Computer Application",
    "MSc": "Master of Science",
}

# Stopword list kept as one newline-separated string, one word per line.
# It is a reduced list: the fuller list it replaced is kept commented out below.
stopwords = """
a
an
the
but
if
then
else
because
so
of
to
from
in
on
at
by
for
with
about
into
over
under
between
after
before
during
through
above
below
up
down
out
off
again
further
once
only
some
any
each
few
more
most
other
such
very
"""
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# stopwords = """
|
| 66 |
+
# a
|
| 67 |
+
# about
|
| 68 |
+
# above
|
| 69 |
+
# after
|
| 70 |
+
# again
|
| 71 |
+
# against
|
| 72 |
+
# ain
|
| 73 |
+
# all
|
| 74 |
+
# am
|
| 75 |
+
# an
|
| 76 |
+
# and
|
| 77 |
+
# any
|
| 78 |
+
# are
|
| 79 |
+
# aren
|
| 80 |
+
# aren't
|
| 81 |
+
# as
|
| 82 |
+
# at
|
| 83 |
+
# be
|
| 84 |
+
# because
|
| 85 |
+
# been
|
| 86 |
+
# before
|
| 87 |
+
# being
|
| 88 |
+
# below
|
| 89 |
+
# between
|
| 90 |
+
# both
|
| 91 |
+
# but
|
| 92 |
+
# by
|
| 93 |
+
# can
|
| 94 |
+
# couldn
|
| 95 |
+
# couldn't
|
| 96 |
+
# d
|
| 97 |
+
# did
|
| 98 |
+
# didn
|
| 99 |
+
# didn't
|
| 100 |
+
# do
|
| 101 |
+
# does
|
| 102 |
+
# doesn
|
| 103 |
+
# doesn't
|
| 104 |
+
# doing
|
| 105 |
+
# don
|
| 106 |
+
# don't
|
| 107 |
+
# down
|
| 108 |
+
# during
|
| 109 |
+
# each
|
| 110 |
+
# few
|
| 111 |
+
# for
|
| 112 |
+
# from
|
| 113 |
+
# further
|
| 114 |
+
# had
|
| 115 |
+
# hadn
|
| 116 |
+
# hadn't
|
| 117 |
+
# has
|
| 118 |
+
# hasn
|
| 119 |
+
# hasn't
|
| 120 |
+
# have
|
| 121 |
+
# haven
|
| 122 |
+
# haven't
|
| 123 |
+
# having
|
| 124 |
+
# he
|
| 125 |
+
# he'd
|
| 126 |
+
# he'll
|
| 127 |
+
# he's
|
| 128 |
+
# her
|
| 129 |
+
# here
|
| 130 |
+
# hers
|
| 131 |
+
# herself
|
| 132 |
+
# him
|
| 133 |
+
# himself
|
| 134 |
+
# his
|
| 135 |
+
# how
|
| 136 |
+
# i
|
| 137 |
+
# i'd
|
| 138 |
+
# i'll
|
| 139 |
+
# i'm
|
| 140 |
+
# i've
|
| 141 |
+
# if
|
| 142 |
+
# in
|
| 143 |
+
# into
|
| 144 |
+
# is
|
| 145 |
+
# isn
|
| 146 |
+
# isn't
|
| 147 |
+
# it
|
| 148 |
+
# it'd
|
| 149 |
+
# it'll
|
| 150 |
+
# it's
|
| 151 |
+
# its
|
| 152 |
+
# itself
|
| 153 |
+
# just
|
| 154 |
+
# ll
|
| 155 |
+
# m
|
| 156 |
+
# ma
|
| 157 |
+
# me
|
| 158 |
+
# mightn
|
| 159 |
+
# mightn't
|
| 160 |
+
# more
|
| 161 |
+
# most
|
| 162 |
+
# mustn
|
| 163 |
+
# mustn't
|
| 164 |
+
# my
|
| 165 |
+
# myself
|
| 166 |
+
# needn
|
| 167 |
+
# needn't
|
| 168 |
+
# no
|
| 169 |
+
# nor
|
| 170 |
+
# not
|
| 171 |
+
# now
|
| 172 |
+
# o
|
| 173 |
+
# of
|
| 174 |
+
# off
|
| 175 |
+
# on
|
| 176 |
+
# once
|
| 177 |
+
# only
|
| 178 |
+
# or
|
| 179 |
+
# other
|
| 180 |
+
# our
|
| 181 |
+
# ours
|
| 182 |
+
# ourselves
|
| 183 |
+
# out
|
| 184 |
+
# over
|
| 185 |
+
# own
|
| 186 |
+
# re
|
| 187 |
+
# s
|
| 188 |
+
# same
|
| 189 |
+
# shan
|
| 190 |
+
# shan't
|
| 191 |
+
# she
|
| 192 |
+
# she'd
|
| 193 |
+
# she'll
|
| 194 |
+
# she's
|
| 195 |
+
# should
|
| 196 |
+
# should've
|
| 197 |
+
# shouldn
|
| 198 |
+
# shouldn't
|
| 199 |
+
# so
|
| 200 |
+
# some
|
| 201 |
+
# such
|
| 202 |
+
# t
|
| 203 |
+
# than
|
| 204 |
+
# that
|
| 205 |
+
# that'll
|
| 206 |
+
# the
|
| 207 |
+
# their
|
| 208 |
+
# theirs
|
| 209 |
+
# them
|
| 210 |
+
# themselves
|
| 211 |
+
# then
|
| 212 |
+
# there
|
| 213 |
+
# these
|
| 214 |
+
# they
|
| 215 |
+
# they'd
|
| 216 |
+
# they'll
|
| 217 |
+
# they're
|
| 218 |
+
# they've
|
| 219 |
+
# this
|
| 220 |
+
# those
|
| 221 |
+
# through
|
| 222 |
+
# to
|
| 223 |
+
# too
|
| 224 |
+
# under
|
| 225 |
+
# until
|
| 226 |
+
# up
|
| 227 |
+
# ve
|
| 228 |
+
# very
|
| 229 |
+
# was
|
| 230 |
+
# wasn
|
| 231 |
+
# wasn't
|
| 232 |
+
# we
|
| 233 |
+
# we'd
|
| 234 |
+
# we'll
|
| 235 |
+
# we're
|
| 236 |
+
# we've
|
| 237 |
+
# were
|
| 238 |
+
# weren
|
| 239 |
+
# weren't
|
| 240 |
+
# what
|
| 241 |
+
# when
|
| 242 |
+
# where
|
| 243 |
+
# which
|
| 244 |
+
# while
|
| 245 |
+
# who
|
| 246 |
+
# whom
|
| 247 |
+
# why
|
| 248 |
+
# will
|
| 249 |
+
# with
|
| 250 |
+
# won
|
| 251 |
+
# won't
|
| 252 |
+
# wouldn
|
| 253 |
+
# wouldn't
|
| 254 |
+
# y
|
| 255 |
+
# you
|
| 256 |
+
# you'd
|
| 257 |
+
# you'll
|
| 258 |
+
# you're
|
| 259 |
+
# you've
|
| 260 |
+
# your
|
| 261 |
+
# yours
|
| 262 |
+
# yourself
|
| 263 |
+
# yourselves
|
| 264 |
+
# """
|
app/utils/document_helpers.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.documents import Document
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Optional, List
|
| 4 |
+
from datetime import datetime, date
|
| 5 |
+
import uuid
|
| 6 |
+
import yaml
|
| 7 |
+
from app.services.text_splitter import TextSplitter
|
| 8 |
+
import json
|
| 9 |
+
|
| 10 |
+
# Allowed types for metadata cleaning
|
| 11 |
+
ALLOWED = (str, int, float, bool, list, type(None))
|
| 12 |
+
|
| 13 |
+
def get_references_v2(docs, threshold: float):
    """
    Turn fused retrieval hits into reference entries plus one prompt context string.

    Args:
        docs: Iterable of hits; each must expose ``.document`` (an object with
            ``.metadata`` dict and ``.page_content``) and ``.fused_score``.
        threshold: Hits whose ``fused_score`` is below this value are dropped.

    Returns:
        dict with ``"documents"`` (list of reference dicts, in input order) and
        ``"context"`` (the kept references rendered one after another).
    """
    references = []
    context_parts = []

    for hit in docs:
        score = hit.fused_score
        if score < threshold:
            continue

        source_doc = hit.document
        meta = source_doc.metadata

        # Title falls back through title -> name -> topic; source prefers the file name.
        fallback_title = meta.get("name", meta.get("topic", "untitled"))
        entry = {
            "title": meta.get("title", fallback_title),
            "chunk_index": meta.get("chunk_index"),
            "source": meta.get("source_file", meta.get("source", "untitled")),
            "page_content": source_doc.page_content,
            "similarity": score,
        }

        context_parts.append(
            f"""{entry['title']} page_content: {entry['page_content']}, from source: {entry['source']}.\n\n"""
        )
        references.append(entry)

    return {"documents": references, "context": "".join(context_parts)}
|
| 37 |
+
|
| 38 |
+
def get_references(docs, threshold: float):
    """
    Turn (document, score) search results into reference entries plus a context string.

    Args:
        docs: Iterable of pairs; index 0 is an object with ``.metadata`` and
            ``.page_content``, index 1 is a numeric score. Similarity is computed
            as ``1 - score`` (presumably the score is a distance; confirm against
            the vector store that produces it).
        threshold: Results whose similarity is below this value are dropped.

    Returns:
        dict with ``"documents"`` (list of reference dicts, in input order) and
        ``"context"`` (the kept references rendered one after another).
    """
    references = []
    context_parts = []

    for pair in docs:
        source_doc = pair[0]
        similarity = 1 - pair[1]
        if similarity < threshold:
            continue

        meta = source_doc.metadata
        fallback_title = meta.get("name", meta.get("topic", "untitled"))
        entry = {
            "title": meta.get("title", fallback_title),
            "chunk_index": meta.get("chunk_index"),
            "source": meta.get("source_file", meta.get("source", "untitled")),
            "page_content": source_doc.page_content,
            "similarity": similarity,
        }

        context_parts.append(
            f"\npage_content: {entry['page_content']}, from source: {entry['source']}.\n"
        )
        references.append(entry)

    return {"documents": references, "context": "".join(context_parts)}
|
| 63 |
+
|
| 64 |
+
def create_documents(
    chunks: List[str],
    filePath: Optional[Path] = None,
    built_in_metadata: Optional[dict] = None,
    title: Optional[str] = None
) -> List[Document]:
    """
    Create Document objects from text chunks with standard metadata (UUIDs, timestamps, indices).
    Works for both files (filePath provided and existing) and raw text (filePath=None).

    Args:
        chunks: Text chunks, one Document is produced per chunk, in order.
        filePath: Optional source file. When it exists, its timestamps, name and
            stem are used for the dates, source and default title.
        built_in_metadata: Optional extra metadata merged into every chunk.
            Its keys override the generated ones, except ``chunk_index``.
            ``None`` is treated as an empty dict.
        title: Optional explicit title; takes precedence over the derived one.

    Returns:
        List of Documents, one per chunk.
    """
    # A None default (instead of a shared mutable {}) plus a private copy keeps
    # callers' dicts untouched and makes built_in_metadata=None safe.
    extra_metadata = dict(built_in_metadata) if built_in_metadata else {}

    if filePath and filePath.exists():
        file_stat = filePath.stat()  # one stat() call serves both timestamps
        created_date = datetime.fromtimestamp(file_stat.st_ctime).isoformat()
        modified_date = datetime.fromtimestamp(file_stat.st_mtime).isoformat()
        source = filePath.name
        given_title = title or filePath.stem
    else:
        now = datetime.now().isoformat()
        created_date = now
        modified_date = now
        # Use existing source from metadata if available, else the file name, else empty
        source = extra_metadata.get("source", "")
        if not source and filePath:
            source = filePath.name
        given_title = title or extra_metadata.get("title", "Untitled")

    docs = []
    for i, chunk in enumerate(chunks):
        metadata = {
            "doc_id": str(uuid.uuid4()),  # unique chunk id unless extra metadata supplies one
            "source": source,
            "title": given_title,
            "created_date": created_date,
            "modified_date": modified_date,
            "chunk_index": i,
        }
        # Caller-supplied metadata wins over the generated fields...
        metadata.update(extra_metadata)
        # ...except the positional index, which must always reflect this chunk.
        metadata["chunk_index"] = i

        docs.append(Document(page_content=chunk, metadata=metadata))
    return docs
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def create_document(
    text: str,
    metadata: dict
):
    """Wrap a single piece of text and its metadata dict in a Document."""
    document = Document(page_content=text, metadata=metadata)
    return document
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def clean_metadata(metadata: dict):
    """
    Return a copy of ``metadata`` whose values are limited to simple types.

    datetime/date values become ISO-8601 strings, values whose type is listed
    in the module-level ``ALLOWED`` tuple are kept as-is, and anything else is
    converted with ``str()``. The input dict is not modified.
    """
    def _coerce(value):
        # Dates are checked first so they are always serialised as ISO strings.
        if isinstance(value, (datetime, date)):
            return value.isoformat()
        return value if isinstance(value, ALLOWED) else str(value)

    return {key: _coerce(value) for key, value in metadata.items()}
|
| 131 |
+
|
| 132 |
+
def read_text_file(filePath: Path):
    """Read the whole file at ``filePath`` as UTF-8 text and return it."""
    return Path(filePath).read_text(encoding="utf-8")
|
| 136 |
+
|
| 137 |
+
def read_json_file(filePath: Path):
    """
    Parse the JSON file at ``filePath`` and return the decoded value.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default encoding (and matches read_text_file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filePath, "r", encoding="utf-8") as file:
        return json.load(file)
|
| 141 |
+
|
| 142 |
+
def build_metadata(filePath: Optional[Path] = None, content: Optional[str] = None):
    """
    Split a document into YAML front matter metadata and body content.

    Front matter is only recognised when the text *opens* with a ``---`` line
    pair and the YAML between them is a mapping. A ``---`` that appears later in
    the text (e.g. a markdown section delimiter) leaves the text untouched
    instead of discarding everything before it.

    Args:
        filePath: Optional file to read; when given it replaces ``content`` and
            its name is always stored as ``metadata["source"]``.
        content: Raw text, used when ``filePath`` is not given.

    Returns:
        dict with ``"metadata"`` (dict) and ``"content"`` (stripped body text).

    Raises:
        ValueError: If neither ``filePath`` nor ``content`` is provided.
        yaml.YAMLError: If a leading front matter block is not valid YAML.
    """
    if filePath:
        content = read_text_file(filePath)
    if content is None:
        raise ValueError("build_metadata requires either filePath or content")

    parts = content.split("---", 2)

    # Only blank space (or a BOM) may precede the opening '---'; otherwise the
    # '---' markers are section delimiters, not front matter.
    opens_with_delimiter = len(parts) >= 3 and parts[0].lstrip("\ufeff").strip() == ""

    if opens_with_delimiter:
        parsed = yaml.safe_load(parts[1]) or {}
        # A scalar or list between the markers is ordinary text, not metadata.
        if isinstance(parsed, dict):
            frontmatter = clean_metadata(parsed)

            # add file name as source always
            if filePath:
                frontmatter["source"] = filePath.name
            elif "source" not in frontmatter:
                frontmatter["source"] = ""

            return {
                "metadata": frontmatter,
                "content": parts[2].strip()
            }

    # No front matter: don't force an empty source, so external metadata can stick
    meta = {}
    if filePath:
        meta["source"] = filePath.name

    return {
        "metadata": meta,
        "content": content.strip()
    }
|
| 173 |
+
|
| 174 |
+
def create_documents_from_text(text: str, metadata: Optional[dict] = None):
    """
    Create documents from raw text with automatic splitting and metadata enrichment.

    Args:
        text: Raw text, optionally starting with YAML front matter.
        metadata: Optional caller-supplied metadata. Its keys override front
            matter keys, except ``source``, which is only used when the front
            matter does not provide a non-empty one. ``None`` means no extra
            metadata.

    Returns:
        List of Documents produced by create_documents, one per chunk.
    """
    provided = metadata or {}
    data = build_metadata(content=text.strip())

    # 1. Smart metadata merge
    final_metadata = dict(data["metadata"])

    # Front matter source wins; otherwise fall back to the caller's source.
    # "not ...get()" covers both a missing key (no front matter) and an empty
    # string (front matter without source) - the former used to drop the
    # caller's source entirely.
    if not final_metadata.get("source") and provided.get("source"):
        final_metadata["source"] = provided["source"]

    # Merge regular keys
    final_metadata.update({k: v for k, v in provided.items() if k != "source"})

    body = data["content"]

    # 2. Split text into chunks (strings)
    # Use section-aware splitter if text contains markdown section delimiters
    if "\n---\n" in body or body.startswith("---\n"):
        splitter = TextSplitter.for_markdown_with_sections()
    else:
        splitter = TextSplitter()
    chunks = splitter.split_text(body)

    # 3. Create documents using standard helper (adds IDs, indices, dates)
    return create_documents(
        chunks=chunks,
        filePath=None,
        built_in_metadata=final_metadata
    )
|
| 207 |
+
|
| 208 |
+
def load_json(filePath: Path):
    """
    Load a JSON knowledge file and turn each content entry into chunked Documents.

    The JSON must contain ``id``, ``source`` and a ``content`` mapping of
    topic -> text. Every non-blank chunk becomes a Document whose text is
    prefixed with its topic and whose ``chunk_index`` is the chunk's position
    within that topic's split output.

    Raises:
        KeyError: If ``id``, ``source`` or ``content`` is missing from the JSON.
    """
    data = read_json_file(filePath=filePath)
    source_file = Path(filePath).name

    base_metadata = {
        "id": data["id"],
        "title": data.get("name", data.get("title", "Untitled")),
        "source": data["source"],
        "source_file": source_file or "Untitled",
        "created_date": datetime.now().isoformat(),
    }

    documents = []
    splitter = TextSplitter()
    for topic, body in data["content"].items():
        pieces = splitter.split_text(body.strip())
        for position, piece in enumerate(pieces):
            trimmed = piece.strip()
            if trimmed == "":
                continue  # blank chunks are skipped but still consume an index
            documents.append(
                Document(
                    page_content=f"{topic}: {trimmed}",
                    metadata={**base_metadata, "topic": topic, "chunk_index": position},
                )
            )
    return documents
|
app/utils/embeddings.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 2 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
def get_embedding_model():
    """Return a GoogleGenerativeAIEmbeddings client for "models/gemini-embedding-001"."""
    return GoogleGenerativeAIEmbeddings(
        model="models/gemini-embedding-001"
    )
|
app/utils/llm_models.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from langchain_community.llms import LlamaCpp
|
| 3 |
+
from langchain_community.chat_models import ChatLlamaCpp
|
| 4 |
+
from app.core.config import settings
|
| 5 |
+
model_file = Path(settings.model_path) / settings.local_model_name
|
| 6 |
+
|
| 7 |
+
def load_model():
    """Build a ChatLlamaCpp chat model from the module-level ``model_file`` path."""
    llama_options = {
        "model_path": str(model_file),  # Direct path
        "n_ctx": 8192,
        "n_batch": 512,
        "n_threads": 4,
        "temperature": 0.05,
        "top_p": 0.8,
        "top_k": 20,
        "repeat_penalty": 1.1,
        "f16_kv": True,
        "verbose": False,
    }
    return ChatLlamaCpp(**llama_options)
|
| 20 |
+
|
app/utils/model_factory.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model factory for creating LLM and embedding models.
|
| 3 |
+
Handles model switching and fallback logic.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Optional
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
|
| 8 |
+
from langchain_community.chat_models import ChatLlamaCpp
|
| 9 |
+
from app.core.config import settings
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def get_embedding_model():
    """
    Build the embedding model (currently only Gemini).

    Reads the model name and API key from ``settings``.

    Returns:
        GoogleGenerativeAIEmbeddings: Embedding model instance

    Raises:
        Exception: Any construction error is logged and re-raised unchanged.
    """
    try:
        embedding_client = GoogleGenerativeAIEmbeddings(
            **{
                "model": settings.embedding_model_name,
                "google_api_key": settings.google_api_key,
            }
        )
        logger.info(f"Loaded embedding model: {settings.embedding_model_name}")
        return embedding_client
    except Exception as e:
        logger.error(f"Failed to load embedding model: {e}")
        raise
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_gemini_model():
    """
    Build the Google Gemini chat model.

    Reads the model name and API key from ``settings``.

    Returns:
        ChatGoogleGenerativeAI: Gemini model instance

    Raises:
        Exception: Any construction error is logged and re-raised unchanged.
    """
    try:
        gemini_client = ChatGoogleGenerativeAI(
            **{
                "model": settings.gemini_model_name,
                "google_api_key": settings.google_api_key,
            }
        )
        logger.info(f"Loaded Gemini model: {settings.gemini_model_name}")
        return gemini_client
    except Exception as e:
        logger.error(f"Failed to load Gemini model: {e}")
        raise
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def get_local_model():
    """
    Get local Qwen model (LlamaCpp).

    The model file is resolved as ``settings.model_path / settings.local_model_name``
    and must already exist on disk.

    Returns:
        ChatLlamaCpp: Local model instance

    Raises:
        FileNotFoundError: If the model file does not exist.
        Exception: Any other load error; every failure is logged and re-raised.
    """
    try:
        # Coerce to Path so a plain-string settings.model_path also works,
        # matching how app/utils/llm_models.py builds the same path.
        model_file = Path(settings.model_path) / settings.local_model_name

        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found: {model_file}\n"
                f"Please download it to {settings.model_path}/"
            )

        model = ChatLlamaCpp(
            model_path=str(model_file),
            # NOTE(review): 8096 is probably a typo for 8192 (load_model in
            # llm_models.py uses 8192) - confirm before changing.
            n_ctx=8096,
            n_batch=512,
            n_threads=4,
            max_tokens=settings.local_max_tokens,
            temperature=0.1,  # Focused output, less randomness
            top_p=0.9,
            top_k=30,
            repeat_penalty=1.05,
            f16_kv=True,
            f16=True,
            verbose=True,
            chat_format="chatml",
            # Low-memory oriented loading options:
            numa=False,
            use_mlock=False,  # do not lock the model in RAM
            use_mmap=True,  # memory-map the model file
        )
        logger.info(f"Loaded local model: {settings.local_model_name}")
        return model
    except Exception as e:
        logger.error(f"Failed to load local model: {e}")
        raise
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def get_llm_model(provider: Optional[str] = None):
    """
    Get LLM model based on configuration with fallback support.

    Args:
        provider: Override the default provider ("gemini" or "local").
            If None, uses settings.llm_provider.

    Returns:
        LLM model instance (Gemini or Local)

    Raises:
        ValueError: If the provider is neither "gemini" nor "local".
        Exception: If the selected model fails and fallback is disabled, the
            original load error is re-raised; if the fallback model also fails,
            its error propagates.
    """
    provider = provider or settings.llm_provider

    # Reject unknown providers before touching any loader.
    if provider not in ("gemini", "local"):
        raise ValueError(f"Unknown provider: {provider}. Use 'gemini' or 'local'")

    if provider == "gemini":
        primary, primary_label = get_gemini_model, "Gemini"
        fallback, fallback_label = get_local_model, "local"
    else:
        primary, primary_label = get_local_model, "Local"
        fallback, fallback_label = get_gemini_model, "Gemini"

    # Logged (not printed) and worded as an attempt: nothing is loaded yet.
    logger.info(f"Loading LLM provider: {provider}")
    try:
        return primary()
    except Exception as e:
        logger.warning(f"{primary_label} model failed: {e}")
        if settings.enable_fallback:
            logger.info(f"Falling back to {fallback_label} model...")
            return fallback()
        raise
|
app/utils/preprocessing.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
from .constants import stopwords, short_words_mappings
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
from app.utils.model_factory import get_local_model
|
| 5 |
+
from nltk.stem import PorterStemmer
|
| 6 |
+
import spacy
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
nlp = spacy.load('en_core_web_sm')
|
| 11 |
+
|
| 12 |
+
def lowercase(text: str):
    """
    Return ``text`` with leading and trailing whitespace removed.

    NOTE(review): despite the name, this does not change letter case - confirm
    whether case folding was dropped on purpose before relying on the name.
    """
    trimmed = text.strip()
    return trimmed
|
| 14 |
+
|
| 15 |
+
def tokenization(text: str):
    """
    Split ``text`` on single spaces after trimming it with ``lowercase``.

    Returns an empty list for ``None`` or an empty string. Consecutive spaces
    produce empty-string tokens, which callers are expected to filter out.
    """
    if not text:
        return []
    return lowercase(text).split(" ")
|
| 20 |
+
|
| 21 |
+
def stop_words_removal(text: str, short_words_mapping: bool = False):
    """
    Reduce ``text`` to a list of lowercase content-word lemmas.

    The text is parsed with the module-level spaCy pipeline. Only tokens tagged
    NOUN, PROPN, VERB, NUM or ADJ are kept, and lemmas found in ``stopwords``
    are discarded.

    Args:
        text: Input text; ``None`` or empty yields ``[]``.
        short_words_mapping: When True, a token found in ``short_words_mappings``
            is replaced by its mapped value, and the lemma of that value's first
            token is used instead.

    Returns:
        List of lemmas in their order of appearance.
    """
    if not text:
        return []

    kept_pos = ["NOUN", "PROPN", "VERB", "NUM", "ADJ"]
    lemmas = []

    for token in nlp(text):
        if token.is_space or token.pos_ not in kept_pos:
            continue

        surface = token.text.lower()
        if short_words_mapping and surface in short_words_mappings:
            # Re-parse the expanded form so its lemma comes from the pipeline.
            expanded = short_words_mappings[surface]
            lemma = nlp(expanded)[0].lemma_
        else:
            lemma = token.lemma_

        lemma = lemma.strip().lower()
        if lemma and lemma not in stopwords:
            lemmas.append(lemma)

    return lemmas
|
| 48 |
+
|
| 49 |
+
def space_removal(words: List[str]):
    """Strip every word and drop the ones that end up empty, keeping order."""
    trimmed = (word.strip() for word in words)
    return [word for word in trimmed if word != ""]
|
| 57 |
+
|
| 58 |
+
def preprocess(text: str, short_words_mapping: bool = False) -> str:
    """
    Return the space-joined lemmas produced by ``stop_words_removal``.

    Raises:
        ValueError: If ``text`` is ``None`` or has length zero.
    """
    if text is None or len(text) < 1:
        raise ValueError("Text cannot be empty")
    lemmas = stop_words_removal(text, short_words_mapping)
    return " ".join(lemmas)
|
| 63 |
+
|
| 64 |
+
def normalize(text: str) -> str:
    """
    Collapse whitespace runs between space-separated tokens into single spaces.

    The text is tokenized with ``tokenization``, blank tokens are removed with
    ``space_removal``, and the rest is re-joined with single spaces.

    Raises:
        ValueError: If ``text`` is ``None`` or has length zero.
    """
    if text is None or len(text) < 1:
        raise ValueError("Text cannot be empty")
    return " ".join(space_removal(tokenization(text)))
|
| 70 |
+
|
| 71 |
+
def preprocess_document(doc: Document):
    """
    Replace ``doc.page_content`` in place with its preprocessed form.

    Documents whose content is exactly the empty string are left untouched.
    Returns ``None``.
    """
    if doc.page_content != "":
        doc.page_content = preprocess(doc.page_content)
|
| 75 |
+
|
| 76 |
+
def preprocess_documents(docs: List[Document]):
    """Run ``preprocess_document`` on every document, mutating each in place."""
    for document in docs:
        preprocess_document(document)
|
| 79 |
+
|
| 80 |
+
def preprocess_query(query: str) -> str:
    """
    Validate a search query and return it whitespace-normalized.

    Raises:
        ValueError: If ``query`` is ``None`` or contains only whitespace.
    """
    if query is None or not query.strip():
        raise ValueError("Query cannot be empty")

    # An LLM-based query rewrite used to be sketched here; the query is
    # currently only normalized.
    return normalize(query)
|
| 94 |
+
|
| 95 |
+
def preprocess_filename(filePath: Path) -> str:
    """
    Build a safe, lowercase file name from the final component of ``filePath``.

    Every character of the stem other than ASCII letters, digits, ``_`` and
    ``-`` is removed. If nothing is left (e.g. "!!!.pdf"), the stem becomes
    "file". The lowercased extension is appended unchanged.
    """
    base = Path(filePath.name)
    cleaned_stem = re.sub(r'[^a-zA-Z0-9_-]', '', base.stem).lower()
    return (cleaned_stem or "file") + base.suffix.lower()
|
app/utils/tests.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|